diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index bbd292cba0c839..16936e2a3559e0 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -25,7 +25,7 @@ jobs: lfs: true - name: Install apt-get dependencies - uses: awalsh128/cache-apt-pkgs-action@v1.1.3 + uses: awalsh128/cache-apt-pkgs-action@v1.2.4 with: packages: graphviz texlive liblua5.2-0 libclang1-9 libclang-cpp9 version: 3.0 diff --git a/.github/workflows/mo.yml b/.github/workflows/mo.yml index 2f28e1e7d510c5..15f5c2ae92d2e9 100644 --- a/.github/workflows/mo.yml +++ b/.github/workflows/mo.yml @@ -30,7 +30,7 @@ jobs: python-version: '3.10' - name: Cache pip - uses: actions/cache@v1 + uses: actions/cache@v3 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('tools/mo/requirements*.txt') }} diff --git a/cmake/developer_package/api_validator/api_validator.cmake b/cmake/developer_package/api_validator/api_validator.cmake index 02cdc760c02aef..08f91322a3476f 100644 --- a/cmake/developer_package/api_validator/api_validator.cmake +++ b/cmake/developer_package/api_validator/api_validator.cmake @@ -5,60 +5,99 @@ if(WIN32) set(PROGRAMFILES_ENV "ProgramFiles(X86)") file(TO_CMAKE_PATH $ENV{${PROGRAMFILES_ENV}} PROGRAMFILES) - set(UWP_SDK_PATH "${PROGRAMFILES}/Windows Kits/10/bin/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}/x64") - message(STATUS "Trying to find apivalidator in: ${UWP_SDK_PATH}") - find_host_program(UWP_API_VALIDATOR + set(WDK_PATHS "${PROGRAMFILES}/Windows Kits/10/bin/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}/x64" + "${PROGRAMFILES}/Windows Kits/10/bin/x64") + + message(STATUS "Trying to find apivalidator in: ") + foreach(wdk_path IN LISTS WDK_PATHS) + message(" * ${wdk_path}") + endforeach() + + find_host_program(ONECORE_API_VALIDATOR NAMES apivalidator - PATHS "${UWP_SDK_PATH}" - DOC "ApiValidator for UWP compliance") + PATHS ${WDK_PATHS} + DOC "ApiValidator for OneCore compliance") - if(UWP_API_VALIDATOR) - message(STATUS 
"Found apivalidator: ${UWP_API_VALIDATOR}") + if(ONECORE_API_VALIDATOR) + message(STATUS "Found apivalidator: ${ONECORE_API_VALIDATOR}") endif() endif() function(_ie_add_api_validator_post_build_step_recursive) cmake_parse_arguments(API_VALIDATOR "" "TARGET" "" ${ARGN}) - list(APPEND API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGET}) - set(API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGETS} PARENT_SCOPE) - - get_target_property(IS_IMPORTED ${API_VALIDATOR_TARGET} IMPORTED) - if(IS_IMPORTED) - return() + get_target_property(LIBRARY_TYPE ${API_VALIDATOR_TARGET} TYPE) + if(LIBRARY_TYPE MATCHES "^(SHARED_LIBRARY|MODULE_LIBRARY|EXECUTABLE)$" AND + NOT ${API_VALIDATOR_TARGET} IN_LIST API_VALIDATOR_TARGETS) + list(APPEND API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGET}) endif() + # keep checks target list to track cyclic dependencies, leading to infinite recursion + list(APPEND checked_targets ${API_VALIDATOR_TARGET}) - get_target_property(LIBRARY_TYPE ${API_VALIDATOR_TARGET} TYPE) - if(LIBRARY_TYPE STREQUAL "EXECUTABLE" OR LIBRARY_TYPE STREQUAL "SHARED_LIBRARY") + if(NOT LIBRARY_TYPE STREQUAL "INTERFACE_LIBRARY") get_target_property(LINKED_LIBRARIES ${API_VALIDATOR_TARGET} LINK_LIBRARIES) - if(LINKED_LIBRARIES) - foreach(ITEM IN LISTS LINKED_LIBRARIES) - if(NOT TARGET ${ITEM}) - continue() - endif() - get_target_property(LIBRARY_TYPE_DEPENDENCY ${ITEM} TYPE) - if(LIBRARY_TYPE_DEPENDENCY STREQUAL "SHARED_LIBRARY") - _ie_add_api_validator_post_build_step_recursive(TARGET ${ITEM}) - endif() - endforeach() - endif() + else() + set(LINKED_LIBRARIES) endif() + get_target_property(INTERFACE_LINKED_LIBRARIES ${API_VALIDATOR_TARGET} INTERFACE_LINK_LIBRARIES) + + foreach(library IN LISTS LINKED_LIBRARIES INTERFACE_LINKED_LIBRARIES) + if(TARGET "${library}") + get_target_property(orig_library ${library} ALIASED_TARGET) + if(orig_library IN_LIST checked_targets OR library IN_LIST checked_targets) + # in case of cyclic dependencies, we need to skip current target + continue() + endif() + 
if(TARGET "${orig_library}") + _ie_add_api_validator_post_build_step_recursive(TARGET ${orig_library}) + else() + _ie_add_api_validator_post_build_step_recursive(TARGET ${library}) + endif() + endif() + endforeach() set(API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGETS} PARENT_SCOPE) endfunction() -set(VALIDATED_LIBRARIES "" CACHE INTERNAL "") +set(VALIDATED_TARGETS "" CACHE INTERNAL "") function(_ov_add_api_validator_post_build_step) - set(UWP_API_VALIDATOR_APIS "${PROGRAMFILES}/Windows Kits/10/build/universalDDIs/x64/UniversalDDIs.xml") - set(UWP_API_VALIDATOR_EXCLUSION "${UWP_SDK_PATH}/BinaryExclusionlist.xml") + if((NOT ONECORE_API_VALIDATOR) OR (WINDOWS_STORE OR WINDOWS_PHONE)) + return() + endif() - if((NOT UWP_API_VALIDATOR) OR (WINDOWS_STORE OR WINDOWS_PHONE)) + # see https://learn.microsoft.com/en-us/windows-hardware/drivers/develop/validating-windows-drivers#known-apivalidator-issues + # ApiValidator does not run on Arm64 because AitStatic does not work on Arm64 + if(HOST_AARCH64) return() endif() - cmake_parse_arguments(API_VALIDATOR "" "TARGET" "" ${ARGN}) + if(X86_64) + set(wdk_platform "x64") + elseif(X86) + set(wdk_platform "x86") + elseif(ARM) + set(wdk_platform "arm") + elseif(AARCH64) + set(wdk_platform "arm64") + else() + message(FATAL_ERROR "Unknown configuration: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + endif() + + find_file(ONECORE_API_VALIDATOR_APIS NAMES UniversalDDIs.xml + PATHS "${PROGRAMFILES}/Windows Kits/10/build/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}/universalDDIs/${wdk_platform}" + "${PROGRAMFILES}/Windows Kits/10/build/universalDDIs/${wdk_platform}" + DOC "Path to UniversalDDIs.xml file") + find_file(ONECORE_API_VALIDATOR_EXCLUSION NAMES BinaryExclusionlist.xml + PATHS ${WDK_PATHS} + DOC "Path to BinaryExclusionlist.xml file") + + if(NOT ONECORE_API_VALIDATOR_APIS) + message(FATAL_ERROR "Internal error: apiValidator is found (${ONECORE_API_VALIDATOR}), but UniversalDDIs.xml file has not been found for ${wdk_platform} platform") + endif() 
+ + cmake_parse_arguments(API_VALIDATOR "" "TARGET" "EXTRA" "" ${ARGN}) if(NOT API_VALIDATOR_TARGET) message(FATAL_ERROR "RunApiValidator requires TARGET to validate!") @@ -69,74 +108,81 @@ function(_ov_add_api_validator_post_build_step) endif() # collect targets - _ie_add_api_validator_post_build_step_recursive(TARGET ${API_VALIDATOR_TARGET}) + if (API_VALIDATOR_EXTRA) + foreach(target IN LISTS API_VALIDATOR_EXTRA) + _ie_add_api_validator_post_build_step_recursive(TARGET ${target}) + endforeach() + endif() # remove targets which were tested before - foreach(target IN LISTS API_VALIDATOR_TARGETS) - list(FIND VALIDATED_LIBRARIES ${target} index) - if (NOT index EQUAL -1) - list(APPEND VALIDATED_TARGETS ${target}) - endif() - if(TARGET "${target}") - get_target_property(orig_target ${target} ALIASED_TARGET) - list(FIND VALIDATED_LIBRARIES ${orig_target} index) - if (NOT index EQUAL -1) - list(APPEND VALIDATED_TARGETS ${target}) - endif() - endif() - endforeach() foreach(item IN LISTS VALIDATED_TARGETS) list(REMOVE_ITEM API_VALIDATOR_TARGETS ${item}) endforeach() - list(REMOVE_DUPLICATES API_VALIDATOR_TARGETS) - if(NOT API_VALIDATOR_TARGETS) return() endif() # apply check - macro(api_validator_get_target_name) - get_target_property(IS_IMPORTED ${target} IMPORTED) + get_target_property(is_imported ${target} IMPORTED) get_target_property(orig_target ${target} ALIASED_TARGET) - if(IS_IMPORTED) - get_target_property(target_location ${target} LOCATION) - get_filename_component(target_name "${target_location}" NAME_WE) + if(is_imported) + get_target_property(imported_configs ${target} IMPORTED_CONFIGURATIONS) + foreach(imported_config RELEASE RELWITHDEBINFO DEBUG) + if(imported_config IN_LIST imported_configs) + get_target_property(target_location ${target} IMPORTED_LOCATION_${imported_config}) + get_filename_component(target_name "${target_location}" NAME_WE) + break() + endif() + endforeach() + unset(imported_configs) elseif(TARGET "${orig_target}") set(target_name 
${orig_target}) + set(target_location $) else() set(target_name ${target}) + set(target_location $) endif() + + unset(orig_target) + unset(is_imported) endmacro() foreach(target IN LISTS API_VALIDATOR_TARGETS) api_validator_get_target_name() - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.21 AND OV_GENERATOR_MULTI_CONFIG) - set(output_file "${CMAKE_BINARY_DIR}/api_validator/$/${target_name}.txt") + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20 AND OV_GENERATOR_MULTI_CONFIG) + set(output_file "${OpenVINO_BINARY_DIR}/api_validator/$/${target_name}.txt") else() - set(output_file "${CMAKE_BINARY_DIR}/api_validator/${target_name}.txt") + set(output_file "${OpenVINO_BINARY_DIR}/api_validator/${target_name}.txt") endif() - add_custom_command(TARGET ${API_VALIDATOR_TARGET} POST_BUILD - COMMAND ${CMAKE_COMMAND} --config $ - -D UWP_API_VALIDATOR=${UWP_API_VALIDATOR} - -D UWP_API_VALIDATOR_TARGET=$ - -D UWP_API_VALIDATOR_APIS=${UWP_API_VALIDATOR_APIS} - -D UWP_API_VALIDATOR_EXCLUSION=${UWP_API_VALIDATOR_EXCLUSION} - -D UWP_API_VALIDATOR_OUTPUT=${output_file} + list(APPEND post_build_commands + ${CMAKE_COMMAND} --config $ + -D ONECORE_API_VALIDATOR=${ONECORE_API_VALIDATOR} + -D ONECORE_API_VALIDATOR_TARGET=${target_location} + -D ONECORE_API_VALIDATOR_APIS=${ONECORE_API_VALIDATOR_APIS} + -D ONECORE_API_VALIDATOR_EXCLUSION=${ONECORE_API_VALIDATOR_EXCLUSION} + -D ONECORE_API_VALIDATOR_OUTPUT=${output_file} -D CMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - -P "${IEDevScripts_DIR}/api_validator/api_validator_run.cmake" - BYPRODUCTS ${output_file} - COMMENT "[apiValidator] Check ${target_name} for OneCore compliance" - VERBATIM) + -P "${IEDevScripts_DIR}/api_validator/api_validator_run.cmake") + list(APPEND byproducts_files ${output_file}) + + unset(target_name) + unset(target_location) endforeach() + add_custom_command(TARGET ${API_VALIDATOR_TARGET} POST_BUILD + COMMAND ${post_build_commands} + BYPRODUCTS ${byproducts_files} + COMMENT "[apiValidator] Check ${API_VALIDATOR_TARGET} and 
dependencies for OneCore compliance" + VERBATIM) + # update list of validated libraries - list(APPEND VALIDATED_LIBRARIES ${API_VALIDATOR_TARGETS}) - set(VALIDATED_LIBRARIES "${VALIDATED_LIBRARIES}" CACHE INTERNAL "" FORCE) + list(APPEND VALIDATED_TARGETS ${API_VALIDATOR_TARGETS}) + set(VALIDATED_TARGETS "${VALIDATED_TARGETS}" CACHE INTERNAL "" FORCE) endfunction() # diff --git a/cmake/developer_package/api_validator/api_validator_run.cmake b/cmake/developer_package/api_validator/api_validator_run.cmake index 3781e17c1ed3ac..998c354118d4eb 100644 --- a/cmake/developer_package/api_validator/api_validator_run.cmake +++ b/cmake/developer_package/api_validator/api_validator_run.cmake @@ -4,9 +4,9 @@ cmake_policy(SET CMP0012 NEW) -foreach(var UWP_API_VALIDATOR UWP_API_VALIDATOR_TARGET - UWP_API_VALIDATOR_APIS UWP_API_VALIDATOR_EXCLUSION - UWP_API_VALIDATOR_OUTPUT CMAKE_TOOLCHAIN_FILE) +foreach(var ONECORE_API_VALIDATOR ONECORE_API_VALIDATOR_TARGET + ONECORE_API_VALIDATOR_APIS ONECORE_API_VALIDATOR_EXCLUSION + ONECORE_API_VALIDATOR_OUTPUT CMAKE_TOOLCHAIN_FILE) if(NOT DEFINED ${var}) message(FATAL_ERROR "Variable ${var} is not defined") endif() @@ -14,18 +14,18 @@ endforeach() # create command -if(NOT EXISTS "${UWP_API_VALIDATOR_APIS}") - message(FATAL_ERROR "${UWP_API_VALIDATOR_APIS} does not exist") +if(NOT EXISTS "${ONECORE_API_VALIDATOR_APIS}") + message(FATAL_ERROR "${ONECORE_API_VALIDATOR_APIS} does not exist") endif() -set(command "${UWP_API_VALIDATOR}" - -SupportedApiXmlFiles:${UWP_API_VALIDATOR_APIS} - -DriverPackagePath:${UWP_API_VALIDATOR_TARGET}) -if(EXISTS "${UWP_API_VALIDATOR_EXCLUSION}") +set(command "${ONECORE_API_VALIDATOR}" + -SupportedApiXmlFiles:${ONECORE_API_VALIDATOR_APIS} + -DriverPackagePath:${ONECORE_API_VALIDATOR_TARGET}) +if(EXISTS "${ONECORE_API_VALIDATOR_EXCLUSION}") list(APPEND command - -BinaryExclusionListXmlFile:${UWP_API_VALIDATOR_EXCLUSION} + -BinaryExclusionListXmlFile:${ONECORE_API_VALIDATOR_EXCLUSION} -StrictCompliance:TRUE) - 
set(UWP_HAS_BINARY_EXCLUSION ON) + set(ONECORE_HAS_BINARY_EXCLUSION ON) endif() # execute @@ -36,13 +36,13 @@ execute_process(COMMAND ${command} RESULT_VARIABLE exit_code OUTPUT_STRIP_TRAILING_WHITESPACE) -file(WRITE "${UWP_API_VALIDATOR_OUTPUT}" "${output_message}\n\n\n${error_message}") +file(WRITE "${ONECORE_API_VALIDATOR_OUTPUT}" "CMAKE COMMAND: ${command}\n\n\n${output_message}\n\n\n${error_message}") # post-process output -get_filename_component(name "${UWP_API_VALIDATOR_TARGET}" NAME) +get_filename_component(name "${ONECORE_API_VALIDATOR_TARGET}" NAME) -if(NOT UWP_HAS_BINARY_EXCLUSION) +if(NOT ONECORE_HAS_BINARY_EXCLUSION) if(CMAKE_TOOLCHAIN_FILE MATCHES "onecoreuap.toolchain.cmake$") # empty since we compile with static MSVC runtime else() @@ -66,7 +66,7 @@ endif() # write output -if(UWP_HAS_BINARY_EXCLUSION AND NOT exit_code EQUAL 0) +if(ONECORE_HAS_BINARY_EXCLUSION AND NOT exit_code EQUAL 0) message(FATAL_ERROR "${error_message}") endif() diff --git a/cmake/developer_package/frontends/frontends.cmake b/cmake/developer_package/frontends/frontends.cmake index ad78058f6a0747..84a9eea0735988 100644 --- a/cmake/developer_package/frontends/frontends.cmake +++ b/cmake/developer_package/frontends/frontends.cmake @@ -182,7 +182,7 @@ macro(ov_add_frontend) add_library(openvino::frontend::${OV_FRONTEND_NAME} ALIAS ${TARGET_NAME}) endif() - # Shutdown protobuf when unloading the front dynamic library + # Shutdown protobuf when unloading the frontend dynamic library if(proto_files AND BUILD_SHARED_LIBS) target_link_libraries(${TARGET_NAME} PRIVATE ov_protobuf_shutdown) endif() @@ -217,8 +217,6 @@ macro(ov_add_frontend) ie_add_vs_version_file(NAME ${TARGET_NAME} FILEDESCRIPTION ${OV_FRONTEND_FILEDESCRIPTION}) - ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) - target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) target_link_libraries(${TARGET_NAME} PRIVATE ${OV_FRONTEND_LINK_LIBRARIES}) ov_add_library_version(${TARGET_NAME}) @@ -259,6 +257,11 @@ 
macro(ov_add_frontend) add_dependencies(ov_frontends ${TARGET_NAME}) + # must be called after all target_link_libraries + ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + + # installation + if(NOT OV_FRONTEND_SKIP_INSTALL) if(BUILD_SHARED_LIBS) # Note: diff --git a/cmake/developer_package/target_flags.cmake b/cmake/developer_package/target_flags.cmake index 0d2117d55e0432..0a37c910ae8888 100644 --- a/cmake/developer_package/target_flags.cmake +++ b/cmake/developer_package/target_flags.cmake @@ -20,7 +20,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") set(arch_flag X86_64) elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(arch_flag X86) -elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*)") +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") set(arch_flag AARCH64) elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") set(arch_flag ARM) @@ -31,8 +31,8 @@ endif() set(HOST_${arch_flag} ON) macro(_ie_process_msvc_generator_platform arch_flag) - # if cmake -A is passed - if(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + # if cmake -A is passed + if(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") set(AARCH64 ON) elseif(CMAKE_GENERATOR_PLATFORM STREQUAL "ARM") set(ARM ON) diff --git a/docs/IE_PLUGIN_DG/InferRequest.md b/docs/IE_PLUGIN_DG/InferRequest.md index 2c6aacefc3e19a..db03bb3b06d54b 100644 --- a/docs/IE_PLUGIN_DG/InferRequest.md +++ b/docs/IE_PLUGIN_DG/InferRequest.md @@ -12,7 +12,7 @@ Inference Engine Plugin API provides the helper InferenceEngine::IInferRequestIn to use as a base class for a synchronous inference request implementation. 
Based of that, a declaration of a synchronous request class can look as follows: -@snippet src/infer_request.hpp infer_request:header +@snippet src/sync_infer_request.hpp infer_request:header #### Class Fields @@ -34,7 +34,7 @@ The example class has several fields: The constructor initializes helper fields and calls methods which allocate blobs: -@snippet src/infer_request.cpp infer_request:ctor +@snippet src/sync_infer_request.cpp infer_request:ctor > **NOTE**: Call InferenceEngine::CNNNetwork::getInputsInfo and InferenceEngine::CNNNetwork::getOutputsInfo to specify both layout and precision of blobs, which you can set with InferenceEngine::InferRequest::SetBlob and get with InferenceEngine::InferRequest::GetBlob. A plugin uses these hints to determine its internal layouts and precisions for input and output blobs if needed. @@ -42,7 +42,7 @@ The constructor initializes helper fields and calls methods which allocate blobs Decrements a number of created inference requests: -@snippet src/infer_request.cpp infer_request:dtor +@snippet src/sync_infer_request.cpp infer_request:dtor ### `InferImpl()` @@ -50,13 +50,13 @@ Decrements a number of created inference requests: - Checks blobs set by users - Calls the `InferImpl` method defined in a derived class to call actual pipeline stages synchronously -@snippet src/infer_request.cpp infer_request:infer_impl +@snippet src/sync_infer_request.cpp infer_request:infer_impl #### 1. `inferPreprocess` Below is the code of the `inferPreprocess` method to demonstrate Inference Engine common preprocessing step handling: -@snippet src/infer_request.cpp infer_request:infer_preprocess +@snippet src/sync_infer_request.cpp infer_request:infer_preprocess **Details:** * `InferImpl` must call the InferenceEngine::IInferRequestInternal::execDataPreprocessing function, which executes common Inference Engine preprocessing step (for example, applies resize or color conversion operations) if it is set by the user. 
The output dimensions, layout and precision matches the input information set via InferenceEngine::CNNNetwork::getInputsInfo. @@ -66,18 +66,18 @@ Below is the code of the `inferPreprocess` method to demonstrate Inference Engin Executes a pipeline synchronously using `_executable` object: -@snippet src/infer_request.cpp infer_request:start_pipeline +@snippet src/sync_infer_request.cpp infer_request:start_pipeline #### 3. `inferPostprocess` Converts output blobs if precisions of backend output blobs and blobs passed by user are different: -@snippet src/infer_request.cpp infer_request:infer_postprocess +@snippet src/sync_infer_request.cpp infer_request:infer_postprocess ### `GetPerformanceCounts()` The method sets performance counters which were measured during pipeline stages execution: -@snippet src/infer_request.cpp infer_request:get_performance_counts +@snippet src/sync_infer_request.cpp infer_request:get_performance_counts The next step in the plugin library implementation is the [Asynchronous Inference Request](@ref openvino_docs_ie_plugin_dg_async_infer_request) class. diff --git a/docs/OV_Runtime_UG/Operations_specifications.md b/docs/OV_Runtime_UG/Operations_specifications.md index 107cfa9a4060df..15331ac28160d9 100644 --- a/docs/OV_Runtime_UG/Operations_specifications.md +++ b/docs/OV_Runtime_UG/Operations_specifications.md @@ -202,6 +202,7 @@ Tile-1 TopK-1 TopK-3 + TopK-11 Transpose-1 Unique-10 Unsqueeze-1 diff --git a/docs/OV_Runtime_UG/auto_device_selection.md b/docs/OV_Runtime_UG/auto_device_selection.md index a2eb7aa0758e8b..4567100e9d50f2 100644 --- a/docs/OV_Runtime_UG/auto_device_selection.md +++ b/docs/OV_Runtime_UG/auto_device_selection.md @@ -8,11 +8,15 @@ Debugging Auto-Device Plugin -@endsphinxdirective This article introduces how Automatic Device Selection works and how to use it for inference. -## How AUTO Works + +.. 
_how-auto-works: + + +How AUTO Works +#################### The Automatic Device Selection mode, or AUTO for short, uses a "virtual" or a "proxy" device, which does not bind to a specific type of hardware, but rather selects the processing unit for inference automatically. @@ -21,13 +25,14 @@ This way, you can write the application once and deploy it anywhere. The selection also depends on your performance requirements, defined by the “hints” configuration API, as well as device priority list limitations, if you choose to exclude some hardware from the process. -The logic behind the choice is as follows: -1. Check what supported devices are available. -2. Check precisions of the input model (for detailed information on precisions read more on the `ov::device::capabilities`) -3. Select the highest-priority device capable of supporting the given model, as listed in the table below. -4. If model’s precision is FP32 but there is no device capable of supporting it, offload the model to a device supporting FP16. +The logic behind the choice is as follows: + +1. Check what supported devices are available. +2. Check precisions of the input model (for detailed information on precisions read more on the ``ov::device::capabilities``). +3. Select the highest-priority device capable of supporting the given model, as listed in the table below. +4. If model’s precision is FP32 but there is no device capable of supporting it, offload the model to a device supporting FP16. + -@sphinxdirective +----------+------------------------------------------------------+-------------------------------------+ | Device || Supported || Supported | | Priority || Device || model precision | @@ -41,135 +46,140 @@ The logic behind the choice is as follows: | 3 || Intel® CPU | FP32, FP16, INT8, BIN | | || (e.g. 
Intel® Core™ i7-1165G7) | | +----------+------------------------------------------------------+-------------------------------------+ -@endsphinxdirective -To put it simply, when loading the model to the first device on the list fails, AUTO will try to load it to the next device in line, until one of them succeeds. -What is important, **AUTO starts inference with the CPU of the system by default**, as it provides very low latency and can start inference with no additional delays. + +To put it simply, when loading the model to the first device on the list fails, AUTO will try to load it to the next device in line, until one of them succeeds. +What is important, **AUTO starts inference with the CPU of the system by default**, as it provides very low latency and can start inference with no additional delays. While the CPU is performing inference, AUTO continues to load the model to the device best suited for the purpose and transfers the task to it when ready. This way, the devices which are much slower in compiling models, GPU being the best example, do not impede inference at its initial stages. For example, if you use a CPU and a GPU, the first-inference latency of AUTO will be better than that of using GPU alone. -Note that if you choose to exclude CPU from the priority list or disable the initial CPU acceleration feature via `ov::intel_auto::enable_startup_fallback`, it will be unable to support the initial model compilation stage. - -![](../img/autoplugin_accelerate.svg) +Note that if you choose to exclude CPU from the priority list or disable the initial CPU acceleration feature via ``ov::intel_auto::enable_startup_fallback``, it will be unable to support the initial model compilation stage. 
-This mechanism can be easily observed in the [Using AUTO with Benchmark app sample](#using-auto-with-openvino-samples-and-benchmark-app) section, showing how the first-inference latency (the time it takes to compile the model and perform the first inference) is reduced when using AUTO. For example: -```sh -benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d GPU -niter 128 -``` +.. image:: _static/images/autoplugin_accelerate.svg -```sh -benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d AUTO -niter 128 -``` +This mechanism can be easily observed in the :ref:`Using AUTO with Benchmark app sample ` section, showing how the first-inference latency (the time it takes to compile the model and perform the first inference) is reduced when using AUTO. For example: -@sphinxdirective -.. note:: - The longer the process runs, the closer realtime performance will be to that of the best-suited device. -@endsphinxdirective +.. code-block:: sh -## Using AUTO + benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d GPU -niter 128 -Following the OpenVINO™ naming convention, the Automatic Device Selection mode is assigned the label of “AUTO.” It may be defined with no additional parameters, resulting in defaults being used, or configured further with the following setup options: -@sphinxdirective +.. code-block:: sh -+---------------------------------------------+----------------------------------------------------------------------+ -| | Property | | Values and Description | -+=============================================+======================================================================+ -| | | | **Values**: | -| | | | empty | -| | | | `AUTO` | -| | | | `AUTO: ` (comma-separated, no spaces) | -| | | | | -| | | | Lists the devices available for selection. | -| | | | The device sequence will be taken as priority from high to low. | -| | | | If not specified, `AUTO` will be used as default, | -| | | | and all devices will be "viewed" as candidates. 
| -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::device::priorities` | | **Values**: | -| | | | `` (comma-separated, no spaces) | -| | | | | -| | | | Specifies the devices for AUTO to select. | -| | | | The device sequence will be taken as priority from high to low. | -| | | | This configuration is optional. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::hint::performance_mode` | | **Values**: | -| | | | `ov::hint::PerformanceMode::LATENCY` | -| | | | `ov::hint::PerformanceMode::THROUGHPUT` | -| | | | `ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT` | -| | | | | -| | | | Specifies the performance option preferred by the application. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::hint::model_priority` | | **Values**: | -| | | | `ov::hint::Priority::HIGH` | -| | | | `ov::hint::Priority::MEDIUM` | -| | | | `ov::hint::Priority::LOW` | -| | | | | -| | | | Indicates the priority for a model. | -| | | | IMPORTANT: This property is not fully supported yet. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::execution_devices` | | Lists the runtime target devices on which the inferences are being | -| | | | executed. | -| | | | Examples of returning results could be `(CPU)`(`(CPU)` is a | -| | | | temporary device, indicating that CPU is used for acceleration at | -| | | | the model compilation stage), `CPU`, `GPU`, `CPU GPU`, `GPU.0`, | -| | | | etc. 
| -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::intel_auto::enable_startup_fallback` | | **Values**: | -| | | | `true` | -| | | | `false` | -| | | | | -| | | | Enables/disables CPU as acceleration (or the helper device) in the | -| | | | beginning. The default value is `true`, indicating that CPU is used| -| | | | as acceleration by default. | -+---------------------------------------------+----------------------------------------------------------------------+ + benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d AUTO -niter 128 -@endsphinxdirective + + +.. note:: + + The longer the process runs, the closer realtime performance will be to that of the best-suited device. + + +Using AUTO +#################### + +Following the OpenVINO™ naming convention, the Automatic Device Selection mode is assigned the label of "AUTO". It may be defined with no additional parameters, resulting in defaults being used, or configured further with the following setup options: + + ++-----------------------------------------------+----------------------------------------------------------------------+ +| | Property | | Values and Description | ++===============================================+======================================================================+ +| | | | **Values**: | +| | | | empty | +| | | | ``AUTO`` | +| | | | ``AUTO: `` (comma-separated, no spaces) | +| | | | | +| | | | Lists the devices available for selection. | +| | | | The device sequence will be taken as priority from high to low. | +| | | | If not specified, ``AUTO`` will be used as default, | +| | | | and all devices will be "viewed" as candidates. 
| ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::device::priorities`` | | **Values**: | +| | | | ```` (comma-separated, no spaces) | +| | | | | +| | | | Specifies the devices for AUTO to select. | +| | | | The device sequence will be taken as priority from high to low. | +| | | | This configuration is optional. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::hint::performance_mode`` | | **Values**: | +| | | | ``ov::hint::PerformanceMode::LATENCY`` | +| | | | ``ov::hint::PerformanceMode::THROUGHPUT`` | +| | | | ``ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT`` | +| | | | | +| | | | Specifies the performance option preferred by the application. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::hint::model_priority`` | | **Values**: | +| | | | ``ov::hint::Priority::HIGH`` | +| | | | ``ov::hint::Priority::MEDIUM`` | +| | | | ``ov::hint::Priority::LOW`` | +| | | | | +| | | | Indicates the priority for a model. | +| | | | IMPORTANT: This property is not fully supported yet. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::execution_devices`` | | Lists the runtime target devices on which the inferences are being | +| | | | executed. | +| | | | Examples of returning results could be ``(CPU)``(``CPU`` is a | +| | | | temporary device, indicating that CPU is used for acceleration at | +| | | | the model compilation stage), ``CPU``, ``GPU``, ``CPU GPU``, | +| | | | ``GPU.0``, etc. 
| ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::intel_auto::enable_startup_fallback`` | | **Values**: | +| | | | ``true`` | +| | | | ``false`` | +| | | | | +| | | | Enables/disables CPU as acceleration (or the helper device) in the | +| | | | beginning. The default value is ``true``, indicating that CPU is | +| | | | used as acceleration by default. | ++-----------------------------------------------+----------------------------------------------------------------------+ Inference with AUTO is configured similarly to when device plugins are used: you compile the model on the plugin with configuration and execute inference. -### Device Candidates and Priority -The device candidate list enables you to customize the priority and limit the choice of devices available to AUTO. -- If is not specified, AUTO assumes all the devices present in the system can be used. -- If `AUTO` without any device names is specified, AUTO assumes all the devices present in the system can be used, and will load the network to all devices and run inference based on their default priorities, from high to low. -To specify the priority of devices, enter the device names in the priority order (from high to low) in `AUTO: `, or use the `ov::device::priorities` property. +Device Candidates and Priority +++++++++++++++++++++++++++++++ -See the following code for using AUTO and specifying devices: -@sphinxdirective +The device candidate list enables you to customize the priority and limit the choice of devices available to AUTO. + +* If is not specified, AUTO assumes all the devices present in the system can be used. +* If ``AUTO`` without any device names is specified, AUTO assumes all the devices present in the system can be used, and will load the network to all devices and run inference based on their default priorities, from high to low. 
+ +To specify the priority of devices, enter the device names in the priority order (from high to low) in ``AUTO: ``, or use the ``ov::device::priorities`` property. + +See the following code for using AUTO and specifying devices: + .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO0.cpp - :language: cpp - :fragment: [part0] + .. doxygensnippet:: docs/snippets/AUTO0.cpp + :language: cpp + :fragment: [part0] .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part0] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part0] -@endsphinxdirective -Note that OpenVINO Runtime lets you use “GPU” as an alias for “GPU.0” in function calls. More details on enumerating devices can be found in [Working with devices](supported_plugins/Device_Plugins.md). +Note that OpenVINO Runtime lets you use "GPU" as an alias for "GPU.0" in function calls. More details on enumerating devices can be found in :doc:`Working with devices `. -#### Checking Available Devices +Checking Available Devices +-------------------------- -To check what devices are present in the system, you can use Device API, as listed below. For information on how to use it, see [Query device properties and configuration](supported_plugins/config_properties.md). +To check what devices are present in the system, you can use Device API, as listed below. For information on how to use it, see :doc:`Query device properties and configuration `. -@sphinxdirective -.. tab:: C++ +.. tab:: C++ .. code-block:: sh - ov::runtime::Core::get_available_devices() + ov::runtime::Core::get_available_devices() See the Hello Query Device C++ Sample for reference. @@ -181,19 +191,18 @@ To check what devices are present in the system, you can use Device API, as list See the Hello Query Device Python Sample for reference. 
-@endsphinxdirective -#### Excluding Devices from Device Candidate List +Excluding Devices from Device Candidate List +-------------------------------------------- -You can also exclude hardware devices from AUTO, for example, to reserve CPU for other jobs. AUTO will not use the device for inference then. To do that, add a minus sign (-) before CPU in `AUTO: `, as in the following example: +You can also exclude hardware devices from AUTO, for example, to reserve CPU for other jobs. AUTO will not use the device for inference then. To do that, add a minus sign ``(-)`` before CPU in ``AUTO: ``, as in the following example: -@sphinxdirective .. tab:: C++ .. code-block:: sh - ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:-CPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:-CPU"); .. tab:: Python @@ -201,144 +210,156 @@ You can also exclude hardware devices from AUTO, for example, to reserve CPU for compiled_model = core.compile_model(model=model, device_name="AUTO:-CPU") -@endsphinxdirective -AUTO will then query all available devices and remove CPU from the candidate list. +AUTO will then query all available devices and remove CPU from the candidate list. -Note that if you choose to exclude CPU from device candidate list, CPU will not be able to support the initial model compilation stage. See more information in [How AUTO Works](#how-auto-works). +Note that if you choose to exclude CPU from device candidate list, CPU will not be able to support the initial model compilation stage. See more information in :ref:`How AUTO Works `. -### Checking Target Runtime Devices -To query the runtime target devices on which the inferences are being executed using AUTO, you can use the `ov::execution_devices` property. 
It must be used with `get_property`, for example: +Performance Hints for AUTO +++++++++++++++++++++++++++ -@sphinxdirective +The ``ov::hint::performance_mode`` property enables you to specify a performance option for AUTO to be more efficient for particular use cases. The default hint for AUTO is ``LATENCY``. -.. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO7.cpp - :language: cpp - :fragment: [part7] +LATENCY +-------------------- -.. tab:: Python +This option prioritizes low latency, providing short response time for each inference job. It performs best for tasks where inference is required for a single input image, e.g. a medical analysis of an ultrasound scan image. It also fits the tasks of real-time or nearly real-time applications, such as an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part7] +.. note:: -@endsphinxdirective + If no performance hint is set explicitly, AUTO will set LATENCY for devices that have not set ``ov::device::properties``, for example, ``ov::device::properties(, ov::hint::performance_mode(ov::hint::LATENCY))``. -### Performance Hints for AUTO -The `ov::hint::performance_mode` property enables you to specify a performance option for AUTO to be more efficient for particular use cases. The default hint for AUTO is `LATENCY`. -#### LATENCY -This option prioritizes low latency, providing short response time for each inference job. It performs best for tasks where inference is required for a single input image, e.g. a medical analysis of an ultrasound scan image. It also fits the tasks of real-time or nearly real-time applications, such as an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. +.. 
_cumulative throughput: -> **NOTE**: If no performance hint is set explicitly, AUTO will set LATENCY for devices that have not set `ov::device::properties`, for example, `ov::device::properties(, ov::hint::performance_mode(ov::hint::LATENCY))`. -@sphinxdirective +THROUGHPUT +-------------------- -.. _cumulative throughput: +This option prioritizes high throughput, balancing between latency and power. It is best suited for tasks involving multiple jobs, such as inference of video feeds or large numbers of images. -@endsphinxdirective -#### THROUGHPUT -This option prioritizes high throughput, balancing between latency and power. It is best suited for tasks involving multiple jobs, such as inference of video feeds or large numbers of images. +CUMULATIVE_THROUGHPUT +--------------------- -#### CUMULATIVE_THROUGHPUT -While `LATENCY` and `THROUGHPUT` can select one target device with your preferred performance option, the `CUMULATIVE_THROUGHPUT` option enables running inference on multiple devices for higher throughput. With `CUMULATIVE_THROUGHPUT`, AUTO loads the network model to all available devices in the candidate list, and then runs inference on them based on the default or specified priority. +While ``LATENCY`` and ``THROUGHPUT`` can select one target device with your preferred performance option, the ``CUMULATIVE_THROUGHPUT`` option enables running inference on multiple devices for higher throughput. With ``CUMULATIVE_THROUGHPUT``, AUTO loads the network model to all available devices in the candidate list, and then runs inference on them based on the default or specified priority. -CUMULATIVE_THROUGHPUT has similar behavior as [the Multi-Device execution mode (MULTI)](./multi_device.md). The only difference is that CUMULATIVE_THROUGHPUT uses the devices specified by AUTO, which means that it's not mandatory to add devices manually, while with MULTI, you need to specify the devices before inference. 
+CUMULATIVE_THROUGHPUT has similar behavior as :doc:`the Multi-Device execution mode (MULTI) `. The only difference is that CUMULATIVE_THROUGHPUT uses the devices specified by AUTO, which means that it's not mandatory to add devices manually, while with MULTI, you need to specify the devices before inference. With the CUMULATIVE_THROUGHPUT option: -- If `AUTO` without any device names is specified, and the system has more than two GPU devices, AUTO will remove CPU from the device candidate list to keep GPU running at full capacity. -- If device priority is specified, AUTO will run inference requests on devices based on the priority. In the following example, AUTO will always try to use GPU first, and then use CPU if GPU is busy: - ```sh - ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:GPU,CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT)); - ``` -#### Code Examples +* If ``AUTO`` without any device names is specified, and the system has more than two GPU devices, AUTO will remove CPU from the device candidate list to keep GPU running at full capacity. +* If device priority is specified, AUTO will run inference requests on devices based on the priority. In the following example, AUTO will always try to use GPU first, and then use CPU if GPU is busy: + + .. code-block:: sh + + ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:GPU,CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT)); + + +Code Examples +-------------------- + +To enable performance hints for your application, use the following code: -To enable performance hints for your application, use the following code: -@sphinxdirective .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO3.cpp - :language: cpp - :fragment: [part3] - + .. doxygensnippet:: docs/snippets/AUTO3.cpp + :language: cpp + :fragment: [part3] + .. tab:: Python - ..
doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part3] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part3] -@endsphinxdirective -#### Disabling Auto-Batching for THROUGHPUT and CUMULATIVE_THROUGHPUT +Disabling Auto-Batching for THROUGHPUT and CUMULATIVE_THROUGHPUT +---------------------------------------------------------------- -The `ov::hint::PerformanceMode::THROUGHPUT` mode and the `ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT` mode will trigger Auto-Batching (for example, for the GPU device) by default. You can disable it by setting `ov::hint::allow_auto_batching(false)`, or change the default timeout value to a large number, e.g. `ov::auto_batch_timeout(1000)`. See [Automatic Batching](./automatic_batching.md) for more details. +The ``ov::hint::PerformanceMode::THROUGHPUT`` mode and the ``ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT`` mode will trigger Auto-Batching (for example, for the GPU device) by default. You can disable it by setting ``ov::hint::allow_auto_batching(false)``, or change the default timeout value to a large number, e.g. ``ov::auto_batch_timeout(1000)``. See :doc:`Automatic Batching ` for more details. -### Configuring Model Priority -The `ov::hint::model_priority` property enables you to control the priorities of models in the Auto-Device plugin. A high-priority model will be loaded to a supported high-priority device. A lower-priority model will not be loaded to a device that is occupied by a higher-priority model. +Configuring Model Priority +++++++++++++++++++++++++++ + +The ``ov::hint::model_priority`` property enables you to control the priorities of models in the Auto-Device plugin. A high-priority model will be loaded to a supported high-priority device. A lower-priority model will not be loaded to a device that is occupied by a higher-priority model. -@sphinxdirective .. tab:: C++ - .. 
doxygensnippet:: docs/snippets/AUTO4.cpp - :language: cpp - :fragment: [part4] - + .. doxygensnippet:: docs/snippets/AUTO4.cpp + :language: cpp + :fragment: [part4] + .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part4] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part4] -@endsphinxdirective -## Configuring Individual Devices and Creating the Auto-Device plugin on Top +Checking Target Runtime Devices ++++++++++++++++++++++++++++++++ -Although the methods described above are currently the preferred way to execute inference with AUTO, the following steps can be also used as an alternative. It is currently available as a legacy feature and used if AUTO is uncapable of utilizing the Performance Hints option. +To query the runtime target devices on which the inferences are being executed using AUTO, you can use the ``ov::execution_devices`` property. It must be used with ``get_property``, for example: -@sphinxdirective +.. tab:: C++ + + .. doxygensnippet:: docs/snippets/AUTO7.cpp + :language: cpp + :fragment: [part7] + +.. tab:: Python + + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part7] + + +Configuring Individual Devices and Creating the Auto-Device plugin on Top +######################################################################### + +Although the methods described above are currently the preferred way to execute inference with AUTO, the following steps can be also used as an alternative. It is currently available as a legacy feature and used if AUTO is incapable of utilizing the Performance Hints option. .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO5.cpp - :language: cpp - :fragment: [part5] - + .. doxygensnippet:: docs/snippets/AUTO5.cpp + :language: cpp + :fragment: [part5] + .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part5] + .. 
doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part5] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part5] -@endsphinxdirective -## Using AUTO with OpenVINO Samples and Benchmark app +.. _using-auto-with-openvino-samples-and-benchmark-app: + +Using AUTO with OpenVINO Samples and Benchmark app +################################################## To see how the Auto-Device plugin is used in practice and test its performance, take a look at OpenVINO™ samples. All samples supporting the "-d" command-line option (which stands for "device") will accept the plugin out-of-the-box. The Benchmark Application will be a perfect place to start – it presents the optimal performance of the plugin without the need for additional settings, like the number of requests or CPU threads. To evaluate the AUTO performance, you can use the following commands: For unlimited device choice: -```sh -benchmark_app –d AUTO –m -i -niter 1000 -``` +.. code-block:: sh + + benchmark_app -d AUTO -m -i -niter 1000 For limited device choice: -```sh -benchmark_app –d AUTO:CPU,GPU,GNA –m -i -niter 1000 -``` +.. code-block:: sh -For more information, refer to the [C++](../../samples/cpp/benchmark_app/README.md) or [Python](../../tools/benchmark_tool/README.md) version instructions. + benchmark_app -d AUTO:CPU,GPU,GNA -m -i -niter 1000 + +For more information, refer to the :doc:`C++ ` or :doc:`Python ` version instructions. -@sphinxdirective .. note:: The default CPU stream is 1 if using “-d AUTO”. @@ -346,11 +367,13 @@ For more information, refer to the [C++](../../samples/cpp/benchmark_app/README. You can use the FP16 IR to work with auto-device. No demos are yet fully optimized for AUTO, by means of selecting the most suitable device, using the GPU streams/throttling, and so on.
-@endsphinxdirective -## Additional Resources -- [Debugging AUTO](AutoPlugin_Debugging.md) -- [Running on Multiple Devices Simultaneously](./multi_device.md) -- [Supported Devices](supported_plugins/Supported_Devices.md) +Additional Resources +#################### +- :doc:`Debugging AUTO ` +- :doc:`Running on Multiple Devices Simultaneously ` +- :doc:`Supported Devices ` + +@endsphinxdirective diff --git a/docs/OV_Runtime_UG/supported_plugins/GPU.md b/docs/OV_Runtime_UG/supported_plugins/GPU.md index 9ddc26bf15b80e..222d11ebdea98d 100644 --- a/docs/OV_Runtime_UG/supported_plugins/GPU.md +++ b/docs/OV_Runtime_UG/supported_plugins/GPU.md @@ -222,11 +222,11 @@ The GPU plugin has the following additional preprocessing options: @sphinxtabset @sphinxtab{C++} -@snippet docs/snippets/gpu/preprocessing.cpp init_preproc +@snippet docs/snippets/gpu/preprocessing_nv12_two_planes.cpp init_preproc @endsphinxtab @sphinxtab{Python} -@snippet docs/snippets/gpu/preprocessing.py init_preproc +@snippet docs/snippets/gpu/preprocessing_nv12_two_planes.py init_preproc @endsphinxtab @endsphinxtabset diff --git a/docs/OV_Runtime_UG/supported_plugins/GPU_RemoteTensor_API.md b/docs/OV_Runtime_UG/supported_plugins/GPU_RemoteTensor_API.md index 6f22307aa452b7..2046aaf44386d1 100644 --- a/docs/OV_Runtime_UG/supported_plugins/GPU_RemoteTensor_API.md +++ b/docs/OV_Runtime_UG/supported_plugins/GPU_RemoteTensor_API.md @@ -3,8 +3,11 @@ The GPU plugin implementation of the `ov::RemoteContext` and `ov::RemoteTensor` interfaces supports GPU pipeline developers who need video memory sharing and interoperability with existing native APIs, such as OpenCL, Microsoft DirectX, or VAAPI. -Using these interfaces allows you to avoid any memory copy overhead when plugging OpenVINO™ inference -into an existing GPU pipeline. 
It also enables OpenCL kernels to participate in the pipeline to become + +The `ov::RemoteContext` and `ov::RemoteTensor` interface implementation targets the need for memory sharing and +interoperability with existing native APIs, such as OpenCL, Microsoft DirectX, and VAAPI. +They allow you to avoid any memory copy overhead when plugging OpenVINO™ inference +into an existing GPU pipeline. They also enable OpenCL kernels to participate in the pipeline to become native buffer consumers or producers of the OpenVINO™ inference. There are two interoperability scenarios supported by the Remote Tensor API: @@ -23,7 +26,7 @@ and functions that consume or produce native handles directly. ## Context Sharing Between Application and GPU Plugin GPU plugin classes that implement the `ov::RemoteContext` interface are responsible for context sharing. -Obtaining a context object is the first step of sharing pipeline objects. +Obtaining a context object is the first step in sharing pipeline objects. The context object of the GPU plugin directly wraps OpenCL context, setting a scope for sharing the `ov::CompiledModel` and `ov::RemoteTensor` objects. The `ov::RemoteContext` object can be either created on top of an existing handle from a native API or retrieved from the GPU plugin. @@ -37,60 +40,49 @@ additional parameter. To create the `ov::RemoteContext` object for user context, explicitly provide the context to the plugin using constructor for one of `ov::RemoteContext` derived classes. 
-@sphinxtabset - -@sphinxtab{Linux} - -@sphinxtabset - -@sphinxtab{Create from cl_context} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp context_from_cl_context - -@endsphinxtab - -@sphinxtab{Create from cl_queue} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp context_from_cl_queue - -@endsphinxtab - -@sphinxtab{Create from VADisplay} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp context_from_va_display - -@endsphinxtab - -@endsphinxtabset - -@endsphinxtab - -@sphinxtab{Windows} +@sphinxdirective -@sphinxtabset +.. tab:: Linux -@sphinxtab{Create from cl_context} + .. tab:: Create from cl_context + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: context_from_cl_context -@snippet docs/snippets/gpu/remote_objects_creation.cpp context_from_cl_context + .. tab:: Create from cl_queue -@endsphinxtab + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: context_from_cl_queue -@sphinxtab{Create from cl_queue} + .. tab:: Create from VADisplay -@snippet docs/snippets/gpu/remote_objects_creation.cpp context_from_cl_queue + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: context_from_va_display -@endsphinxtab +.. tab:: Windows -@sphinxtab{Create from ID3D11Device} + .. tab:: Create from cl_context -@snippet docs/snippets/gpu/remote_objects_creation.cpp context_from_d3d_device + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: context_from_cl_context -@endsphinxtab + .. tab:: Create from cl_queue -@endsphinxtabset + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: context_from_cl_queue -@endsphinxtabset + .. tab:: Create from ID3D11Device + + .. 
doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: context_from_d3d_device +@endsphinxdirective ### Getting RemoteContext from the Plugin If you do not provide any user context, the plugin uses its default internal context. @@ -100,21 +92,21 @@ Once the plugin options have been changed, the internal context is replaced by t To request the current default context of the plugin, use one of the following methods: -@sphinxtabset - -@sphinxtab{Get context from Core} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp default_context_from_core +@sphinxdirective -@endsphinxtab +.. tab:: Get context from Core -@sphinxtab{Batching via throughput hint} + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: default_context_from_core -@snippet docs/snippets/gpu/remote_objects_creation.cpp default_context_from_model +.. tab:: Get context from compiled model -@endsphinxtab + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: default_context_from_model -@endsphinxtabset +@endsphinxdirective ## Memory Sharing Between Application and GPU Plugin @@ -126,107 +118,152 @@ of the `ov::RemoteContext` sub-classes. `ov::intel_gpu::ocl::ClContext` has multiple overloads of `create_tensor` methods which allow to wrap pre-allocated native handles with the `ov::RemoteTensor` object or request plugin to allocate specific device memory. 
For more details, see the code snippets below: -@sphinxtabset - -@sphinxtab{Wrap native handles} - -@sphinxtabset - -@sphinxtab{USM pointer} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp wrap_usm_pointer - -@endsphinxtab - -@sphinxtab{cl_mem} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp wrap_cl_mem - -@endsphinxtab - -@sphinxtab{cl::Buffer} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp wrap_cl_buffer - -@endsphinxtab - -@sphinxtab{cl::Image2D} - -@snippet docs/snippets/gpu/remote_objects_creation.cpp wrap_cl_image - -@endsphinxtab +@sphinxdirective -@sphinxtab{biplanar NV12 surface} +.. tab:: Wrap native handles + + .. tab:: USM pointer + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: wrap_usm_pointer + + .. tab:: cl_mem + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: wrap_cl_mem + + .. tab:: cl::Buffer + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: wrap_cl_buffer + + .. tab:: cl::Image2D + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: wrap_cl_image + + .. tab:: biplanar NV12 surface + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: wrap_nv12_surface + +.. tab:: Allocate device memory + + .. tab:: USM host memory + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: allocate_usm_host + + .. tab:: USM device memory + + .. doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: allocate_usm_device + + .. tab:: cl::Buffer + + .. 
doxygensnippet:: docs/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: allocate_cl_buffer -@snippet docs/snippets/gpu/remote_objects_creation.cpp wrap_nv12_surface +@endsphinxdirective -@endsphinxtab +The `ov::intel_gpu::ocl::D3DContext` and `ov::intel_gpu::ocl::VAContext` classes are derived from `ov::intel_gpu::ocl::ClContext`. +Therefore, they provide the functionality described above and extend it +to allow creation of `ov::RemoteTensor` objects from `ID3D11Buffer`, `ID3D11Texture2D` pointers or the `VASurfaceID` handle respectively. -@endsphinxtabset -@endsphinxtab -@sphinxtab{Allocate device memory} +## Direct NV12 Video Surface Input -@sphinxtabset +To support the direct consumption of a hardware video decoder output, the GPU plugin accepts: -@sphinxtab{USM host memory} +* Two-plane NV12 video surface input - calling the `create_tensor_nv12()` function creates + a pair of `ov::RemoteTensor` objects, representing the Y and UV planes. +* Single-plane NV12 video surface input - calling the `create_tensor()` function creates one + `ov::RemoteTensor` object, representing the Y and UV planes at once (Y elements before UV elements). +* NV12 to Grey video surface input conversion - calling the `create_tensor()` function creates one + `ov::RemoteTensor` object, representing only the Y plane. -@snippet docs/snippets/gpu/remote_objects_creation.cpp allocate_usm_host +To ensure that the plugin generates a correct execution graph, static preprocessing +should be added before model compilation: -@endsphinxtab +@sphinxdirective -@sphinxtab{USM device memory} +.. tab:: two-plane -@snippet docs/snippets/gpu/remote_objects_creation.cpp allocate_usm_device + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_two_planes.cpp + :language: cpp + :fragment: [init_preproc] -@endsphinxtab +.. tab:: single-plane -@sphinxtab{cl::Buffer} + .. 
doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_single_plane.cpp + :language: cpp + :fragment: [init_preproc] -@snippet docs/snippets/gpu/remote_objects_creation.cpp allocate_cl_buffer +.. tab:: NV12 to Grey -@endsphinxtab + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_to_gray.cpp + :language: cpp + :fragment: [init_preproc] -@endsphinxtabset +@endsphinxdirective -@endsphinxtab -@endsphinxtabset +Since the `ov::intel_gpu::ocl::ClImage2DTensor` and its derived classes do not support batched surfaces, +if batching and surface sharing are required at the same time, +inputs need to be set via the `ov::InferRequest::set_tensors` method with a vector of shared surfaces for each plane: -The `ov::intel_gpu::ocl::D3DContext` and `ov::intel_gpu::ocl::VAContext` classes are derived from `ov::intel_gpu::ocl::ClContext`. -Therefore, they provide the functionality described above and extend it -to allow creation of `ov::RemoteTensor` objects from `ID3D11Buffer`, `ID3D11Texture2D` pointers or the `VASurfaceID` handle respectively. -## Direct NV12 Video Surface Input +@sphinxdirective -To support the direct consumption of a hardware video decoder output, the plugin accepts two-plane video -surfaces as arguments for the `create_tensor_nv12()` function, which creates a pair of `ov::RemoteTensor` -objects which represent the Y and UV planes. +.. tab:: Single Batch -To ensure that the plugin generates the correct execution graph for the NV12 dual-plane input, static preprocessing -should be added before model compilation: + .. tab:: two-plane -@snippet snippets/gpu/preprocessing.cpp init_preproc + ..
doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_two_planes.cpp + :language: cpp + :fragment: single_batch -Since the `ov::intel_gpu::ocl::ClImage2DTensor` and its derived classes do not support batched surfaces, if batching and surface sharing are required -at the same time, inputs need to be set via the `ov::InferRequest::set_tensors` method with vector of shared surfaces for each plane: + .. tab:: single-plane + + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_single_plane.cpp + :language: cpp + :fragment: single_batch -@sphinxtabset + .. tab:: NV12 to Grey -@sphinxtab{Single batch} + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_to_gray.cpp + :language: cpp + :fragment: single_batch -@snippet docs/snippets/gpu/preprocessing.cpp single_batch +.. tab:: Multiple Batches -@endsphinxtab + .. tab:: two-plane -@sphinxtab{Multiple batches} + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_two_planes.cpp + :language: cpp + :fragment: batched_case -@snippet docs/snippets/gpu/preprocessing.cpp batched_case + .. tab:: single-plane + + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_single_plane.cpp + :language: cpp + :fragment: batched_case -@endsphinxtab + .. tab:: NV12 to Grey -@endsphinxtabset + .. doxygensnippet:: docs/snippets/gpu/preprocessing_nv12_to_gray.cpp + :language: cpp + :fragment: batched_case +@endsphinxdirective I420 color format can be processed in a similar way @@ -242,18 +279,12 @@ This sharing mechanism allows performing pipeline synchronization on the app sid on waiting for the completion of inference. The pseudo-code may look as follows: @sphinxdirective -.. raw:: html -
+.. dropdown:: Queue and context sharing example -@endsphinxdirective - -@snippet snippets/gpu/queue_sharing.cpp queue_sharing - -@sphinxdirective -.. raw:: html - -
+ .. doxygensnippet:: docs/snippets/gpu/queue_sharing.cpp + :language: cpp + :fragment: queue_sharing @endsphinxdirective @@ -282,60 +313,34 @@ For possible low-level properties and their description, refer to the `openvino/ To see pseudo-code of usage examples, refer to the sections below. -> **NOTE**: For low-level parameter usage examples, see the source code of user-side wrappers from the include files mentioned above. - - -@sphinxdirective -.. raw:: html - -
- -@endsphinxdirective - -This example uses the OpenCL context obtained from a compiled model object. - -@snippet snippets/gpu/context_sharing.cpp context_sharing_get_from_ov - -@sphinxdirective -.. raw:: html - -
- -@endsphinxdirective - - @sphinxdirective -.. raw:: html -
+.. NOTE:: + + For low-level parameter usage examples, see the source code of user-side wrappers from the include files mentioned above. -@endsphinxdirective +.. dropdown:: OpenCL Kernel Execution on a Shared Buffer -@snippet snippets/gpu/context_sharing.cpp context_sharing_user_handle + This example uses the OpenCL context obtained from a compiled model object. -@sphinxdirective -.. raw:: html + .. doxygensnippet:: docs/snippets/gpu/context_sharing.cpp + :language: cpp + :fragment: context_sharing_get_from_ov -
+.. dropdown:: Running GPU Plugin Inference within User-Supplied Shared Context -@endsphinxdirective + .. doxygensnippet:: docs/snippets/gpu/context_sharing.cpp + :language: cpp + :fragment: context_sharing_user_handle +.. dropdown:: Direct Consuming of the NV12 VAAPI Video Decoder Surface on Linux -@sphinxdirective -.. raw:: html - -
+ .. doxygensnippet:: docs/snippets/gpu/context_sharing_va.cpp + :language: cpp + :fragment: context_sharing_va @endsphinxdirective -@snippet snippets/gpu/context_sharing_va.cpp context_sharing_va - -@sphinxdirective -.. raw:: html - -
- -@endsphinxdirective ## See Also diff --git a/docs/_static/download/OV_2023_models_supported.pdf b/docs/_static/download/OV_2023_models_supported.pdf new file mode 100644 index 00000000000000..a226075e08e4c0 Binary files /dev/null and b/docs/_static/download/OV_2023_models_supported.pdf differ diff --git a/docs/img/autoplugin_accelerate.svg b/docs/_static/images/autoplugin_accelerate.svg similarity index 100% rename from docs/img/autoplugin_accelerate.svg rename to docs/_static/images/autoplugin_accelerate.svg diff --git a/docs/_static/images/sample-graph-image.png b/docs/_static/images/sample-graph-image.png deleted file mode 100644 index 97477897623537..00000000000000 --- a/docs/_static/images/sample-graph-image.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64e64059e7416353cfd2ad836a36c12071804addf4fb165f0cf5150aa7658fa4 -size 123996 diff --git a/docs/_static/js/graphs.js b/docs/_static/js/graphs.js index e09dc8072c803d..9680031d99566d 100644 --- a/docs/_static/js/graphs.js +++ b/docs/_static/js/graphs.js @@ -310,7 +310,7 @@ class Graph { $(document).ready(function () { - $('#build-graphs-btn').on('click', showModal); + $('.ov-toolkit-benchmark-results').on('click', showModal); function clickBuildGraphs(graph, networkModels, ietype, platforms, kpis, precisions) { renderData(graph, networkModels, ietype, platforms, kpis, precisions); diff --git a/docs/benchmarks/performance_benchmarks.md b/docs/benchmarks/performance_benchmarks.md index 4fe5b4bf2e84ad..6c00b7df02f1ee 100644 --- a/docs/benchmarks/performance_benchmarks.md +++ b/docs/benchmarks/performance_benchmarks.md @@ -21,7 +21,6 @@ Benchmarks are available for: * [Intel® Distribution of OpenVINO™ toolkit](performance_benchmarks_openvino.md). - You can also test performance for your system yourself, following the guide on [getting performance numbers](../MO_DG/prepare_model/Getting_performance_numbers.md). 
Performance of a particular application can also be evaluated virtually using [Intel® DevCloud for the Edge](https://devcloud.intel.com/edge/). It is a remote development environment with access to Intel® hardware and the latest versions of the Intel® Distribution of the OpenVINO™ Toolkit. To learn more about it, visit [the website](https://www.intel.com/content/www/us/en/developer/tools/devcloud/edge/overview.html) or [create an account](https://www.intel.com/content/www/us/en/forms/idz/devcloud-registration.html?tgt=https://www.intel.com/content/www/us/en/secure/forms/devcloud-enrollment/account-provisioning.html). diff --git a/docs/benchmarks/performance_benchmarks_openvino.md b/docs/benchmarks/performance_benchmarks_openvino.md index 8fc6f80ffb6890..639f1c38a8dd64 100644 --- a/docs/benchmarks/performance_benchmarks_openvino.md +++ b/docs/benchmarks/performance_benchmarks_openvino.md @@ -9,89 +9,76 @@ openvino_docs_performance_int8_vs_fp32 Performance Data Spreadsheet (download xlsx) -@endsphinxdirective - Click the "Benchmark Graphs" button to see the OpenVINO™ benchmark graphs. Select the models, the hardware platforms (CPU SKUs), precision and performance index from the lists and click the “Build Graphs” button. -@sphinxdirective +.. button-link:: # + :class: ov-toolkit-benchmark-results + :color: primary + :outline: + + :material-regular:`bar_chart;1.4em` Benchmark Graphs -.. raw:: html +Measuring inference performance involves many variables and is extremely use-case and application dependent. +Below are four parameters for measurements, which are key elements to consider for a successful deep learning inference application: -
-
-

Build benchmark graphs to your specifications

-
-
- -
- -
-@endsphinxdirective +.. tab:: :material-regular:`keyboard_double_arrow_right;1.4em` Throughput -Measuring inference performance involves many variables and is extremely use-case and application dependent. -Below are four parameters for measurements, which are key elements to consider for a successful deep learning inference application: + Measures the number of inferences delivered within a latency threshold (for example, number of Frames Per Second - FPS). When deploying a system with deep learning inference, select the throughput that delivers the best trade-off between latency and power for the price and performance that meets your requirements. -@sphinxdirective +.. tab:: :material-regular:`attach_money;1.4em` Value + + While throughput is important, what is more critical in edge AI deployments is the performance efficiency or performance-per-cost. Application performance in throughput per dollar of system cost is the best measure of value. The value KPI is calculated as “Throughput measured as inferences per second / price of inference engine”. This means for a 2 socket system 2x the price of a CPU is used. Prices are as per date of benchmarking and sources can be found as links in the Hardware Platforms (PDF) description below. + +.. tab:: :material-regular:`flash_on;1.4em` Efficiency + + System power is a key consideration from the edge to the data center. When selecting deep learning solutions, power efficiency (throughput/watt) is a critical factor to consider. Intel designs provide excellent power efficiency for running deep learning workloads. The efficiency KPI is calculated as “Throughput measured as inferences per second / TDP of inference engine”. This means for a 2 socket system 2x the power dissipation (TDP) of a CPU is used. TDP-values are as per date of benchmarking and sources can be found as links in the Hardware Platforms (PDF) description below. + +.. 
tab:: :material-regular:`hourglass_empty;1.4em` Latency + + This measures the synchronous execution of inference requests and is reported in milliseconds. Each inference request (for example: preprocess, infer, postprocess) is allowed to complete before the next is started. This performance metric is relevant in usage scenarios where a single image input needs to be acted upon as soon as possible. An example would be the healthcare sector where medical personnel only request analysis of a single ultra sound scanning image or in real-time or near real-time applications for example an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. + + + +Platform & Configurations +#################################### + +For a listing of all platforms and configurations used for testing, refer to the following: + +.. button-link:: _static/benchmarks_files/platform_list_22.3.pdf + :color: primary + :outline: + + :material-regular:`download;1.5em` Click for Hardware Platforms [PDF] + +.. button-link:: _static/benchmarks_files/OV-2022.3-system-info-detailed.xlsx + :color: primary + :outline: + + :material-regular:`download;1.5em` Click for Configuration Details [XLSX] -.. raw:: html - -
- - Throughput - - - Value - - - Efficiency - - - Latency - -

- Measures the number of inferences delivered within a latency threshold. (for example, number of Frames Per Second - FPS). When deploying a system with deep learning inference, select the throughput that delivers the best trade-off between latency and power for the price and performance that meets your requirements. -

-

- While throughput is important, what is more critical in edge AI deployments is the performance efficiency or performance-per-cost. Application performance in throughput per dollar of system cost is the best measure of value. The value KPI is calculated as “Throughput measured as inferences per second / price of inference engine”. This means for a 2 socket system 2x the price of a CPU is used. Prices are as per date of benchmarking and sources can be found as links in the Hardware Platforms (PDF) description below. -

- System power is a key consideration from the edge to the data center. When selecting deep learning solutions, power efficiency (throughput/watt) is a critical factor to consider. Intel designs provide excellent power efficiency for running deep learning workloads. The efficiency KPI is calculated as “Throughput measured as inferences per second / TDP of inference engine”. This means for a 2 socket system 2x the power dissipation (TDP) of a CPU is used. TDP-values are as per date of benchmarking and sources can be found as links in the Hardware Platforms (PDF) description below. -

- This measures the synchronous execution of inference requests and is reported in milliseconds. Each inference request (for example: preprocess, infer, postprocess) is allowed to complete before the next is started. This performance metric is relevant in usage scenarios where a single image input needs to be acted upon as soon as possible. An example would be the healthcare sector where medical personnel only request analysis of a single ultra sound scanning image or in real-time or near real-time applications for example an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. -

-
- -

Platform & Configurations

-

For a listing of all platforms and configurations used for testing, refer to the following:

- - - - - -@endsphinxdirective This benchmark setup includes a single machine on which both the benchmark application and the OpenVINO™ installation reside. The presented performance benchmark numbers are based on the release 2022.3 of the Intel® Distribution of OpenVINO™ toolkit. The benchmark application loads the OpenVINO™ Runtime and executes inferences on the specified hardware (CPU, GPU or GNA). -It measures the time spent on actual inferencing (excluding any pre or post processing) and then reports on the inferences per second (or Frames Per Second). +It measures the time spent on actual inference (excluding any pre or post processing) and then reports on the inferences per second (or Frames Per Second). -## Disclaimers +Disclaimers +#################################### Intel® Distribution of OpenVINO™ toolkit performance benchmark numbers are based on release 2022.3. Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Learn more at intel.com, or from the OEM or retailer. Performance results are based on testing as of December 13, 2022 and may not reflect all publicly available updates. See configuration disclosure for details. No product can be absolutely secure. -Performance varies by use, configuration and other factors. Learn more at [www.intel.com/PerformanceIndex](https://www.intel.com/PerformanceIndex). +Performance varies by use, configuration and other factors. Learn more at `www.intel.com/PerformanceIndex <https://www.intel.com/PerformanceIndex>`__. Your costs and results may vary. Intel optimizations, for Intel compilers or other products, may not optimize to the same degree for non-Intel products. -© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. \ No newline at end of file +© Intel Corporation. 
Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. + +@endsphinxdirective \ No newline at end of file diff --git a/docs/benchmarks/performance_int8_vs_fp32.md b/docs/benchmarks/performance_int8_vs_fp32.md index 6e163daa310093..7faed00e38e8e1 100644 --- a/docs/benchmarks/performance_int8_vs_fp32.md +++ b/docs/benchmarks/performance_int8_vs_fp32.md @@ -1,4 +1,4 @@ -# Model Accuracy and Performance for INT8 and FP32 {#openvino_docs_performance_int8_vs_fp32} +# Model Accuracy {#openvino_docs_performance_int8_vs_fp32} The following table presents the absolute accuracy drop calculated as the accuracy difference between FP32 and INT8 representations of a model on two platforms diff --git a/docs/conf.py b/docs/conf.py index ff96fbc4312c2b..e1cbbd5c3fc28e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,7 +79,7 @@ html_theme_path = ['_themes'] html_theme_options = { - "navigation_depth": 6, + "navigation_depth": 8, "show_nav_level": 2, "use_edit_page_button": True, "github_url": "https://github.com/openvinotoolkit/openvino", diff --git a/docs/install_guides/installing-openvino-from-archive-linux.md b/docs/install_guides/installing-openvino-from-archive-linux.md index 3450584e01ff29..abb899a116d227 100644 --- a/docs/install_guides/installing-openvino-from-archive-linux.md +++ b/docs/install_guides/installing-openvino-from-archive-linux.md @@ -10,23 +10,54 @@ See the [Release Notes](https://software.intel.com/en-us/articles/OpenVINO-RelNo @sphinxdirective + .. tab:: System Requirements - | Full requirement listing is available in: - | `System Requirements Page `_ + | Full requirement listing is available in: + | `System Requirements Page `_ .. tab:: Processor Notes Processor graphics are not included in all processors. See `Product Specifications`_ for information about your processor. - + .. _Product Specifications: https://ark.intel.com/ .. 
tab:: Software * `CMake 3.13 or higher, 64-bit `_ - * GCC 7.5.0 (for Ubuntu 18.04) or GCC 9.3.0 (for Ubuntu 20.04) * `Python 3.7 - 3.10, 64-bit `_ + * GCC: + + .. tab:: Ubuntu 18.04 + + * GCC 7.5.0 + + .. tab:: Ubuntu 20.04 + + * GCC 9.3.0 + + .. tab:: RHEL 8 + + * GCC 8.4.1 + + .. tab:: CENTOS 7 + + * GCC 8.3.1 + Use the following instructions to install it: + Install GCC 8.3.1 via devtoolset-8 + + .. code-block:: sh + + sudo yum update -y && sudo yum install -y centos-release-scl epel-release + sudo yum install -y devtoolset-8 git patchelf + + Enable devtoolset-8 and check current gcc version + + .. code-block:: sh + + source /opt/rh/devtoolset-8/enable + gcc -v @endsphinxdirective diff --git a/docs/install_guides/uninstalling-openvino.md b/docs/install_guides/uninstalling-openvino.md index 504708d3b5007c..c6a6bceac1775b 100644 --- a/docs/install_guides/uninstalling-openvino.md +++ b/docs/install_guides/uninstalling-openvino.md @@ -1,12 +1,17 @@ # Uninstalling the Intel® Distribution of OpenVINO™ Toolkit {#openvino_docs_install_guides_uninstalling_openvino} -> **NOTE**: Uninstallation procedures remove all Intel® Distribution of OpenVINO™ Toolkit component files but don't affect user files in the installation directory. +@sphinxdirective + +.. note:: -## Uninstall Using the Original Installation Package + Uninstallation procedures remove all Intel® Distribution of OpenVINO™ Toolkit component files but don't affect user files in the installation directory. + +Uninstall Using the Original Installation Package +################################################# If you have installed OpenVINO Runtime from archive files, you can uninstall it by deleting the archive files and the extracted folders. -@sphinxdirective + .. tab:: Windows If you have created the symbolic link, remove the link first. @@ -15,25 +20,27 @@ If you have installed OpenVINO Runtime from archive files, you can uninstall it * Use Windows Explorer to remove the files. 
* Open a Command Prompt and run: - + .. code-block:: sh - + rmdir /s del - + .. tab:: Linux & macOS - + If you have created the symbolic link, remove the link first: .. code-block:: sh - - rm /home//intel/openvino_2022 + + rm /opt/intel/openvino_2022 To delete the files: .. code-block:: sh - + rm -r && rm + @endsphinxdirective + diff --git a/docs/ops/opset.md b/docs/ops/opset.md index 27a24a0ffef3dc..5e68cfd343bdc4 100644 --- a/docs/ops/opset.md +++ b/docs/ops/opset.md @@ -6,6 +6,7 @@ :maxdepth: 1 :hidden: + openvino_docs_ops_opset11 openvino_docs_ops_opset10 openvino_docs_ops_opset9 openvino_docs_ops_opset8 @@ -25,6 +26,7 @@ This topic provides a complete list of available sets of operations supported in | OpenVINO™ Version | Actual Operations Set | | :---------------- | :------------------------------- | +| 2023.0 | [opset11](opset11.md) | | 2022.3 | [opset10](opset10.md) | | 2022.2 | [opset9](opset9.md) | | 2022.1 | [opset8](opset8.md) | diff --git a/docs/ops/opset11.md b/docs/ops/opset11.md new file mode 100644 index 00000000000000..c8d2b3fae56377 --- /dev/null +++ b/docs/ops/opset11.md @@ -0,0 +1,187 @@ +# opset11 {#openvino_docs_ops_opset11} + +This specification document describes the `opset11` operation set supported in OpenVINO™. +Support for each particular operation from the list below depends on the capabilities of an inference plugin +and may vary among different hardware platforms and devices. Examples of operation instances are provided as IR V10 xml +snippets. Such IR is generated by the Model Optimizer. The semantics match corresponding nGraph operation classes +declared in `namespace opset11`. 
+ + +## Table of Contents + +* [Abs](arithmetic/Abs_1.md) +* [Acos](arithmetic/Acos_1.md) +* [Acosh](arithmetic/Acosh_3.md) +* [AdaptiveAvgPool](pooling/AdaptiveAvgPool_8.md) +* [AdaptiveMaxPool](pooling/AdaptiveMaxPool_8.md) +* [Add](arithmetic/Add_1.md) +* [Asin](arithmetic/Asin_1.md) +* [Asinh](arithmetic/Asinh_3.md) +* [Assign](infrastructure/Assign_3.md) +* [Atan](arithmetic/Atan_1.md) +* [Atanh](arithmetic/Atanh_3.md) +* [AvgPool](pooling/AvgPool_1.md) +* [BatchNormInference](normalization/BatchNormInference_5.md) +* [BatchToSpace](movement/BatchToSpace_2.md) +* [BinaryConvolution](convolution/BinaryConvolution_1.md) +* [Broadcast](movement/Broadcast_3.md) +* [Bucketize](condition/Bucketize_3.md) +* [CTCGreedyDecoder](sequence/CTCGreedyDecoder_1.md) +* [CTCGreedyDecoderSeqLen](sequence/CTCGreedyDecoderSeqLen_6.md) +* [CTCLoss](sequence/CTCLoss_4.md) +* [Ceiling](arithmetic/Ceiling_1.md) +* [Clamp](activation/Clamp_1.md) +* [Concat](movement/Concat_1.md) +* [Constant](infrastructure/Constant_1.md) +* [Convert](type/Convert_1.md) +* [ConvertLike](type/ConvertLike_1.md) +* [Convolution](convolution/Convolution_1.md) +* [ConvolutionBackpropData](convolution/ConvolutionBackpropData_1.md) +* [Cos](arithmetic/Cos_1.md) +* [Cosh](arithmetic/Cosh_1.md) +* [CumSum](arithmetic/CumSum_3.md) +* [DeformableConvolution](convolution/DeformableConvolution_8.md) +* [DeformablePSROIPooling](detection/DeformablePSROIPooling_1.md) +* [DepthToSpace](movement/DepthToSpace_1.md) +* [DetectionOutput](detection/DetectionOutput_8.md) +* [DFT](signals/DFT_7.md) +* [Divide](arithmetic/Divide_1.md) +* [Einsum](matrix/Einsum_7.md) +* [Elu](activation/Elu_1.md) +* [EmbeddingBagOffsetsSum](sparse/EmbeddingBagOffsetsSum_3.md) +* [EmbeddingBagPackedSum](sparse/EmbeddingBagPackedSum_3.md) +* [EmbeddingSegmentsSum](sparse/EmbeddingSegmentsSum_3.md) +* [Equal](comparison/Equal_1.md) +* [Erf](arithmetic/Erf_1.md) +* [Exp](activation/Exp_1.md) +* 
[ExperimentalDetectronDetectionOutput_6](detection/ExperimentalDetectronDetectionOutput_6.md) +* [ExperimentalDetectronGenerateProposalsSingleImage_6](detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md) +* [ExperimentalDetectronPriorGridGenerator_6](detection/ExperimentalDetectronPriorGridGenerator_6.md) +* [ExperimentalDetectronROIFeatureExtractor_6](detection/ExperimentalDetectronROIFeatureExtractor_6.md) +* [ExperimentalDetectronTopKROIs_6](sort/ExperimentalDetectronTopKROIs_6.md) +* [ExtractImagePatches](movement/ExtractImagePatches_3.md) +* [Eye](generation/Eye_9.md) +* [FakeQuantize](quantization/FakeQuantize_1.md) +* [Floor](arithmetic/Floor_1.md) +* [FloorMod](arithmetic/FloorMod_1.md) +* [Gather](movement/Gather_8.md) +* [GatherElements](movement/GatherElements_6.md) +* [GatherND](movement/GatherND_8.md) +* [GatherTree](movement/GatherTree_1.md) +* [Gelu](activation/GELU_7.md) +* [GenerateProposals](detection/GenerateProposals_9.md) +* [Greater](comparison/Greater_1.md) +* [GreaterEqual](comparison/GreaterEqual_1.md) +* [GridSample](image/GridSample_9.md) +* [GRN](normalization/GRN_1.md) +* [GroupConvolution](convolution/GroupConvolution_1.md) +* [GroupConvolutionBackpropData](convolution/GroupConvolutionBackpropData_1.md) +* [GRUCell](sequence/GRUCell_3.md) +* [GRUSequence](sequence/GRUSequence_5.md) +* [HardSigmoid](activation/HardSigmoid_1.md) +* [HSigmoid](activation/HSigmoid_5.md) +* [HSwish](activation/HSwish_4.md) +* [IDFT](signals/IDFT_7.md) +* [I420toBGR](image/I420toBGR_8.md) +* [I420toRGB](image/I420toRGB_8.md) +* [If](condition/If_8.md) +* [Interpolate](image/Interpolate_4.md) +* [IRDFT](signals/IRDFT_9.md) +* [IsInf](comparison/IsInf_10.md) +* [IsNaN](comparison/IsNaN_10.md) +* [Less](comparison/Less_1.md) +* [LessEqual](comparison/LessEqual_1.md) +* [Log](arithmetic/Log_1.md) +* [LogicalAnd](logical/LogicalAnd_1.md) +* [LogicalNot](logical/LogicalNot_1.md) +* [LogicalOr](logical/LogicalOr_1.md) +* 
[LogicalXor](logical/LogicalXor_1.md) +* [LogSoftmax](activation/LogSoftmax_5.md) +* [Loop](infrastructure/Loop_5.md) +* [LRN](normalization/LRN_1.md) +* [LSTMCell](sequence/LSTMCell_1.md) +* [LSTMSequence](sequence/LSTMSequence_1.md) +* [MatMul](matrix/MatMul_1.md) +* [MatrixNMS](sort/MatrixNMS_8.md) +* [MaxPool](pooling/MaxPool_8.md) +* [Maximum](arithmetic/Maximum_1.md) +* [Minimum](arithmetic/Minimum_1.md) +* [Mish](activation/Mish_4.md) +* [Mod](arithmetic/Mod_1.md) +* [MVN](normalization/MVN_6.md) +* [MulticlassNMS](sort/MulticlassNonMaxSuppression_9.md) +* [Multiply](arithmetic/Multiply_1.md) +* [Negative](arithmetic/Negative_1.md) +* [NonMaxSuppression](sort/NonMaxSuppression_5.md) +* [NonZero](condition/NonZero_3.md) +* [NormalizeL2](normalization/NormalizeL2_1.md) +* [NotEqual](comparison/NotEqual_1.md) +* [NV12toBGR](image/NV12toBGR_8.md) +* [NV12toRGB](image/NV12toRGB_8.md) +* [OneHot](sequence/OneHot_1.md) +* [Pad](movement/Pad_1.md) +* [Parameter](infrastructure/Parameter_1.md) +* [Power](arithmetic/Power_1.md) +* [PReLU](activation/PReLU_1.md) +* [PriorBoxClustered](detection/PriorBoxClustered_1.md) +* [PriorBox](detection/PriorBox_8.md) +* [Proposal](detection/Proposal_4.md) +* [PSROIPooling](detection/PSROIPooling_1.md) +* [RandomUniform](generation/RandomUniform_8.md) +* [Range](generation/Range_4.md) +* [RDFT](signals/RDFT_9.md) +* [ReLU](activation/ReLU_1.md) +* [ReadValue](infrastructure/ReadValue_3.md) +* [ReduceL1](reduction/ReduceL1_4.md) +* [ReduceL2](reduction/ReduceL2_4.md) +* [ReduceLogicalAnd](reduction/ReduceLogicalAnd_1.md) +* [ReduceLogicalOr](reduction/ReduceLogicalOr_1.md) +* [ReduceMax](reduction/ReduceMax_1.md) +* [ReduceMean](reduction/ReduceMean_1.md) +* [ReduceMin](reduction/ReduceMin_1.md) +* [ReduceProd](reduction/ReduceProd_1.md) +* [ReduceSum](reduction/ReduceSum_1.md) +* [RegionYolo](detection/RegionYolo_1.md) +* [ReorgYolo](detection/ReorgYolo_1.md) +* [Reshape](shape/Reshape_1.md) +* [Result](infrastructure/Result_1.md) 
+* [ReverseSequence](movement/ReverseSequence_1.md) +* [RNNCell](sequence/RNNCell_3.md) +* [RNNSequence](sequence/RNNSequence_5.md) +* [ROIAlign](detection/ROIAlign_9.md) +* [ROIPooling](detection/ROIPooling_1.md) +* [Roll](movement/Roll_7.md) +* [Round](arithmetic/Round_5.md) +* [ScatterElementsUpdate](movement/ScatterElementsUpdate_3.md) +* [ScatterNDUpdate](movement/ScatterNDUpdate_3.md) +* [ScatterUpdate](movement/ScatterUpdate_3.md) +* [Select](condition/Select_1.md) +* [Selu](activation/Selu_1.md) +* [ShapeOf](shape/ShapeOf_3.md) +* [ShuffleChannels](movement/ShuffleChannels_1.md) +* [Sigmoid](activation/Sigmoid_1.md) +* [Sign](arithmetic/Sign_1.md) +* [Sin](arithmetic/Sin_1.md) +* [Sinh](arithmetic/Sinh_1.md) +* [Slice](movement/Slice_8.md) +* [SoftMax](activation/SoftMax_8.md) +* [SoftPlus](activation/SoftPlus_4.md) +* [SoftSign](activation/SoftSign_9.md) +* [SpaceToBatch](movement/SpaceToBatch_2.md) +* [SpaceToDepth](movement/SpaceToDepth_1.md) +* [Split](movement/Split_1.md) +* [Sqrt](arithmetic/Sqrt_1.md) +* [SquaredDifference](arithmetic/SquaredDifference_1.md) +* [Squeeze](shape/Squeeze_1.md) +* [StridedSlice](movement/StridedSlice_1.md) +* [Subtract](arithmetic/Subtract_1.md) +* [Swish](activation/Swish_4.md) +* [Tan](arithmetic/Tan_1.md) +* [Tanh](arithmetic/Tanh_1.md) +* [TensorIterator](infrastructure/TensorIterator_1.md) +* [Tile](movement/Tile_1.md) +* [TopK](sort/TopK_11.md) +* [Transpose](movement/Transpose_1.md) +* [Unique](movement/Unique_10.md) +* [Unsqueeze](shape/Unsqueeze_1.md) +* [VariadicSplit](movement/VariadicSplit_1.md) diff --git a/docs/ops/sort/TopK_1.md b/docs/ops/sort/TopK_1.md index 824ae65fa2876c..b1ad91b4b791f7 100644 --- a/docs/ops/sort/TopK_1.md +++ b/docs/ops/sort/TopK_1.md @@ -51,7 +51,7 @@ **Detailed Description** -Output tensor is populated by values computes in the following way: +The output tensor is populated by values computed in the following way: output[i1, ..., i(axis-1), j, i(axis+1) ..., iN] = top_k(input[i1, 
...., i(axis-1), :, i(axis+1), ..., iN]), k, sort, mode) @@ -59,7 +59,7 @@ So for each slice `input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]` which repr Sorting and minimum/maximum are controlled by `sort` and `mode` attributes: * *mode*=`max`, *sort*=`value` - descending by value - * *mode*=`max`, *sort*=`index` - ascending by index + * *mode*=`max`, *sort*=`index` - descending by index * *mode*=`max`, *sort*=`none` - undefined * *mode*=`min`, *sort*=`value` - ascending by value * *mode*=`min`, *sort*=`index` - ascending by index diff --git a/docs/ops/sort/TopK_11.md b/docs/ops/sort/TopK_11.md new file mode 100644 index 00000000000000..f96007704da53e --- /dev/null +++ b/docs/ops/sort/TopK_11.md @@ -0,0 +1,118 @@ +# TopK {#openvino_docs_ops_sort_TopK_11} + +**Versioned name**: *TopK-11* + +**Category**: *Sorting and maximization* + +**Short description**: *TopK* computes indices and values of the *k* maximum/minimum values for each slice along a specified axis. + +**Attributes** + +* *axis* + + * **Description**: Specifies the axis along which the values are retrieved. + * **Range of values**: An integer. Negative values means counting dimension from the back. + * **Type**: `int` + * **Required**: *yes* + +* *mode* + + * **Description**: Specifies whether *TopK* selects the largest or the smallest elements from each slice. + * **Range of values**: "min", "max" + * **Type**: `string` + * **Required**: *yes* + +* *sort* + + * **Description**: Specifies the order of corresponding elements of the output tensor. + * **Range of values**: `value`, `index`, `none` + * **Type**: `string` + * **Required**: *yes* + +* *stable* + + * **Description**: Specifies whether the equivalent elements should maintain their relative order from the input tensor. Takes effect only if the `sort` attribute is set to `value`. 
+ * **Range of values**: `true` or `false` + * **Type**: `boolean` + * **Default value**: `false` + * **Required**: *no* + +* *index_element_type* + + * **Description**: the type of output tensor with indices + * **Range of values**: "i64" or "i32" + * **Type**: string + * **Default value**: "i32" + * **Required**: *no* + + +**Inputs**: + +* **1**: tensor with arbitrary rank and type *T*. **Required.** + +* **2**: The value of *K* - a scalar of any integer type that specifies how many elements from the input tensor should be selected. The accepted range of values of *K* is `<1;input1.shape[axis]>`. The behavior of this operator is undefined if the value of *K* does not meet those requirements. **Required.** + +**Outputs**: + +* **1**: Output tensor of type *T* with *k* values from the input tensor along a specified *axis*. The shape of the tensor is `[input1.shape[0], ..., input1.shape[axis-1], 1..k, input1.shape[axis+1], ..., input1.shape[input1.rank - 1]]`. + +* **2**: Output tensor containing indices of the corresponding elements(values) from the first output tensor. The indices point to the location of selected values in the original input tensor. The shape of this output tensor is the same as the shape of the first output, that is `[input1.shape[0], ..., input1.shape[axis-1], 1..k, input1.shape[axis+1], ..., input1.shape[input1.rank - 1]]`. The type of this tensor *T_IND* is controlled by the `index_element_type` attribute. + +**Types** + +* *T*: any numeric type. + +* *T_IND*: `int64` or `int32`. + +**Detailed Description** + +The output tensor is populated by values computed in the following way: + + output[i1, ..., i(axis-1), j, i(axis+1) ..., iN] = top_k(input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]), k, sort, mode) + +meaning that for each slice `input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]` the *TopK* values are computed individually. 
+ +Sorting and minimum/maximum are controlled by `sort` and `mode` attributes with additional configurability provided by `stable`: + * *sort*=`value`, *mode*=`max`, *stable*=`false` - descending by value, relative order of equal elements not guaranteed to be maintained + * *sort*=`value`, *mode*=`max`, *stable*=`true` - descending by value, relative order of equal elements guaranteed to be maintained + * *sort*=`value`, *mode*=`min`, *stable*=`false` - ascending by value, relative order of equal elements not guaranteed to be maintained + * *sort*=`value`, *mode*=`min`, *stable*=`true` - ascending by value, relative order of equal elements guaranteed to be maintained + * *sort*=`index`, *mode*=`max` - descending by index + * *sort*=`index`, *mode*=`min` - ascending by index + * *sort*=`none` , *mode*=`max` - undefined + * *sort*=`none` , *mode*=`min` - undefined + +The relative order of equivalent elements is only preserved if the *stable* attribute is set to `true`. This makes the implementation use a stable sorting algorithm during the computation of TopK elements. Otherwise, the output order is undefined. 
+ +**Example** + +This example assumes that `K` is equal to 10: + +```xml + + + + + 1 + 3 + 224 + 224 + + + + + + 1 + 3 + 224 + 10 + + + 1 + 3 + 224 + 10 + + + +``` diff --git a/docs/ops/sort/TopK_3.md b/docs/ops/sort/TopK_3.md index d5d4d3a4085b36..2ad37b24cfbb7d 100644 --- a/docs/ops/sort/TopK_3.md +++ b/docs/ops/sort/TopK_3.md @@ -58,7 +58,7 @@ **Detailed Description** -Output tensor is populated by values computes in the following way: +The output tensor is populated by values computed in the following way: output[i1, ..., i(axis-1), j, i(axis+1) ..., iN] = top_k(input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]), k, sort, mode) @@ -66,7 +66,7 @@ So for each slice `input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]` which repr Sorting and minimum/maximum are controlled by `sort` and `mode` attributes: * *mode*=`max`, *sort*=`value` - descending by value - * *mode*=`max`, *sort*=`index` - ascending by index + * *mode*=`max`, *sort*=`index` - descending by index * *mode*=`max`, *sort*=`none` - undefined * *mode*=`min`, *sort*=`value` - ascending by value * *mode*=`min`, *sort*=`index` - ascending by index diff --git a/docs/optimization_guide/nncf/filter_pruning.md b/docs/optimization_guide/nncf/filter_pruning.md index 726482a311ee1c..7633d2e2400751 100644 --- a/docs/optimization_guide/nncf/filter_pruning.md +++ b/docs/optimization_guide/nncf/filter_pruning.md @@ -1,183 +1,227 @@ # Filter Pruning of Convolutional Models {#filter_pruning} -## Introduction -Filter pruning is an advanced optimization method which allows reducing computational complexity of the model by removing redundant or unimportant filters from convolutional operations of the model. This removal is done in two steps: +@sphinxdirective + +Introduction +#################### + +Filter pruning is an advanced optimization method which allows reducing computational complexity of the model by removing +redundant or unimportant filters from convolutional operations of the model. 
This removal is done in two steps: + 1. Unimportant filters are zeroed out by the NNCF optimization with fine-tuning. -2. Zero filters are removed from the model during the export to OpenVINO™ Intermediate Representation (IR). -Filter Pruning method from the NNCF can be used stand-alone but we usually recommend to stack it with 8-bit quantization for two reasons. First, 8-bit quantization is the best method in terms of achieving the highest accuracy-performance trade-offs so stacking it with filter pruning can give even better performance results. Second, applying quantization along with filter pruning does not hurt accuracy a lot since filter pruning removes noisy filters from the model which narrows down values ranges of weights and activations and helps to reduce overall quantization error. +2. Zero filters are removed from the model during the export to OpenVINO Intermediate Representation (IR). + + +Filter Pruning method from the NNCF can be used stand-alone but we usually recommend to stack it with 8-bit quantization for +two reasons. First, 8-bit quantization is the best method in terms of achieving the highest accuracy-performance trade-offs so +stacking it with filter pruning can give even better performance results. Second, applying quantization along with filter +pruning does not hurt accuracy a lot since filter pruning removes noisy filters from the model which narrows down values +ranges of weights and activations and helps to reduce overall quantization error. + +.. note:: + Filter Pruning usually requires a long fine-tuning or retraining of the model which can be comparable to training the + model from scratch. Otherwise, a large accuracy degradation can be caused. Therefore, the training schedule should be + adjusted accordingly when applying this method. + -> **NOTE**: Filter Pruning usually requires a long fine-tuning or retraining of the model which can be comparable to training the model from scratch. 
Otherwise, a large accuracy degradation can be caused. Therefore, the training schedule should be adjusted accordingly when applying this method. Below, we provide the steps that are required to apply Filter Pruning + QAT to the model: -## Applying Filter Pruning with fine-tuning -Here, we show the basic steps to modify the training script for the model and use it to zero out unimportant filters: -### 1. Import NNCF API -In this step, NNCF-related imports are added in the beginning of the training script: +Applying Filter Pruning with fine-tuning +######################################## -@sphinxtabset +Here, we show the basic steps to modify the training script for the model and use it to zero out unimportant filters: -@sphinxtab{PyTorch} +1. Import NNCF API +++++++++++++++++++ -@snippet docs/optimization_guide/nncf/code/pruning_torch.py imports +In this step, NNCF-related imports are added in the beginning of the training script: -@endsphinxtab +.. tab:: PyTorch -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [imports] -@snippet docs/optimization_guide/nncf/code/pruning_tf.py imports +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [imports] -@endsphinxtabset +2. Create NNCF configuration +++++++++++++++++++++++++++++ -### 2. Create NNCF configuration -Here, you should define NNCF configuration which consists of model-related parameters (`"input_info"` section) and parameters of optimization methods (`"compression"` section). +Here, you should define NNCF configuration which consists of model-related parameters (`"input_info"` section) and parameters +of optimization methods (`"compression"` section). -@sphinxtabset +.. tab:: PyTorch -@sphinxtab{PyTorch} + .. 
doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [nncf_congig] -@snippet docs/optimization_guide/nncf/code/pruning_torch.py nncf_congig +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [nncf_congig] -@sphinxtab{TensorFlow 2} +Here is a brief description of the required parameters of the Filter Pruning method. For full description refer to the +`GitHub <https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Pruning.md>`__ page. -@snippet docs/optimization_guide/nncf/code/pruning_tf.py nncf_congig +* ``pruning_init`` - initial pruning rate target. For example, value ``0.1`` means that at the beginning of training, convolutions that can be pruned will have 10% of their filters set to zero. -@endsphinxtab +* ``pruning_target`` - pruning rate target at the end of the schedule. For example, the value ``0.5`` means that at the epoch with the number of ``num_init_steps + pruning_steps``, convolutions that can be pruned will have 50% of their filters set to zero. -@endsphinxtabset +* ``pruning_steps`` - the number of epochs during which the pruning rate target is increased from ``pruning_init`` to ``pruning_target`` value. We recommend to keep the highest learning rate during this period. -Here is a brief description of the required parameters of the Filter Pruning method. For full description refer to the [GitHub](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Pruning.md) page. -- `pruning_init` - initial pruning rate target. For example, value `0.1` means that at the begging of training, convolutions that can be pruned will have 10% of their filters set to zero. -- `pruning_target` - pruning rate target at the end of the schedule. For example, the value `0.5` means that at the epoch with the number of `num_init_steps + pruning_steps`, convolutions that can be pruned will have 50% of their filters set to zero.
-- `pruning_steps` - the number of epochs during which the pruning rate target is increased from `pruning_init` to `pruning_target` value. We recommend to keep the highest learning rate during this period. -### 3. Apply optimization methods -In the next step, the original model is wrapped by the NNCF object using the `create_compressed_model()` API using the configuration defined in the previous step. This method returns a so-called compression controller and the wrapped model that can be used the same way as the original model. It is worth noting that optimization methods are applied at this step so that the model undergoes a set of corresponding transformations and can contain additional operations required for the optimization. -@sphinxtabset +3. Apply optimization methods ++++++++++++++++++++++++++++++ -@sphinxtab{PyTorch} +In the next step, the original model is wrapped by the NNCF object using the ``create_compressed_model()`` API using the +configuration defined in the previous step. This method returns a so-called compression controller and the wrapped model +that can be used the same way as the original model. It is worth noting that optimization methods are applied at this step +so that the model undergoes a set of corresponding transformations and can contain additional operations required for the +optimization. -@snippet docs/optimization_guide/nncf/code/pruning_torch.py wrap_model -@endsphinxtab +.. tab:: PyTorch -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [wrap_model] -@snippet docs/optimization_guide/nncf/code/pruning_tf.py wrap_model +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [wrap_model] -@endsphinxtabset -### 4. Fine-tune the model -This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. 
In the case of Filter Pruning method we recommend using the training schedule and learning rate similar to what was used for the training of original model. +4. Fine-tune the model +++++++++++++++++++++++ -@sphinxtabset +This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the case +of Filter Pruning method we recommend using the training schedule and learning rate similar to what was used for the training +of original model. -@sphinxtab{PyTorch} -@snippet docs/optimization_guide/nncf/code/pruning_torch.py tune_model +.. tab:: PyTorch -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [tune_model] -@sphinxtab{TensorFlow 2} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/pruning_tf.py tune_model + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [tune_model] -@endsphinxtab -@endsphinxtabset +5. Multi-GPU distributed training ++++++++++++++++++++++++++++++++++ -### 5. Multi-GPU distributed training -In the case of distributed multi-GPU training (not DataParallel), you should call `compression_ctrl.distributed()` before the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@sphinxtabset +In the case of distributed multi-GPU training (not DataParallel), you should call ``compression_ctrl.distributed()`` before the +fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@sphinxtab{PyTorch} -@snippet docs/optimization_guide/nncf/code/qat_torch.py distributed +.. tab:: PyTorch -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [distributed] -@sphinxtab{TensorFlow 2} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/qat_tf.py distributed + .. 
doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [distributed] -@endsphinxtab -@endsphinxtabset +6. Export quantized model ++++++++++++++++++++++++++ -### 6. Export quantized model -When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in the case of PyTorch and frozen graph - for TensorFlow 2. +When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in +the case of PyTorch and frozen graph - for TensorFlow 2. -@sphinxtabset -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py export + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [export] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [export] -@snippet docs/optimization_guide/nncf/code/qat_tf.py export -@endsphinxtab +These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model +checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. -@endsphinxtabset -These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. +7. (Optional) Save checkpoint ++++++++++++++++++++++++++++++ -### 7. (Optional) Save checkpoint To save model checkpoint use the following API: -@sphinxtabset -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py save_checkpoint + .. 
doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [save_checkpoint] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [save_checkpoint] -@snippet docs/optimization_guide/nncf/code/qat_tf.py save_checkpoint -@endsphinxtab +8. (Optional) Restore from checkpoint ++++++++++++++++++++++++++++++++++++++ -@endsphinxtabset - -### 8. (Optional) Restore from checkpoint To restore the model from checkpoint you should use the following API: -@sphinxtabset - -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py load_checkpoint + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [load_checkpoint] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [load_checkpoint] -@snippet docs/optimization_guide/nncf/code/qat_tf.py load_checkpoint -@endsphinxtab +For more details on saving/loading checkpoints in the NNCF, see the following +`documentation <https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models>`__. -@endsphinxtabset +Deploying pruned model +###################### -For more details on saving/loading checkpoints in the NNCF, see the following [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models). +The pruned model requires an extra step that should be done to get performance improvement. This step involves removal of the +zero filters from the model. This is done at the model conversion step using :doc:`Model Optimizer <openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide>` tool when model is converted from the framework representation (ONNX, TensorFlow, etc.) to OpenVINO Intermediate Representation. -## Deploying pruned model -The pruned model requres an extra step that should be done to get performance improvement.
This step involves removal of the zero filters from the model. This is done at the model convertion step using [Model Optimizer](@ref openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide) tool when model is converted from the framework representation (ONNX, TensorFlow, etc.) to OpenVINO Intermediate Representation. -- To remove zero filters from the pruned model add the following parameter to the model convertion command: `--transform=Pruning` +* To remove zero filters from the pruned model add the following parameter to the model conversion command: ``--transform=Pruning`` After that the model can be deployed with OpenVINO in the same way as the baseline model. -For more details about model deployment with OpenVINO, see the corresponding [documentation](../../OV_Runtime_UG/openvino_intro.md). +For more details about model deployment with OpenVINO, see the corresponding :doc:`documentation `. + + +Examples +#################### + +* `PyTorch Image Classification example <https://github.com/openvinotoolkit/nncf/blob/develop/examples/torch/classification>`__ + +* `TensorFlow Image Classification example <https://github.com/openvinotoolkit/nncf/tree/develop/examples/tensorflow/classification>`__ -## Examples -- [PyTorch Image Classiication example](https://github.com/openvinotoolkit/nncf/blob/develop/examples/torch/classification) -- [TensorFlow Image Classification example](https://github.com/openvinotoolkit/nncf/tree/develop/examples/tensorflow/classification) \ No newline at end of file +@endsphinxdirective diff --git a/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md b/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md index 38831daac02f04..2f315c04705fbd 100644 --- a/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md +++ b/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md @@ -1,135 +1,161 @@ # Basic Quantization Flow {#basic_qauntization_flow} -## Introduction +@sphinxdirective + +Introduction +#################### The basic quantization flow is the simplest way to apply 8-bit quantization to the model.
It is available for models in the following frameworks: PyTorch, TensorFlow 2.x, ONNX, and OpenVINO. The basic quantization flow is based on the following steps: + * Set up an environment and install dependencies. * Prepare the **calibration dataset** that is used to estimate quantization parameters of the activations within the model. * Call the quantization API to apply 8-bit quantization to the model. -## Set up an Environment +Set up an Environment +##################### It is recommended to set up a separate Python environment for quantization with NNCF. To do this, run the following command: -```bash -python3 -m venv nncf_ptq_env -``` -Install all the packages required to instantiate the model object, for example, DL framework. After that, install NNCF on top of the environment: -```bash -pip install nncf -``` -## Prepare a Calibration Dataset +.. code-block:: sh -At this step, create an instance of the `nncf.Dataset` class that represents the calibration dataset. The `nncf.Dataset` class can be a wrapper over the framework dataset object that is used for model training or validation. The class constructor receives the dataset object and the transformation function. For example, if you use PyTorch, you can pass an instance of the `torch.utils.data.DataLoader` object. + python3 -m venv nncf_ptq_env -The transformation function is a function that takes a sample from the dataset and returns data that can be passed to the model for inference. For example, this function can take a tuple of a data tensor and labels tensor, and return the former while ignoring the latter. The transformation function is used to avoid modifying the dataset code to make it compatible with the quantization API. The function is applied to each sample from the dataset before passing it to the model for inference. 
The following code snippet shows how to create an instance of the `nncf.Dataset` class: +Install all the packages required to instantiate the model object, for example, DL framework. After that, install NNCF on top of the environment: -@sphinxtabset +.. code-block:: sh -@sphinxtab{PyTorch} + pip install nncf -@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py dataset +Prepare a Calibration Dataset +############################# -@endsphinxtab +At this step, create an instance of the ``nncf.Dataset`` class that represents the calibration dataset. The ``nncf.Dataset`` class can be a wrapper over the framework dataset object that is used for model training or validation. The class constructor receives the dataset object and the transformation function. For example, if you use PyTorch, you can pass an instance of the ``torch.utils.data.DataLoader`` object. -@sphinxtab{ONNX} +The transformation function is a function that takes a sample from the dataset and returns data that can be passed to the model for inference. For example, this function can take a tuple of a data tensor and labels tensor, and return the former while ignoring the latter. The transformation function is used to avoid modifying the dataset code to make it compatible with the quantization API. The function is applied to each sample from the dataset before passing it to the model for inference. The following code snippet shows how to create an instance of the ``nncf.Dataset`` class: -@snippet docs/optimization_guide/nncf/ptq/code/ptq_onnx.py dataset +.. tab:: PyTorch -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_torch.py + :language: python + :fragment: [dataset] -@sphinxtab{OpenVINO} +.. tab:: ONNX -@snippet docs/optimization_guide/nncf/ptq/code/ptq_openvino.py dataset + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_onnx.py + :language: python + :fragment: [dataset] -@endsphinxtab +.. tab:: OpenVINO -@sphinxtab{TensorFlow} + .. 
doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_openvino.py + :language: python + :fragment: [dataset] -@snippet docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py dataset +.. tab:: TensorFlow -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py + :language: python + :fragment: [dataset] -@endsphinxtabset -If there is no framework dataset object, you can create your own entity that implements the `Iterable` interface in Python and returns data samples feasible for inference. In this case, a transformation function is not required. +If there is no framework dataset object, you can create your own entity that implements the ``Iterable`` interface in Python and returns data samples feasible for inference. In this case, a transformation function is not required. -## Run a Quantized Model +Run a Quantized Model +##################### Once the dataset is ready and the model object is instantiated, you can apply 8-bit quantization to it: -@sphinxtabset -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py quantization + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_torch.py + :language: python + :fragment: [quantization] -@endsphinxtab +.. tab:: ONNX -@sphinxtab{ONNX} + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_onnx.py + :language: python + :fragment: [quantization] -@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py quantization +.. tab:: OpenVINO -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_openvino.py + :language: python + :fragment: [quantization] -@sphinxtab{OpenVINO} +.. tab:: TensorFlow -@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py quantization + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py + :language: python + :fragment: [quantization] -@endsphinxtab -@sphinxtab{TensorFlow} +.. 
note:: The ``model`` is an instance of the ``torch.nn.Module`` class for PyTorch, ``onnx.ModelProto`` for ONNX, and ``openvino.runtime.Model`` for OpenVINO. -@snippet docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py quantization +After that the model can be exported into the OpenVINO Intermediate Representation if needed and run faster with OpenVINO. -@endsphinxtab +Tune quantization parameters +############################ -@endsphinxtabset +``nncf.quantize()`` function has several parameters that allow to tune quantization process to get more accurate model. Below is the list of parameters and their description: -> **NOTE**: The `model` is an instance of the `torch.nn.Module` class for PyTorch, `onnx.ModelProto` for ONNX, and `openvino.runtime.Model` for OpenVINO. +* ``model_type`` - used to specify quantization scheme required for specific type of the model. For example, **Transformer** models (BERT, distillBERT, etc.) require a special quantization scheme to preserve accuracy after quantization. -After that the model can be exported into th OpenVINO Intermediate Representation if needed and run faster with OpenVINO. + .. code-block:: python + + nncf.quantize(model, dataset, model_type=nncf.ModelType.Transformer) + +* ``preset`` - defines quantization scheme for the model. Two types of presets are available: + + * ``PERFORMANCE`` (default) - defines symmetric quantization of weights and activations + * ``MIXED`` - weights are quantized with symmetric quantization and the activations are quantized with asymmetric quantization. This preset is recommended for models with non-ReLU and asymmetric activation functions, e.g. ELU, PReLU, GELU, etc. + + .. code-block:: python + + nncf.quantize(model, dataset, preset=nncf.Preset.MIXED) + +* ``fast_bias_correction`` - enables more accurate bias (error) correction algorithm that can be used to improve accuracy of the model. This parameter is available only for OpenVINO representation. ``True`` is used by default. + + ..
code-block:: python + + nncf.quantize(model, dataset, fast_bias_correction=False) + +* ``subset_size`` - defines the number of samples from the calibration dataset that will be used to estimate quantization parameters of activations. The default value is 300. + + .. code-block:: python + + nncf.quantize(model, dataset, subset_size=1000) + +* ``ignored_scope`` - this parameter can be used to exclude some layers from quantization process. For example, you may want to exclude the last layer of the model from quantization. Below are some examples of how to use this parameter: -## Tune quantization parameters - -`nncf.quantize()` function has several parameters that allow to tune quantization process to get more accurate model. Below is the list of parameters and their description: -* `model_type` - used to specify quantization scheme required for specific type of the model. For example, **Transformer** models (BERT, distillBERT, etc.) require a special quantization scheme to preserve accuracy after quantization. - ```python - nncf.quantize(model, dataset, model_type=nncf.ModelType.Transformer) - ``` -* `preset` - defines quantization scheme for the model. Two types of presets are available: - * `PERFORMANCE` (default) - defines symmetric quantization of weigths and activations - * `MIXED` - weights are quantized with symmetric quantization and the activations are quantized with asymmetric quantization. This preset is recommended for models with non-ReLU and asymmetric activation funstions, e.g. ELU, PReLU, GELU, etc. - ```python - nncf.quantize(model, dataset, preset=nncf.Preset.MIXED) - ``` -* `fast_bias_correction` - enables more accurate bias (error) correction algorithm that can be used to improve accuracy of the model. This parameter is available only for OpenVINO representation. `True` is used by default.
- ```python - nncf.quantize(model, dataset, fast_bias_correction=False) - ``` -* `subset_size` - defines the number of samples from the calibration dataset that will be used to estimate quantization parameters of activations. The default value is 300. - ```python - nncf.quantize(model, dataset, subset_size=1000) - ``` -* `ignored_scope` - this parameter can be used to exclude some layers from quantization process. For example, if you want to exclude the last layer of the model from quantization. Below are some examples of how to use this parameter: * Exclude by layer name: - ```python - names = ['layer_1', 'layer_2', 'layer_3'] - nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(names=names)) - ``` + + .. code-block:: python + + names = ['layer_1', 'layer_2', 'layer_3'] + nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(names=names)) + * Exclude by layer type: - ```python - types = ['Conv2d', 'Linear'] - nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(types=types)) - ``` + + .. code-block:: python + + types = ['Conv2d', 'Linear'] + nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(types=types)) + * Exclude by regular expression: - ```python - regex = '.*layer_.*' - nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(patterns=regex)) - ``` -If the accuracy of the quantized model is not satisfactory, you can try to use the [Quantization with accuracy control](@ref quantization_w_accuracy_control) flow. + .. code-block:: python + + regex = '.*layer_.*' + nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(patterns=regex)) + + +If the accuracy of the quantized model is not satisfactory, you can try to use the :doc:`Quantization with accuracy control <quantization_w_accuracy_control>` flow.
+ +See also +#################### -## See also +* `Example of basic quantization flow in PyTorch `__ -* [Example of basic quantization flow in PyTorch](https://github.com/openvinotoolkit/nncf/tree/develop/examples/post_training_quantization/torch/mobilenet_v2) \ No newline at end of file +@endsphinxdirective diff --git a/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md b/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md index 03ddd9f99ac3a5..65d5ede50e4d8e 100644 --- a/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md +++ b/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md @@ -1,66 +1,64 @@ # Quantizing with accuracy control {#quantization_w_accuracy_control} -## Introduction +@sphinxdirective -This is the advanced quantization flow that allows to apply 8-bit quantization to the model with control of accuracy metric. This is achieved by keeping the most impactful operations within the model in the original precision. The flow is based on the [Basic 8-bit quantization](@ref basic_qauntization_flow) and has the following differences: -* Besided the calibration dataset, a **validation dataset** is required to compute accuracy metric. They can refer to the same data in the simplest case. -* **Validation function**, used to compute accuracy metric is required. It can be a function that is already available in the source framework or a custom function. -* Since accuracy validation is run several times during the quantization process, quantization with accuracy control can take more time than the [Basic 8-bit quantization](@ref basic_qauntization_flow) flow. -* The resulted model can provide smaller performance improvement than the [Basic 8-bit quantization](@ref basic_qauntization_flow) flow because some of the operations are kept in the original precision. - -> **NOTE**: Currently, this flow is available only for models in OpenVINO representation. 
- -The steps for the quantizatation with accuracy control are described below. +Introduction +#################### -## Prepare datasets +This is the advanced quantization flow that allows to apply 8-bit quantization to the model with control of accuracy metric. This is achieved by keeping the most impactful operations within the model in the original precision. The flow is based on the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` and has the following differences: -This step is similar to the [Basic 8-bit quantization](@ref basic_qauntization_flow) flow. The only difference is that two datasets, calibration and validation, are required. - -@sphinxtabset - -@sphinxtab{OpenVINO} +* Besides the calibration dataset, a **validation dataset** is required to compute accuracy metric. They can refer to the same data in the simplest case. +* **Validation function**, used to compute accuracy metric, is required. It can be a function that is already available in the source framework or a custom function. +* Since accuracy validation is run several times during the quantization process, quantization with accuracy control can take more time than the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` flow. +* The resulting model can provide smaller performance improvement than the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` flow because some of the operations are kept in the original precision. -@snippet docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py dataset +.. note:: Currently, this flow is available only for models in OpenVINO representation. -@endsphinxtab +The steps for the quantization with accuracy control are described below. -@endsphinxtabset +Prepare datasets +#################### -## Prepare validation function +This step is similar to the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` flow. The only difference is that two datasets, calibration and validation, are required.
-Validation funtion receives `openvino.runtime.CompiledModel` object and -validation dataset and returns accuracy metric value. The following code snippet shows an example of validation function for OpenVINO model: +.. tab:: OpenVINO -@sphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py + :language: python + :fragment: [dataset] -@sphinxtab{OpenVINO} -@snippet docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py validation +Prepare validation function +########################### -@endsphinxtab +Validation function receives ``openvino.runtime.CompiledModel`` object and validation dataset and returns accuracy metric value. The following code snippet shows an example of validation function for OpenVINO model: -@endsphinxtabset +.. tab:: OpenVINO -## Run quantization with accuracy control + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py + :language: python + :fragment: [validation] -Now, you can run quantization with accuracy control. The following code snippet shows an example of quantization with accuracy control for OpenVINO model: -@sphinxtabset +Run quantization with accuracy control -@sphinxtab{OpenVINO} +Now, you can run quantization with accuracy control. The following code snippet shows an example of quantization with accuracy control for OpenVINO model: -@snippet docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py quantization +.. tab:: OpenVINO -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py + :language: python + :fragment: [quantization] -@endsphinxtabset -`max_drop` defines the accuracy drop threshold. The quantization process stops when the degradation of accuracy metric on the validation dataset is less than the `max_drop`.
-`nncf.quantize_with_accuracy_control()` API supports all the parameters of `nncf.quantize()` API. For example, you can use `nncf.quantize_with_accuracy_control()` to quantize a model with a custom configuration. +``nncf.quantize_with_accuracy_control()`` API supports all the parameters of ``nncf.quantize()`` API. For example, you can use ``nncf.quantize_with_accuracy_control()`` to quantize a model with a custom configuration. -## See also +See also +#################### -* [Optimizing Models at Training Time](@ref tmo_introduction) +* :doc:`Optimizing Models at Training Time ` +@endsphinxdirective diff --git a/docs/optimization_guide/nncf/qat.md b/docs/optimization_guide/nncf/qat.md index 88c4cfa57730a2..0ddf086921002c 100644 --- a/docs/optimization_guide/nncf/qat.md +++ b/docs/optimization_guide/nncf/qat.md @@ -1,172 +1,201 @@ # Quantization-aware Training (QAT) {#qat_introduction} -## Introduction -Quantization-aware Training is a popular method that allows quantizing a model and applying fine-tuning to restore accuracy degradation caused by quantization. In fact, this is the most accurate quantization method. This document describes how to apply QAT from the Neural Network Compression Framework (NNCF) to get 8-bit quantized models. This assumes that you are knowledgeable in Python* programming and familiar with the training code for the model in the source DL framework. +@sphinxdirective -## Using NNCF QAT -Here, we provide the steps that are required to integrate QAT from NNCF into the training script written with PyTorch or TensorFlow 2: +Introduction +#################### -> **NOTE**: Currently, NNCF for TensorFlow 2 supports optimization of the models created using Keras [Sequesntial API](https://www.tensorflow.org/guide/keras/sequential_model) or [Functional API](https://www.tensorflow.org/guide/keras/functional). 
+Quantization-aware Training is a popular method that allows quantizing a model and applying fine-tuning to restore accuracy +degradation caused by quantization. In fact, this is the most accurate quantization method. This document describes how to +apply QAT from the Neural Network Compression Framework (NNCF) to get 8-bit quantized models. This assumes that you are +knowledgeable in Python programming and familiar with the training code for the model in the source DL framework. -### 1. Import NNCF API -In this step, you add NNCF-related imports in the beginning of the training script: - -@sphinxtabset - -@sphinxtab{PyTorch} - -@snippet docs/optimization_guide/nncf/code/qat_torch.py imports - -@endsphinxtab - -@sphinxtab{TensorFlow 2} +Using NNCF QAT +#################### -@snippet docs/optimization_guide/nncf/code/qat_tf.py imports +Here, we provide the steps that are required to integrate QAT from NNCF into the training script written with +PyTorch or TensorFlow 2: -@endsphinxtab +.. note:: + Currently, NNCF for TensorFlow 2 supports optimization of the models created using Keras + `Sequential API `__ or + `Functional API `__. -@endsphinxtabset +1. Import NNCF API +++++++++++++++++++++ -### 2. Create NNCF configuration -Here, you should define NNCF configuration which consists of model-related parameters (`"input_info"` section) and parameters of optimization methods (`"compression"` section). For faster convergence, it is also recommended to register a dataset object specific to the DL framework. It will be used at the model creation step to initialize quantization parameters. - -@sphinxtabset +In this step, you add NNCF-related imports in the beginning of the training script: -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py nncf_congig + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [imports] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. 
doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [imports] -@snippet docs/optimization_guide/nncf/code/qat_tf.py nncf_congig -@endsphinxtab +2. Create NNCF configuration +++++++++++++++++++++++++++++ -@endsphinxtabset +Here, you should define NNCF configuration which consists of model-related parameters (``"input_info"`` section) and parameters +of optimization methods (``"compression"`` section). For faster convergence, it is also recommended to register a dataset object +specific to the DL framework. It will be used at the model creation step to initialize quantization parameters. -### 3. Apply optimization methods -In the next step, you need to wrap the original model object with the `create_compressed_model()` API using the configuration defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the same way as the original model. It is worth noting that optimization methods are applied at this step so that the model undergoes a set of corresponding transformations and can contain additional operations required for the optimization. In the case of QAT, the compression controller object is used for model export and, optionally, in distributed training as it will be shown below. +.. tab:: PyTorch -@sphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [nncf_congig] -@sphinxtab{PyTorch} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/qat_torch.py wrap_model + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [nncf_congig] -@endsphinxtab -@sphinxtab{TensorFlow 2} +3. 
Apply optimization methods ++++++++++++++++++++++++++++++ -@snippet docs/optimization_guide/nncf/code/qat_tf.py wrap_model +In the next step, you need to wrap the original model object with the ``create_compressed_model()`` API using the configuration +defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the +same way as the original model. It is worth noting that optimization methods are applied at this step so that the model +undergoes a set of corresponding transformations and can contain additional operations required for the optimization. In +the case of QAT, the compression controller object is used for model export and, optionally, in distributed training as it +will be shown below. -@endsphinxtab +.. tab:: PyTorch -@endsphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [wrap_model] -### 4. Fine-tune the model -This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the case of QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, you can skip this step which means that the post-training optimization will be applied to the model. +.. tab:: TensorFlow 2 -@sphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [wrap_model] -@sphinxtab{PyTorch} -@snippet docs/optimization_guide/nncf/code/qat_torch.py tune_model +4. Fine-tune the model +++++++++++++++++++++++ -@endsphinxtab +This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the +case of QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, +you can skip this step which means that the post-training optimization will be applied to the model. -@sphinxtab{TensorFlow 2} +.. 
tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_tf.py tune_model + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [tune_model] -@endsphinxtab +.. tab:: TensorFlow 2 -@endsphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [tune_model] -### 5. Multi-GPU distributed training -In the case of distributed multi-GPU training (not DataParallel), you should call `compression_ctrl.distributed()` before the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@sphinxtabset -@sphinxtab{PyTorch} +5. Multi-GPU distributed training ++++++++++++++++++++++++++++++++++ -@snippet docs/optimization_guide/nncf/code/qat_torch.py distributed +In the case of distributed multi-GPU training (not DataParallel), you should call ``compression_ctrl.distributed()`` before +the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@endsphinxtab +.. tab:: PyTorch -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [distributed] -@snippet docs/optimization_guide/nncf/code/qat_tf.py distributed +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [distributed] -@endsphinxtabset +6. Export quantized model ++++++++++++++++++++++++++ -### 6. Export quantized model -When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in the case of PyTorch and frozen graph - for TensorFlow 2. +When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in +the case of PyTorch and frozen graph - for TensorFlow 2. -@sphinxtabset +.. tab:: PyTorch -@sphinxtab{PyTorch} + .. 
doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [export] -@snippet docs/optimization_guide/nncf/code/qat_torch.py export +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [export] -@sphinxtab{TensorFlow 2} -@snippet docs/optimization_guide/nncf/code/qat_tf.py export +.. note:: + The precision of weights gets INT8 only after the step of model conversion to OpenVINO Intermediate Representation. + You can expect the model footprint reduction only for that format. -@endsphinxtab -@endsphinxtabset +These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model +checkpoints during the training. Since NNCF wraps the original model with its own object, it provides an API for these needs. -> **NOTE**: The precision of weigths gets INT8 only after the step of model conversion to OpenVINO Intermediate Representation. You can expect the model footprint reduction only for that format. -These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. +7. (Optional) Save checkpoint ++++++++++++++++++++++++++++++ -### 7. (Optional) Save checkpoint To save model checkpoint use the following API: -@sphinxtabset - -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py save_checkpoint + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [save_checkpoint] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [save_checkpoint] -@snippet docs/optimization_guide/nncf/code/qat_tf.py save_checkpoint -@endsphinxtab +8.
(Optional) Restore from checkpoint ++++++++++++++++++++++++++++++++++++++ -@endsphinxtabset - -### 8. (Optional) Restore from checkpoint To restore the model from checkpoint you should use the following API: -@sphinxtabset +.. tab:: PyTorch + + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [load_checkpoint] -@sphinxtab{PyTorch} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/qat_torch.py load_checkpoint + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [load_checkpoint] -@endsphinxtab -@sphinxtab{TensorFlow 2} +For more details on saving/loading checkpoints in the NNCF, see the following `documentation <https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models>`__. -@snippet docs/optimization_guide/nncf/code/qat_tf.py load_checkpoint +Deploying quantized model +######################### -@endsphinxtab +The quantized model can be deployed with OpenVINO in the same way as the baseline model. No extra steps or options are +required in this case. For more details, see the corresponding :doc:`documentation `. -@endsphinxtabset +Examples +#################### -For more details on saving/loading checkpoints in the NNCF, see the following [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models). +* `Quantizing PyTorch model with NNCF <https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/302-pytorch-quantization-aware-training>`__ -## Deploying quantized model -The quantized model can be deployed with OpenVINO in the same way as the baseline model. No extra steps or options are required in this case. For more details, see the corresponding [documentation](../../OV_Runtime_UG/openvino_intro.md).
+* `Quantizing TensorFlow model with NNCF <https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/305-tensorflow-quantization-aware-training>`__ -## Examples -- [Quantizing PyTorch model with NNCF](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/302-pytorch-quantization-aware-training) -- [Quantizing TensorFlow model with NNCF](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/305-tensorflow-quantization-aware-training) \ No newline at end of file +@endsphinxdirective diff --git a/docs/resources/prerelease_information.md b/docs/resources/prerelease_information.md new file mode 100644 index 00000000000000..7d260e086b7b25 --- /dev/null +++ b/docs/resources/prerelease_information.md @@ -0,0 +1,32 @@ +# Prerelease Information {#prerelease_information} + +@sphinxdirective + +OpenVINO follows a four-month release cycle, which means three major releases a year, +the last one being an LTS version. To ensure you do not have to wait long to test its new features, +OpenVINO developers continue to roll out prerelease versions. On this page you can find +a general changelog and the schedule for all versions for the current year. + +.. note:: + These versions are pre-release software and have not undergone full validation or qualification. OpenVINO™ toolkit pre-release is: + + * NOT to be incorporated into production software/solutions. + * NOT subject to official support. + * Subject to change in the future. + * Introduced to allow early testing and get early feedback from the community. + + +..
dropdown:: OpenVINO Toolkit 2023.0.0.dev20230217 + :open: + :animate: fade-in-slide-down + :color: primary + + OpenVINO™ repository tag: `2023.0.0.dev20230217 `__ + + * Enabled PaddlePaddle Framework 2.4 + * Preview of TensorFlow Lite Front End – Load models directly via “read_model” into OpenVINO Runtime and export OpenVINO IR format using Model Optimizer or “convert_model” + * Introduced new option ov::auto::enable_startup_fallback / ENABLE_STARTUP_FALLBACK to control whether to use CPU to accelerate first inference latency for accelerator HW devices like GPU. + * New FrontEndManager register_front_end(name, lib_path) interface added, to remove “OV_FRONTEND_PATH” env var (a way to load non-default frontends). + + +@endsphinxdirective \ No newline at end of file diff --git a/docs/resources/resources.md b/docs/resources/resources.md index ad5d806c036296..2f6056a9b78908 100644 --- a/docs/resources/resources.md +++ b/docs/resources/resources.md @@ -9,6 +9,7 @@ openvino_docs_performance_benchmarks openvino_ir + prerelease_information .. toctree:: :maxdepth: 1 diff --git a/docs/resources/supported_models.md b/docs/resources/supported_models.md index 907f220e6650d0..455a4d543d56f6 100644 --- a/docs/resources/supported_models.md +++ b/docs/resources/supported_models.md @@ -1,13 +1,12 @@ # Supported Models {#openvino_supported_models} +@sphinxdirective The OpenVINO team continues the effort to support as many models out-of-the-box as possible. Based on our research and user feedback, we prioritize the most common models and test them before every release. These models are considered officially supported. -@sphinxdirective - -.. button-link:: _static/download/OV_2022_models_supported.pdf +.. button-link:: _static/download/OV_2023_models_supported.pdf :color: primary :outline: @@ -18,36 +17,33 @@ before every release. These models are considered officially supported. | If your model is not included but is similar to those that are, it is still very likely to work. 
If your model fails to execute properly there are a few options available: -@endsphinxdirective - -* If the model originates from a framework like TensorFlow or PyTorch, OpenVINO™ offers a hybrid solution. The original model can be run without explicit conversion into the OpenVINO format. For more information, see [OpenVINO TensorFlow Integration](https://docs.openvino.ai/latest/ovtf_integration.html). +* If the model originates from a framework like TensorFlow or PyTorch, OpenVINO™ offers a hybrid solution. The original model can be run without explicit conversion into the OpenVINO format. For more information, see :ref:`OpenVINO TensorFlow Integration `. * You can create a GitHub request for the operation(s) that are missing. These requests are reviewed regularly. You will be informed if and how the request will be accommodated. Additionally, your request may trigger a reply from someone in the community who can help. -* As OpenVINO™ is open source you can enhance it with your own contribution to the GitHub repository. To learn more, see the articles on [OpenVINO Extensibility](https://docs.openvino.ai/latest/openvino_docs_Extensibility_UG_Intro.html). +* As OpenVINO™ is open source you can enhance it with your own contribution to the GitHub repository. To learn more, see the articles on :ref:`OpenVINO Extensibility`. 
The following table summarizes the number of models supported by OpenVINO™ in different categories: -@sphinxdirective +--------------------------------------------+-------------------+ | Model Categories: | Number of Models: | +============================================+===================+ -| Object Detection | 149 | +| Object Detection | 149 | +--------------------------------------------+-------------------+ | Instance Segmentation | 3 | +--------------------------------------------+-------------------+ | Semantic Segmentation | 19 | +--------------------------------------------+-------------------+ -| Image Processing, Enhancement | 16 | +| Image Processing, Enhancement | 16 | +--------------------------------------------+-------------------+ -| Monodepth | 2 | +| Monodepth | 2 | +--------------------------------------------+-------------------+ -| Colorization | 2 | +| Colorization | 2 | +--------------------------------------------+-------------------+ -| Behavior / Decision Prediction | 1 | +| Behavior / Decision Prediction | 1 | +--------------------------------------------+-------------------+ -| Action Recognition | 2 | +| Action Recognition | 2 | +--------------------------------------------+-------------------+ -| Time Series Forecasting | 1 | +| Time Series Forecasting | 1 | +--------------------------------------------+-------------------+ | Image Classification | 68 | +--------------------------------------------+-------------------+ @@ -55,14 +51,15 @@ The following table summarizes the number of models supported by OpenVINO™ in +--------------------------------------------+-------------------+ | Image Classification, Emotion | 1 | +--------------------------------------------+-------------------+ -| Image Translation | 1 | +| Image Translation | 1 | +--------------------------------------------+-------------------+ -| Natural language Processing | 35 | +| Natural language Processing | 35 | 
+--------------------------------------------+-------------------+ -| Text Detection | 18 | +| Text Detection | 18 | +--------------------------------------------+-------------------+ -| Audio Enhancement | 3 | +| Audio Enhancement | 3 | +--------------------------------------------+-------------------+ -| Sound Classification | 2 | +| Sound Classification | 2 | +--------------------------------------------+-------------------+ + @endsphinxdirective \ No newline at end of file diff --git a/docs/snippets/CMakeLists.txt b/docs/snippets/CMakeLists.txt index 006870d71caafe..787b7da08aeec8 100644 --- a/docs/snippets/CMakeLists.txt +++ b/docs/snippets/CMakeLists.txt @@ -24,7 +24,9 @@ file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" if (NOT TARGET OpenCL::OpenCL) list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/gpu/context_sharing_va.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/gpu/context_sharing.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/gpu/preprocessing.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gpu/preprocessing_nv12_two_planes.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gpu/preprocessing_nv12_single_plane.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gpu/preprocessing_nv12_to_gray.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/gpu/queue_sharing.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/gpu/remote_objects_creation.cpp") endif() diff --git a/docs/snippets/gpu/preprocessing_nv12_single_plane.cpp b/docs/snippets/gpu/preprocessing_nv12_single_plane.cpp new file mode 100644 index 00000000000000..2b360ea08ce300 --- /dev/null +++ b/docs/snippets/gpu/preprocessing_nv12_single_plane.cpp @@ -0,0 +1,48 @@ +#include +#define OV_GPU_USE_OPENCL_HPP +#include +#include +#include + +ov::intel_gpu::ocl::ClImage2DTensor get_yuv_tensor(); + +int main() { + ov::Core core; + auto model = core.read_model("model.xml"); + + //! 
[init_preproc] + using namespace ov::preprocess; + auto p = PrePostProcessor(model); + p.input().tensor().set_element_type(ov::element::u8) + .set_color_format(ColorFormat::NV12_SINGLE_PLANE) + .set_memory_type(ov::intel_gpu::memory_type::surface); + p.input().preprocess().convert_color(ov::preprocess::ColorFormat::BGR); + p.input().model().set_layout("NCHW"); + auto model_with_preproc = p.build(); + //! [init_preproc] + + auto compiled_model = core.compile_model(model_with_preproc, "GPU"); + auto context = compiled_model.get_context().as(); + auto infer_request = compiled_model.create_infer_request(); + +{ + //! [single_batch] + auto input_yuv = model_with_preproc->input(0); + ov::intel_gpu::ocl::ClImage2DTensor yuv_tensor = get_yuv_tensor(); + infer_request.set_tensor(input_yuv.get_any_name(), yuv_tensor); + infer_request.infer(); + //! [single_batch] +} + +{ + auto yuv_tensor_0 = get_yuv_tensor(); + auto yuv_tensor_1 = get_yuv_tensor(); + //! [batched_case] + auto input_yuv = model_with_preproc->input(0); + std::vector yuv_tensors = {yuv_tensor_0, yuv_tensor_1}; + infer_request.set_tensors(input_yuv.get_any_name(), yuv_tensors); + infer_request.infer(); + //! [batched_case] +} + return 0; +} diff --git a/docs/snippets/gpu/preprocessing_nv12_to_gray.cpp b/docs/snippets/gpu/preprocessing_nv12_to_gray.cpp new file mode 100644 index 00000000000000..e61e4cdca64611 --- /dev/null +++ b/docs/snippets/gpu/preprocessing_nv12_to_gray.cpp @@ -0,0 +1,50 @@ +#define OV_GPU_USE_OPENCL_HPP +#include +#include +#include + +ov::intel_gpu::ocl::ClImage2DTensor get_y_tensor(); +ov::intel_gpu::ocl::ClImage2DTensor get_uv_tensor(); + +int main() { + ov::Core core; + auto model = core.read_model("model.xml"); + + //! 
[init_preproc] + using namespace ov::preprocess; + auto p = PrePostProcessor(model); + p.input().tensor().set_element_type(ov::element::u8) + .set_layout("NHWC") + .set_memory_type(ov::intel_gpu::memory_type::surface); + p.input().model().set_layout("NCHW"); + auto model_with_preproc = p.build(); + //! [init_preproc] + + auto compiled_model = core.compile_model(model_with_preproc, "GPU"); + auto remote_context = compiled_model.get_context().as(); + auto input = model->input(0); + auto infer_request = compiled_model.create_infer_request(); + +{ + //! [single_batch] + cl::Image2D img_y_plane; + auto input_y = model_with_preproc->input(0); + auto remote_y_tensor = remote_context.create_tensor(input_y.get_element_type(), input.get_shape(), img_y_plane); + infer_request.set_tensor(input_y.get_any_name(), remote_y_tensor); + infer_request.infer(); + //! [single_batch] +} + +{ + //! [batched_case] + cl::Image2D img_y_plane_0, img_y_plane_l; + auto input_y = model_with_preproc->input(0); + auto remote_y_tensor_0 = remote_context.create_tensor(input_y.get_element_type(), input.get_shape(), img_y_plane_0); + auto remote_y_tensor_1 = remote_context.create_tensor(input_y.get_element_type(), input.get_shape(), img_y_plane_l); + std::vector y_tensors = {remote_y_tensor_0, remote_y_tensor_1}; + infer_request.set_tensors(input_y.get_any_name(), y_tensors); + infer_request.infer(); + //! 
[batched_case] +} + return 0; +} diff --git a/docs/snippets/gpu/preprocessing.cpp b/docs/snippets/gpu/preprocessing_nv12_two_planes.cpp similarity index 100% rename from docs/snippets/gpu/preprocessing.cpp rename to docs/snippets/gpu/preprocessing_nv12_two_planes.cpp diff --git a/docs/snippets/gpu/preprocessing.py b/docs/snippets/gpu/preprocessing_nv12_two_planes.py similarity index 100% rename from docs/snippets/gpu/preprocessing.py rename to docs/snippets/gpu/preprocessing_nv12_two_planes.py diff --git a/install_build_dependencies.sh b/install_build_dependencies.sh index 7d76c26259c825..d7db483bc5fe6f 100755 --- a/install_build_dependencies.sh +++ b/install_build_dependencies.sh @@ -125,6 +125,46 @@ elif [ -f /etc/redhat-release ] || grep -q "rhel" /etc/os-release ; then `# samples and tools` \ zlib-devel \ gflags-devel +elif [ -f /etc/os-release ] && grep -q "SUSE" /etc/os-release ; then + zypper refresh + zypper install -y \ + file \ + `# build tools` \ + cmake \ + ccache \ + ninja \ + scons \ + gcc \ + gcc-c++ \ + make \ + `# to determine openvino version via git` \ + git \ + git-lfs \ + `# to build and check pip packages` \ + patchelf \ + fdupes \ + `# to build and check rpm packages` \ + rpm-build \ + rpmlint \ + `# check bash scripts for correctness` \ + ShellCheck \ + `# main openvino dependencies` \ + tbb-devel \ + pugixml-devel \ + `# GPU plugin dependency` \ + libva-devel \ + `# OpenCL for GPU` \ + ocl-icd-devel \ + opencl-cpp-headers \ + opencl-headers \ + `# python API` \ + python39-pip \ + python39-setuptools \ + python39-devel \ + `# samples and tools` \ + zlib-devel \ + gflags-devel-static \ + nlohmann_json-devel elif [ -f /etc/os-release ] && grep -q "raspbian" /etc/os-release; then # Raspbian apt update @@ -176,8 +216,10 @@ if [ ! 
"$(printf '%s\n' "$required_cmake_ver" "$current_cmake_ver" | sort -V | h if command -v apt-get &> /dev/null; then apt-get install -y --no-install-recommends wget - else + elif command -v yum &> /dev/null; then yum install -y wget + elif command -v zypper &> /dev/null; then + zypper in -y wget fi cmake_install_bin="cmake-${installed_cmake_ver}-linux-${arch}.sh" diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt index 72bfa5cfd34ed0..693c9b2df5e9a8 100644 --- a/samples/cpp/CMakeLists.txt +++ b/samples/cpp/CMakeLists.txt @@ -69,7 +69,7 @@ if(APPLE) set(CMAKE_MACOSX_RPATH ON) endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*)") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") set(AARCH64 ON) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") set(ARM ON) diff --git a/samples/cpp/benchmark_app/CMakeLists.txt b/samples/cpp/benchmark_app/CMakeLists.txt index 16d2bc8e53991e..6939dd118bd61b 100644 --- a/samples/cpp/benchmark_app/CMakeLists.txt +++ b/samples/cpp/benchmark_app/CMakeLists.txt @@ -26,6 +26,7 @@ if(NOT TARGET nlohmann_json::nlohmann_json) if(TARGET nlohmann_json) # Ubuntu 18.04 case where target 'nlohmann_json' is here, but nlohmann_json_FOUND is OFF if(NOT TARGET nlohmann_json::nlohmann_json) + set_target_properties(nlohmann_json PROPERTIES IMPORTED_GLOBAL ON) add_library(nlohmann_json::nlohmann_json ALIAS nlohmann_json) endif() set(nlohmann_json_FOUND ON) diff --git a/samples/cpp/speech_sample/CMakeLists.txt b/samples/cpp/speech_sample/CMakeLists.txt index caab61495d0495..2b99a9fe1367d2 100644 --- a/samples/cpp/speech_sample/CMakeLists.txt +++ b/samples/cpp/speech_sample/CMakeLists.txt @@ -15,8 +15,8 @@ endif() if(NOT TARGET zlib::zlib) if(PkgConfig_FOUND) pkg_search_module(zlib QUIET - IMPORTED_TARGET GLOBAL - zlib) + IMPORTED_TARGET GLOBAL + zlib) if(zlib_FOUND) add_library(zlib::zlib ALIAS PkgConfig::zlib) endif() diff --git 
a/scripts/install_dependencies/install_openvino_dependencies.sh b/scripts/install_dependencies/install_openvino_dependencies.sh index d8ee4b92660894..3bd4cfac116b41 100755 --- a/scripts/install_dependencies/install_openvino_dependencies.sh +++ b/scripts/install_dependencies/install_openvino_dependencies.sh @@ -95,6 +95,7 @@ if [ "$os" == "auto" ] ; then case $os in centos7|centos8|rhel8|rhel9.1|\ almalinux8.7|amzn2|\ + opensuse-leap15.3| \ fedora34|fedora35|fedora36|fedora37|fedora38|\ raspbian9|debian9|ubuntu18.04|\ raspbian10|debian10|ubuntu20.04|ubuntu20.10|ubuntu21.04|\ @@ -132,24 +133,20 @@ elif [ "$os" == "ubuntu20.04" ] || [ "$os" == "debian10" ] || [ "$os" == "raspbi [ "$os" == "ubuntu21.10" ] || [ "$os" == "ubuntu22.04" ] || [ "$os" == "debian11" ] || [ "$os" == "raspbian11" ] || [ "$os" == "ubuntu22.10" ] || [ "$os" == "debian12" ] || [ "$os" == "raspbian12" ]; then - pkgs_core=(libpugixml1v5) + pkgs_core=(libpugixml1v5 libtbb2) pkgs_gpu=() pkgs_python=(python3 python3-venv python3-pip) pkgs_dev=(cmake pkg-config g++ gcc libc6-dev libgflags-dev zlib1g-dev nlohmann-json3-dev make curl sudo) if [ "$os" == "debian10" ] || [ "$os" == "raspbian10" ] ; then - pkgs_core=("${pkgs_core[@]}" libtbb2) pkgs_python=("${pkgs_python[@]}" libpython3.7) elif [ "$os" == "ubuntu20.04" ] || [ "$os" == "ubuntu20.10" ] || [ "$os" == "ubuntu21.04" ] ; then - pkgs_core=("${pkgs_core[@]}" libtbb2) pkgs_python=("${pkgs_python[@]}" libpython3.8) elif [ "$os" == "ubuntu21.10" ] || [ "$os" == "debian11" ] || [ "$os" == "raspbian11" ] ; then - pkgs_core=("${pkgs_core[@]}" libtbb2) pkgs_python=("${pkgs_python[@]}" libpython3.9) elif [ "$os" == "ubuntu22.04" ] || [ "$os" == "ubuntu22.10" ] || [ "$os" == "debian12" ] || [ "$os" == "raspbian12" ] ; then - pkgs_core=("${pkgs_core[@]}" libtbb12) pkgs_python=("${pkgs_python[@]}" libpython3.10) fi @@ -216,6 +213,11 @@ elif [ "$os" == "centos7" ] || [ "$os" == "centos8" ] || 
pkgs_dev+=("https://download-ib01.fedoraproject.org/pub/epel/9/Everything/$arch/Packages/g/gflags-devel-2.2.2-9.el9.$arch.rpm") extra_repos+=("https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm") fi +elif [ "$os" == "opensuse-leap15.3" ] ; then + pkgs_core=(libtbb2 libtbbmalloc2 libpugixml1) + pkgs_gpu=() + pkgs_python=(python39-base python39 python39-venv python39-pip) + pkgs_dev=(cmake pkg-config gcc-c++ gcc gflags-devel-static zlib-devel nlohmann_json-devel make curl sudo) else echo "Internal script error: invalid OS (${os}) after check (package selection)" >&2 exit 3 @@ -280,6 +282,14 @@ elif [ "$os" == "centos7" ] || [ "$os" == "centos8" ] || yum install "$iopt" "${pkgs[@]}" +elif [ "$os" == "opensuse-leap15.3" ] ; then + + [ -z "$interactive" ] && iopt="-y" + [ -n "$dry" ] && iopt="--dry-run" + [ -n "$keepcache" ] && zypper clean --all + + zypper ref && zypper in --auto-agree-with-licenses --no-recommends "$iopt" "${pkgs[@]}" + else echo "Internal script error: invalid OS (${os}) after check (package installation)" >&2 exit 3 diff --git a/src/bindings/python/src/openvino/frontend/pytorch/decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/decoder.py index 2ceee453636dd0..e65f7ba27dff16 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/decoder.py @@ -10,6 +10,7 @@ import warnings import torch +import numpy as np def get_type_from_py_type(value): @@ -36,18 +37,17 @@ def ivalue_to_constant(ivalue): assert ov_type.is_static(), "Can't deduce type for list" return op.Constant(ov_type, Shape([len(ivalue)]), ivalue).outputs() - if isinstance(ivalue, torch.Tensor) and ivalue.type() in pt_to_ov_type_map: - try: - ovshape = PartialShape(ivalue.size()) - ovtype = pt_to_ov_type_map[ivalue.type()] - ov_const = op.Constant(ovtype, ovshape.get_shape(), ivalue.data_ptr()) - except Exception: - # old variant that makes a slow data copying - warnings.warn("[ WARNING ] 
Constant wasn't able to convert from data_ptr.") - nvalues = ivalue.numpy() - ovtype = np_to_ov_type_map[str(nvalues.dtype)] - ovshape = PartialShape(nvalues.shape) - ov_const = op.Constant(ovtype, ovshape.get_shape(), nvalues.flatten().tolist()) + if isinstance(ivalue, torch.Tensor): + if ivalue.ndim == 0: + assert str(ivalue.dtype()) in pt_to_ov_type_map, f"Type is not known {ivalue.dtype()}" + ov_type = pt_to_ov_type_map[str(ivalue.dtype)] + ov_const = op.Constant(ov_type, Shape([]), [ivalue.item()]) + else: + ivalue = ivalue.to(memory_format=torch.contiguous_format) + narr = ivalue.numpy(force=True) + if not narr.flags['C_CONTIGUOUS']: + narr = np.ascontiguousarray(narr) + ov_const = op.Constant(narr, shared_memory=True) return ov_const.outputs() return None @@ -89,11 +89,6 @@ def get_value_from_getattr(getattr_node, self_module): "torch.BoolTensor": OVType.boolean, } -np_to_ov_type_map = { - "float32": OVType.f32, - "int32": OVType.i32, -} - class TorchScriptPythonDecoder (Decoder): def __init__(self, pt_module, graph_element=None): @@ -265,29 +260,20 @@ def as_string(self): def _as_constant_tensor(pt_value: torch.Value): ivalue = pt_value.toIValue() if pt_value.isCompleteTensor(): - try: - ivalue = ivalue.to(memory_format=torch.contiguous_format).detach().cpu() - except Exception: - warnings.warn("[ WARNING ] Tensor couldn't detach") - if str(pt_value.type().dtype()) in pt_to_ov_type_map: + if ivalue.ndim == 0: + assert str(ivalue.dtype) in pt_to_ov_type_map, f"Type is not known {ivalue.dtype}" + ov_type = pt_to_ov_type_map[str(ivalue.dtype)] + ov_const = op.Constant(ov_type, Shape([]), [ivalue.item()]) + else: + ivalue = ivalue.to(memory_format=torch.contiguous_format) + narr = ivalue.numpy(force=True) + if not narr.flags['C_CONTIGUOUS']: + narr = np.ascontiguousarray(narr) # Constant interpretation doesn't respect new-full type of PT # It recognizes only tensors, and give lists as 1D tensors, and scalars as Tensor scalars # So only tensor-type constants are 
supported - ovshape = PartialShape(pt_value.type().sizes()) - ovtype = pt_to_ov_type_map[str(pt_value.type().dtype())] - - # TODO: try-except here is a temporary WA for issues with data_ptr that we currently cannot predict; provide better solution - try: - # this is only possible with adding a new ctor for Constant Python binding - # TODO Check strides and pass them somehow - values = ivalue.data_ptr() - ov_const = op.Constant(ovtype, ovshape.get_shape(), values) - except Exception: - # old variant that makes a slow data copying - warnings.warn("[ WARNING ] Constant wasn't able to convert from data_ptr.") - values = ivalue.flatten().tolist() - ov_const = op.Constant(ovtype, ovshape.get_shape(), values) - return ov_const.outputs() + ov_const = op.Constant(narr, shared_memory=True) + return ov_const.outputs() else: return ivalue_to_constant(ivalue) return None diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py index 914f34d2480303..95490cbf98acb4 100644 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py @@ -212,7 +212,7 @@ def update_tensor( key: Optional[ValidKeys] = None, ) -> None: if hasattr(inputs, "__array__"): - update_tensor(normalize_arrays(inputs, is_shared=False), request, key=None) + update_tensor(normalize_arrays(inputs, is_shared=False), request, key) return None raise TypeError(f"Incompatible inputs of type: {type(inputs)} under {key} key!") diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index 2c7ec5653571ae..434433cb28dc6c 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -6,6 +6,7 @@ #include +#include "Python.h" #include "openvino/util/common_util.hpp" #define 
C_CONTIGUOUS py::detail::npy_api::constants::NPY_ARRAY_C_CONTIGUOUS_ @@ -51,78 +52,22 @@ const std::map& dtype_to_ov_type() { return dtype_to_ov_type_mapping; } -ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& type) { - bool is_contiguous = C_CONTIGUOUS == (array.flags() & C_CONTIGUOUS); - auto element_type = (type == ov::element::undefined) ? Common::dtype_to_ov_type().at(py::str(array.dtype())) : type; +namespace array_helpers { - if (is_contiguous) { - return ov::Tensor(element_type, shape, const_cast(array.data(0)), {}); - } else { - throw ov::Exception("Tensor with shared memory must be C contiguous!"); - } +bool is_contiguous(const py::array& array) { + return C_CONTIGUOUS == (array.flags() & C_CONTIGUOUS); } -ov::Tensor tensor_from_numpy(py::array& array, bool shared_memory) { - // Check if passed array has C-style contiguous memory layout. - bool is_contiguous = C_CONTIGUOUS == (array.flags() & C_CONTIGUOUS); - auto type = Common::dtype_to_ov_type().at(py::str(array.dtype())); - std::vector shape(array.shape(), array.shape() + array.ndim()); +ov::element::Type get_ov_type(const py::array& array) { + return Common::dtype_to_ov_type().at(py::str(array.dtype())); +} - // If memory is going to be shared it needs to be contiguous before - // passing to the constructor. This case should be handled by advanced - // users on their side of the code. - if (shared_memory) { - if (is_contiguous) { - std::vector strides(array.strides(), array.strides() + array.ndim()); - return ov::Tensor(type, shape, const_cast(array.data(0)), strides); - } else { - throw ov::Exception("Tensor with shared memory must be C contiguous!"); - } - } - // Convert to contiguous array if not already C-style. - if (!is_contiguous) { - array = Common::as_contiguous(array, type); - } - // Create actual Tensor and copy data. - auto tensor = ov::Tensor(type, shape); - // If ndim of py::array is 0, array is a numpy scalar. 
That results in size to be equal to 0. - // To gain access to actual raw/low-level data, it is needed to use buffer protocol. - py::buffer_info buf = array.request(); - std::memcpy(tensor.data(), buf.ptr, buf.ndim == 0 ? buf.itemsize : buf.itemsize * buf.size); - return tensor; +std::vector get_shape(const py::array& array) { + return std::vector(array.shape(), array.shape() + array.ndim()); } -ov::PartialShape partial_shape_from_list(const py::list& shape) { - using value_type = ov::Dimension::value_type; - ov::PartialShape pshape; - for (py::handle dim : shape) { - if (py::isinstance(dim)) { - pshape.insert(pshape.end(), ov::Dimension(dim.cast())); - } else if (py::isinstance(dim)) { - pshape.insert(pshape.end(), ov::Dimension(dim.cast())); - } else if (py::isinstance(dim)) { - pshape.insert(pshape.end(), dim.cast()); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - py::list bounded_dim = dim.cast(); - if (bounded_dim.size() != 2) { - throw py::type_error("Two elements are expected in tuple(lower, upper) for dynamic dimension, but " + - std::to_string(bounded_dim.size()) + " elements were given."); - } - if (!(py::isinstance(bounded_dim[0]) && py::isinstance(bounded_dim[1]))) { - throw py::type_error("Incorrect pair of types (" + std::string(bounded_dim[0].get_type().str()) + ", " + - std::string(bounded_dim[1].get_type().str()) + - ") for dynamic dimension, ints are expected."); - } - pshape.insert(pshape.end(), - ov::Dimension(bounded_dim[0].cast(), bounded_dim[1].cast())); - } else { - throw py::type_error("Incorrect type " + std::string(dim.get_type().str()) + - " for dimension. 
Expected types are: " - "int, str, openvino.runtime.Dimension, list/tuple with lower and upper values for " - "dynamic dimension."); - } - } - return pshape; +std::vector get_strides(const py::array& array) { + return std::vector(array.strides(), array.strides() + array.ndim()); } py::array as_contiguous(py::array& array, ov::element::Type type) { @@ -165,6 +110,120 @@ py::array as_contiguous(py::array& array, ov::element::Type type) { } } +}; // namespace array_helpers + +template <> +ov::op::v0::Constant create_copied(py::array& array) { + // Convert to contiguous array if not already in C-style. + if (!array_helpers::is_contiguous(array)) { + array = array_helpers::as_contiguous(array, array_helpers::get_ov_type(array)); + } + // Create actual Constant and a constructor is copying data. + return ov::op::v0::Constant(array_helpers::get_ov_type(array), + array_helpers::get_shape(array), + const_cast(array.data(0))); +} + +template <> +ov::op::v0::Constant create_copied(ov::Tensor& tensor) { + // Create actual Constant and a constructor is copying data. + return ov::op::v0::Constant(tensor.get_element_type(), tensor.get_shape(), const_cast(tensor.data())); +} + +template <> +ov::op::v0::Constant create_shared(py::array& array) { + // Check if passed array has C-style contiguous memory layout. + // If memory is going to be shared it needs to be contiguous before passing to the constructor. + if (array_helpers::is_contiguous(array)) { + auto memory = + std::make_shared>(static_cast(array.mutable_data(0)), + array.nbytes(), + array); + return ov::op::v0::Constant(array_helpers::get_ov_type(array), array_helpers::get_shape(array), memory); + } + // If passed array is not C-style, throw an error. + throw ov::Exception( + "SHARED MEMORY MODE FOR THIS CONSTANT IS NOT APPLICABLE! 
Passed numpy array must be C contiguous."); +} + +template <> +ov::op::v0::Constant create_shared(ov::Tensor& tensor) { + return ov::op::v0::Constant(tensor); +} + +template <> +ov::Tensor create_copied(py::array& array) { + // Convert to contiguous array if not already in C-style. + if (!array_helpers::is_contiguous(array)) { + array = array_helpers::as_contiguous(array, array_helpers::get_ov_type(array)); + } + // Create actual Tensor and copy data. + auto tensor = ov::Tensor(array_helpers::get_ov_type(array), array_helpers::get_shape(array)); + // If ndim of py::array is 0, array is a numpy scalar. That results in size to be equal to 0. + // To gain access to actual raw/low-level data, it is needed to use buffer protocol. + py::buffer_info buf = array.request(); + std::memcpy(tensor.data(), buf.ptr, buf.ndim == 0 ? buf.itemsize : buf.itemsize * buf.size); + return tensor; +} + +template <> +ov::Tensor create_shared(py::array& array) { + // Check if passed array has C-style contiguous memory layout. + // If memory is going to be shared it needs to be contiguous before passing to the constructor. + if (array_helpers::is_contiguous(array)) { + return ov::Tensor(array_helpers::get_ov_type(array), + array_helpers::get_shape(array), + const_cast(array.data(0)), + array_helpers::get_strides(array)); + } + // If passed array is not C-style, throw an error. + throw ov::Exception( + "SHARED MEMORY MODE FOR THIS TENSOR IS NOT APPLICABLE! Passed numpy array must be C contiguous."); +} + +ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& type) { + auto element_type = (type == ov::element::undefined) ? Common::dtype_to_ov_type().at(py::str(array.dtype())) : type; + + if (array_helpers::is_contiguous(array)) { + return ov::Tensor(element_type, shape, const_cast(array.data(0)), {}); + } + throw ov::Exception( + "SHARED MEMORY MODE FOR THIS TENSOR IS NOT APPLICABLE! 
Passed numpy array must be C contiguous."); +} + +ov::PartialShape partial_shape_from_list(const py::list& shape) { + using value_type = ov::Dimension::value_type; + ov::PartialShape pshape; + for (py::handle dim : shape) { + if (py::isinstance(dim)) { + pshape.insert(pshape.end(), ov::Dimension(dim.cast())); + } else if (py::isinstance(dim)) { + pshape.insert(pshape.end(), ov::Dimension(dim.cast())); + } else if (py::isinstance(dim)) { + pshape.insert(pshape.end(), dim.cast()); + } else if (py::isinstance(dim) || py::isinstance(dim)) { + py::list bounded_dim = dim.cast(); + if (bounded_dim.size() != 2) { + throw py::type_error("Two elements are expected in tuple(lower, upper) for dynamic dimension, but " + + std::to_string(bounded_dim.size()) + " elements were given."); + } + if (!(py::isinstance(bounded_dim[0]) && py::isinstance(bounded_dim[1]))) { + throw py::type_error("Incorrect pair of types (" + std::string(bounded_dim[0].get_type().str()) + ", " + + std::string(bounded_dim[1].get_type().str()) + + ") for dynamic dimension, ints are expected."); + } + pshape.insert(pshape.end(), + ov::Dimension(bounded_dim[0].cast(), bounded_dim[1].cast())); + } else { + throw py::type_error("Incorrect type " + std::string(dim.get_type().str()) + + " for dimension. 
Expected types are: " + "int, str, openvino.runtime.Dimension, list/tuple with lower and upper values for " + "dynamic dimension."); + } + } + return pshape; +} + const ov::Tensor& cast_to_tensor(const py::handle& tensor) { return tensor.cast(); } diff --git a/src/bindings/python/src/pyopenvino/core/common.hpp b/src/bindings/python/src/pyopenvino/core/common.hpp index 9d363ded0fe494..910d9e55e966ed 100644 --- a/src/bindings/python/src/pyopenvino/core/common.hpp +++ b/src/bindings/python/src/pyopenvino/core/common.hpp @@ -10,8 +10,10 @@ #include #include +#include #include #include +#include #include "Python.h" #include "openvino/runtime/compiled_model.hpp" @@ -20,22 +22,62 @@ #include "openvino/pass/serialize.hpp" #include "pyopenvino/core/containers.hpp" #include "pyopenvino/graph/any.hpp" +#include "pyopenvino/graph/ops/constant.hpp" namespace py = pybind11; namespace Common { + +namespace values { + +// Minimum amount of bits for common numpy types. Used to perform checks against OV types. 
+constexpr size_t min_bitwidth = sizeof(int8_t) * CHAR_BIT; + +}; // namespace values + const std::map& ov_type_to_dtype(); const std::map& dtype_to_ov_type(); -ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& ov_type); +// Helpers for numpy arrays +namespace array_helpers { -ov::Tensor tensor_from_numpy(py::array& array, bool shared_memory); +bool is_contiguous(const py::array& array); -ov::PartialShape partial_shape_from_list(const py::list& shape); +ov::element::Type get_ov_type(const py::array& array); + +std::vector get_shape(const py::array& array); + +std::vector get_strides(const py::array& array); py::array as_contiguous(py::array& array, ov::element::Type type); +}; // namespace array_helpers + +template +T create_copied(py::array& array); + +template +T create_copied(ov::Tensor& array); + +template +T create_shared(py::array& array); + +template +T create_shared(ov::Tensor& array); + +template +T object_from_data(D& data, bool shared_memory) { + if (shared_memory) { + return create_shared(data); + } + return create_copied(data); +} + +ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& ov_type); + +ov::PartialShape partial_shape_from_list(const py::list& shape); + const ov::Tensor& cast_to_tensor(const py::handle& tensor); const Containers::TensorNameMap cast_to_tensor_name_map(const py::dict& inputs); diff --git a/src/bindings/python/src/pyopenvino/core/tensor.cpp b/src/bindings/python/src/pyopenvino/core/tensor.cpp index 397b5d4a73879b..8bdc90c58a9265 100644 --- a/src/bindings/python/src/pyopenvino/core/tensor.cpp +++ b/src/bindings/python/src/pyopenvino/core/tensor.cpp @@ -17,7 +17,7 @@ void regclass_Tensor(py::module m) { cls.doc() = "openvino.runtime.Tensor holding either copy of memory or shared host memory."; cls.def(py::init([](py::array& array, bool shared_memory) { - return Common::tensor_from_numpy(array, shared_memory); + return 
Common::object_from_data(array, shared_memory); }), py::arg("array"), py::arg("shared_memory") = false, @@ -209,7 +209,7 @@ void regclass_Tensor(py::module m) { [](ov::Tensor& self) { auto ov_type = self.get_element_type(); auto dtype = Common::ov_type_to_dtype().at(ov_type); - if (ov_type.bitwidth() < 8) { + if (ov_type.bitwidth() < Common::values::min_bitwidth) { return py::array(dtype, self.get_byte_size(), self.data(), py::cast(self)); } return py::array(dtype, self.get_shape(), self.get_strides(), self.data(), py::cast(self)); diff --git a/src/bindings/python/src/pyopenvino/frontend/input_model.cpp b/src/bindings/python/src/pyopenvino/frontend/input_model.cpp index 6069ceef2371cb..8e47b02bb7508a 100644 --- a/src/bindings/python/src/pyopenvino/frontend/input_model.cpp +++ b/src/bindings/python/src/pyopenvino/frontend/input_model.cpp @@ -310,7 +310,7 @@ void regclass_frontend_InputModel(py::module m) { "set_tensor_value", [](ov::frontend::InputModel& self, const ov::frontend::Place::Ptr& place, py::array& value) { // Convert to contiguous array if not already C-style. 
- auto tensor = Common::tensor_from_numpy(value, false); + auto tensor = Common::object_from_data(value, false); self.set_tensor_value(place, (const void*)tensor.data()); }, py::arg("place"), diff --git a/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp b/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp index 2e558242351b23..edb8f3bca816f1 100644 --- a/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp +++ b/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "openvino/op/constant.hpp" +#include "pyopenvino/graph/ops/constant.hpp" #include #include @@ -10,10 +10,10 @@ #include #include -#include #include "openvino/core/shape.hpp" -#include "pyopenvino/graph/ops/constant.hpp" +#include "openvino/runtime/tensor.hpp" +#include "pyopenvino/core/common.hpp" namespace py = pybind11; @@ -27,6 +27,38 @@ std::vector _get_byte_strides(const ov::Shape& s) { return byte_strides; } +std::vector _get_strides(const ov::op::v0::Constant& self) { + auto element_type = self.get_element_type(); + auto shape = self.get_shape(); + if (element_type == ov::element::boolean) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::f16) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::f32) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::f64) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i8) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i16) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i32) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i64) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u8 || element_type == ov::element::u1) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u16) { + return _get_byte_strides(shape); + } 
else if (element_type == ov::element::u32) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u64) { + return _get_byte_strides(shape); + } else { + throw std::runtime_error("Unsupported data type!"); + } +} + template py::buffer_info _get_buffer_info(const ov::op::v0::Constant& c) { ov::Shape shape = c.get_shape(); @@ -68,6 +100,18 @@ void regclass_graph_op_Constant(py::module m) { "Constant", py::buffer_protocol()); constant.doc() = "openvino.runtime.op.Constant wraps ov::op::v0::Constant"; + // Numpy-based constructor + constant.def(py::init([](py::array& array, bool shared_memory) { + return Common::object_from_data(array, shared_memory); + }), + py::arg("array"), + py::arg("shared_memory") = false); + // Tensor-based constructors + constant.def(py::init([](ov::Tensor& tensor, bool shared_memory) { + return Common::object_from_data(tensor, shared_memory); + }), + py::arg("tensor"), + py::arg("shared_memory") = false); constant.def(py::init&>()); constant.def(py::init&>()); constant.def(py::init&>()); @@ -80,12 +124,6 @@ void regclass_graph_op_Constant(py::module m) { constant.def(py::init&>()); constant.def(py::init&>()); constant.def(py::init&>()); - constant.def(py::init([](const ov::element::Type& et, const ov::Shape& sh, int64_t p) { - // restore pointer from integer - // TODO: Align on bit width - void* pp = reinterpret_cast(p); - return std::make_shared(et, sh, pp); - })); constant.def("get_value_strings", &ov::op::v0::Constant::get_value_strings); @@ -151,4 +189,26 @@ void regclass_graph_op_Constant(py::module m) { throw std::runtime_error("Unsupported data type!"); } }); + + constant.def_property_readonly( + "data", + [](ov::op::v0::Constant& self) { + auto ov_type = self.get_element_type(); + auto dtype = Common::ov_type_to_dtype().at(ov_type); + if (ov_type.bitwidth() < Common::values::min_bitwidth) { + return py::array(dtype, self.get_byte_size(), self.get_data_ptr(), py::cast(self)); + } + return py::array(dtype, 
self.get_shape(), _get_strides(self), self.get_data_ptr(), py::cast(self)); + }, + R"( + Access to Constant's data. + + Returns numpy array with corresponding shape and dtype. + For Constants with openvino specific element type, such as u1, + it returns linear array, with uint8 / int8 numpy dtype. + + Note: this access method reflects shared memory if it was applied during initialization. + + :rtype: numpy.array + )"); } diff --git a/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp b/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp index 5b175e8c09d682..cb7d457b1296ad 100644 --- a/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp +++ b/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp @@ -4,8 +4,14 @@ #pragma once +#include + #include +#include "openvino/op/constant.hpp" + namespace py = pybind11; +std::vector _get_strides(const ov::op::v0::Constant& self); + void regclass_graph_op_Constant(py::module m); diff --git a/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp b/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp index 335f8e8c530989..31aec4a66a4297 100644 --- a/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp +++ b/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp @@ -322,7 +322,7 @@ static void regclass_graph_InputTensorInfo(py::module m) { "set_from", [](ov::preprocess::InputTensorInfo& self, py::array& numpy_array) { // Convert to contiguous array if not already C-style. 
- return &self.set_from(Common::tensor_from_numpy(numpy_array, false)); + return &self.set_from(Common::object_from_data(numpy_array, false)); }, py::arg("runtime_tensor"), R"( diff --git a/src/bindings/python/src/pyopenvino/graph/util.cpp b/src/bindings/python/src/pyopenvino/graph/util.cpp index a5f2c473972a3e..35ea9003c70eb1 100644 --- a/src/bindings/python/src/pyopenvino/graph/util.cpp +++ b/src/bindings/python/src/pyopenvino/graph/util.cpp @@ -6,9 +6,12 @@ #include +#include + #include "openvino/core/graph_util.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/pass/manager.hpp" +#include "pyopenvino/graph/ops/constant.hpp" #include "pyopenvino/utils/utils.hpp" namespace py = pybind11; diff --git a/src/bindings/python/tests/test_graph/test_manager.py b/src/bindings/python/tests/test_graph/test_manager.py index a9b76538fa1a38..dad03fecaeaa2c 100644 --- a/src/bindings/python/tests/test_graph/test_manager.py +++ b/src/bindings/python/tests/test_graph/test_manager.py @@ -8,17 +8,26 @@ import numpy as np import pytest -import openvino.runtime.opset8 as ov -from openvino.runtime import Model +import openvino.runtime.opset10 as ops +from openvino.runtime import Core, Model from openvino.runtime.passes import Manager, Serialize, ConstantFolding, Version from tests.test_graph.util import count_ops_of_type -from openvino.runtime import Core from tests.test_utils.test_utils import create_filename_for_test +def create_model(): + shape = [100, 100, 2] + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") + parameter_c = ops.parameter(shape, dtype=np.float32, name="C") + model = ops.floor(ops.minimum(ops.abs(parameter_a), ops.multiply(parameter_b, parameter_c))) + func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + return func + + def test_constant_folding(): - node_constant = ov.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) - 
node_ceil = ov.ceiling(node_constant) + node_constant = ops.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) + node_ceil = ops.ceiling(node_constant) model = Model(node_ceil, [], "TestFunction") assert count_ops_of_type(model, node_ceil) == 1 @@ -43,9 +52,9 @@ def test_serialize_seperate_paths_kwargs(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) shape = [2, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") + parameter_c = ops.parameter(shape, dtype=np.float32, name="C") model = (parameter_a + parameter_b) * parameter_c func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") @@ -67,10 +76,10 @@ def test_serialize_seperate_paths_args(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) shape = [2, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") + parameter_c = ops.parameter(shape, dtype=np.float32, name="C") + parameter_d = ops.parameter(shape, dtype=np.float32, name="D") model = ((parameter_a + parameter_b) * parameter_c) / parameter_d func = Model(model, [parameter_a, parameter_b, parameter_c, parameter_d], "Model") @@ -92,8 +101,8 @@ def test_serialize_pass_mixed_args_kwargs(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) shape = [3, 2] - 
parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") model = parameter_a - parameter_b func = Model(model, [parameter_a, parameter_b], "Model") @@ -114,20 +123,15 @@ def test_serialize_pass_mixed_args_kwargs(request, tmp_path): def test_serialize_pass_mixed_args_kwargs_v2(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + model = create_model() pass_manager = Manager() pass_manager.register_pass(Serialize(path_to_xml=xml_path, path_to_bin=bin_path)) - pass_manager.run_passes(func) + pass_manager.run_passes(model) res_model = core.read_model(model=xml_path, weights=bin_path) - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() + assert model.get_parameters() == res_model.get_parameters() + assert model.get_ordered_ops() == res_model.get_ordered_ops() os.remove(xml_path) os.remove(bin_path) @@ -146,8 +150,8 @@ def test_serialize_pass_wrong_num_of_args(request, tmp_path): # request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request def test_serialize_results(request, tmp_path): core = Core() - node_constant = ov.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) - node_ceil = ov.ceiling(node_constant) + node_constant = ops.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) + node_ceil = 
ops.ceiling(node_constant) func = Model(node_ceil, [], "Model") xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) @@ -165,73 +169,19 @@ def test_serialize_results(request, tmp_path): os.remove(bin_path) -# request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request -def test_serialize_pass_tuple(request, tmp_path): - core = Core() - xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") - pass_manager = Manager() - pass_manager.register_pass("Serialize", output_files=(xml_path, bin_path)) - pass_manager.run_passes(func) - - res_model = core.read_model(model=xml_path, weights=bin_path) - - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() - - os.remove(xml_path) - os.remove(bin_path) - - # request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request def test_default_version(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + model = create_model() pass_manager = Manager() - 
pass_manager.register_pass("Serialize", output_files=(xml_path, bin_path)) - pass_manager.run_passes(func) - - res_model = core.read_model(model=xml_path, weights=bin_path) - - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() - - os.remove(xml_path) - os.remove(bin_path) - - -# request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request -def test_default_version_IR_V11_tuple(request, tmp_path): - core = Core() - xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") - pass_manager = Manager() - pass_manager.register_pass("Serialize", output_files=(xml_path, bin_path), version="IR_V11") - pass_manager.run_passes(func) + pass_manager.register_pass(Serialize(xml_path, bin_path)) + pass_manager.run_passes(model) res_model = core.read_model(model=xml_path, weights=bin_path) - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() + assert model.get_parameters() == res_model.get_parameters() + assert model.get_ordered_ops() == res_model.get_ordered_ops() os.remove(xml_path) os.remove(bin_path) @@ -241,21 +191,15 @@ def test_default_version_IR_V11_tuple(request, tmp_path): def test_default_version_IR_V11_seperate_paths(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, 
name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + model = create_model() pass_manager = Manager() pass_manager.register_pass(Serialize(path_to_xml=xml_path, path_to_bin=bin_path, version=Version.IR_V11)) - pass_manager.run_passes(func) + pass_manager.run_passes(model) res_model = core.read_model(model=xml_path, weights=bin_path) - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() + assert model.get_parameters() == res_model.get_parameters() + assert model.get_ordered_ops() == res_model.get_ordered_ops() os.remove(xml_path) os.remove(bin_path) diff --git a/src/bindings/python/tests/test_runtime/test_infer_request.py b/src/bindings/python/tests/test_runtime/test_infer_request.py index 5d9db0a461a456..b64623a4ad7c8d 100644 --- a/src/bindings/python/tests/test_runtime/test_infer_request.py +++ b/src/bindings/python/tests/test_runtime/test_infer_request.py @@ -925,6 +925,7 @@ def __array__(self): request, _, input_data = abs_model_with_data(device, Type.f32, np.single) model_input_object = ArrayLikeObject(input_data.tolist()) model_input_list = [ArrayLikeObject(input_data.tolist())] + model_input_dict = {0: ArrayLikeObject(input_data.tolist())} # Test single array-like object in InferRequest().Infer() res_object = request.infer(model_input_object, shared_memory=shared_flag) @@ -934,6 +935,10 @@ def __array__(self): res_list = request.infer(model_input_list) assert np.array_equal(res_list[request.model_outputs[0]], np.abs(input_data)) + # Test dict of array-like objects to use normalize_inputs() + res_dict = request.infer(model_input_dict) + assert np.array_equal(res_dict[request.model_outputs[0]], np.abs(input_data)) + 
@pytest.mark.parametrize("shared_flag", [True, False]) def test_array_like_input_async(device, shared_flag): diff --git a/src/bindings/python/tests/test_runtime/test_memory_modes.py b/src/bindings/python/tests/test_runtime/test_memory_modes.py new file mode 100644 index 00000000000000..ccbd44efa729bb --- /dev/null +++ b/src/bindings/python/tests/test_runtime/test_memory_modes.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest + +import openvino.runtime as ov +from openvino.runtime import Tensor +from openvino.runtime.op import Constant + +from tests.test_utils.test_utils import generate_image + + +@pytest.mark.parametrize(("cls", "cls_str"), [ + (Tensor, "TENSOR"), + (Constant, "CONSTANT"), +]) +def test_init_with_numpy_fail(cls, cls_str): + arr = np.asfortranarray(generate_image()) # F-style array + + with pytest.raises(RuntimeError) as e: + _ = cls(array=arr, shared_memory=True) + + assert "SHARED MEMORY MODE FOR THIS " + cls_str + " IS NOT APPLICABLE!" 
in str(e.value) + + +@pytest.mark.parametrize("cls", [Tensor, Constant]) +@pytest.mark.parametrize("shared_flag", [True, False]) +@pytest.mark.parametrize(("ov_type", "numpy_dtype"), [ + (ov.Type.f32, np.float32), + (ov.Type.f64, np.float64), + (ov.Type.f16, np.float16), + (ov.Type.i8, np.int8), + (ov.Type.u8, np.uint8), + (ov.Type.i32, np.int32), + (ov.Type.u32, np.uint32), + (ov.Type.i16, np.int16), + (ov.Type.u16, np.uint16), + (ov.Type.i64, np.int64), + (ov.Type.u64, np.uint64), + (ov.Type.boolean, bool), +]) +def test_with_numpy_memory(cls, shared_flag, ov_type, numpy_dtype): + arr = np.ascontiguousarray(generate_image().astype(numpy_dtype)) + ov_object = cls(array=arr, shared_memory=shared_flag) + + assert ov_object.get_element_type() == ov_type + assert tuple(ov_object.shape) == arr.shape + + assert isinstance(ov_object.data, np.ndarray) + assert ov_object.data.dtype == numpy_dtype + assert ov_object.data.shape == arr.shape + assert np.array_equal(ov_object.data, arr) + + if shared_flag is True: + assert np.shares_memory(arr, ov_object.data) + else: + assert not (np.shares_memory(arr, ov_object.data)) + + +@pytest.mark.parametrize("cls", [Tensor, Constant]) +@pytest.mark.parametrize("shared_flag", [True, False]) +def test_with_external_memory(cls, shared_flag): + class ArrayLikeObject: + # Array-like object to test inputs similar to torch.Tensor and tf.Tensor + def __init__(self, array) -> None: + self.data = array + + @property + def shape(self): + return self.data.shape + + @property + def dtype(self): + return self.data.dtype + + def to_numpy(self): + return self.data + + external_object = ArrayLikeObject(np.ascontiguousarray(generate_image())) + ov_object = cls(array=external_object.to_numpy(), shared_memory=shared_flag) + + assert np.array_equal(ov_object.data.dtype, external_object.dtype) + assert np.array_equal(ov_object.data.shape, external_object.shape) + assert np.array_equal(ov_object.data, external_object.to_numpy()) + + if shared_flag is True: + 
assert np.shares_memory(external_object.to_numpy(), ov_object.data) + else: + assert not (np.shares_memory(external_object.to_numpy(), ov_object.data)) + + +@pytest.mark.parametrize("cls", [Constant]) +@pytest.mark.parametrize("shared_flag_one", [True, False]) +@pytest.mark.parametrize("shared_flag_two", [True, False]) +@pytest.mark.parametrize(("ov_type", "numpy_dtype"), [ + (ov.Type.f32, np.float32), + (ov.Type.f64, np.float64), + (ov.Type.f16, np.float16), + (ov.Type.i8, np.int8), + (ov.Type.u8, np.uint8), + (ov.Type.i32, np.int32), + (ov.Type.u32, np.uint32), + (ov.Type.i16, np.int16), + (ov.Type.u16, np.uint16), + (ov.Type.i64, np.int64), + (ov.Type.u64, np.uint64), + (ov.Type.boolean, bool), +]) +def test_with_tensor_memory(cls, shared_flag_one, shared_flag_two, ov_type, numpy_dtype): + arr = np.ascontiguousarray(generate_image().astype(numpy_dtype)) + ov_tensor = Tensor(arr, shared_memory=shared_flag_one) + ov_object = cls(tensor=ov_tensor, shared_memory=shared_flag_two) + + # Case 1: all data is shared + if shared_flag_one is True and shared_flag_two is True: + assert np.shares_memory(arr, ov_object.data) + assert np.shares_memory(ov_tensor.data, ov_object.data) + # Case 2: data is shared only between object and Tensor + elif shared_flag_one is False and shared_flag_two is True: + assert not (np.shares_memory(arr, ov_object.data)) + assert np.shares_memory(ov_tensor.data, ov_object.data) + # Case 3: data is not shared, copy occurs in the object's constructor + else: + assert not (np.shares_memory(arr, ov_object.data)) + assert not (np.shares_memory(ov_tensor.data, ov_object.data)) diff --git a/src/bindings/python/tests/test_runtime/test_tensor.py b/src/bindings/python/tests/test_runtime/test_tensor.py index 9e4d0daf0fd4b1..f9eb556a15e482 100644 --- a/src/bindings/python/tests/test_runtime/test_tensor.py +++ b/src/bindings/python/tests/test_runtime/test_tensor.py @@ -148,13 +148,6 @@ def test_init_with_numpy_copy_memory(ov_type, numpy_dtype): assert 
ov_tensor.byte_size == arr.nbytes -def test_init_with_numpy_fail(): - arr = np.asfortranarray(generate_image()) - with pytest.raises(RuntimeError) as e: - _ = Tensor(array=arr, shared_memory=True) - assert "Tensor with shared memory must be C contiguous" in str(e.value) - - def test_init_with_roi_tensor(): array = np.random.normal(size=[1, 3, 48, 48]) ov_tensor1 = Tensor(array) diff --git a/src/bindings/python/tests/test_transformations/test_manager.py b/src/bindings/python/tests/test_transformations/test_manager.py index 1aa7cbb85d8dbe..d88863c43561a3 100644 --- a/src/bindings/python/tests/test_transformations/test_manager.py +++ b/src/bindings/python/tests/test_transformations/test_manager.py @@ -32,14 +32,10 @@ def test_registration_and_pass_name(): GraphRewrite().set_name("Anchor") BackwardGraphRewrite().set_name("BackAnchor") - # Preserve legacy behaviour when registered pass doesn't exist - # and in this case we shouldn't throw an exception. - manager.register_pass("NotExistingPass") - def test_negative_pass_registration(): manager = Manager() expect_exception(lambda: manager.register_pass(PatternReplacement)) expect_exception(lambda: manager.register_pass("PatternReplacement", PatternReplacement())) expect_exception(lambda: manager.register_pass("Serialize", Serialize("out.xml", "out.bin"))) - expect_exception(lambda: manager.register_pass("Serialize", "out.xml", "out.bin", "out.wrong")) + expect_exception(lambda: manager.register_pass(Serialize("out.xml", "out.bin", "out.wrong"))) diff --git a/src/bindings/python/tests/test_transformations/test_offline_api.py b/src/bindings/python/tests/test_transformations/test_offline_api.py index 1cae5c0af5ab8c..cf3089e30fc00b 100644 --- a/src/bindings/python/tests/test_transformations/test_offline_api.py +++ b/src/bindings/python/tests/test_transformations/test_offline_api.py @@ -6,7 +6,7 @@ import pytest import numpy as np from openvino.runtime import serialize -from openvino.offline_transformations import ( +from 
openvino._offline_transformations import ( apply_moc_transformations, apply_pot_transformations, apply_low_latency_transformation, diff --git a/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py b/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py index 27efce283adba4..2427d60f64d841 100644 --- a/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py +++ b/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py @@ -27,7 +27,7 @@ def einsum_op_exec(input_shapes: list, equation: str, data_type: np.dtype, ng_inputs = [] np_inputs = [] for i in range(num_inputs): - input_i = np.random.random_integers(10, size=input_shapes[i]).astype(data_type) + input_i = np.random.randint(1, 10 + 1, size=input_shapes[i]).astype(data_type) np_inputs.append(input_i) ng_inputs.append(ng.parameter(input_i.shape, dtype=data_type)) diff --git a/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py b/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py index f1b095b08ea5e9..4cd2bbcba2fc96 100644 --- a/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py +++ b/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py @@ -33,7 +33,7 @@ def test_elu_operator_with_scalar(): def test_fake_quantize(): - levels = np.float32(4) + levels = np.int32(4) data_shape = [1, 2, 3, 4] bound_shape = [] @@ -60,7 +60,7 @@ def test_fake_quantize(): def test_depth_to_space(): data_shape = [1, 4, 2, 3] mode = "blocks_first" - block_size = np.float32(2) + block_size = np.int32(2) parameter_data = ng.parameter(data_shape, name="Data", dtype=np.float32) diff --git a/src/bindings/python/wheel/requirements-dev.txt b/src/bindings/python/wheel/requirements-dev.txt index 38b09d5d1effb7..2ac9ed6f6dca1b 100644 --- a/src/bindings/python/wheel/requirements-dev.txt +++ b/src/bindings/python/wheel/requirements-dev.txt @@ -1,3 +1,3 @@ -setuptools>=53.0.0 +setuptools>=53.0.0,<=65.7.0 wheel>=0.38.1 -patchelf; 
sys_platform == 'linux' and platform_machine == 'x86_64' or sys_platform == 'linux' and platform_machine == 'aarch64' +patchelf; sys_platform == 'linux' and platform_machine == 'x86_64' diff --git a/src/cmake/ie_parallel.cmake b/src/cmake/ie_parallel.cmake index 5e92e854640808..a2f15636d8a8eb 100644 --- a/src/cmake/ie_parallel.cmake +++ b/src/cmake/ie_parallel.cmake @@ -141,6 +141,23 @@ macro(ov_find_package_tbb) list(APPEND TBB_IMPORTED_TARGETS ${target}) endif() endforeach() + + if(WIN32 AND TARGET TBB::tbbbind_2_5) + # Add HWLOC::hwloc_2_5 target to check via Apivalidator + get_target_property(TBB_location TBB::tbb IMPORTED_LOCATION_RELEASE) + get_filename_component(TBB_dir "${TBB_location}" DIRECTORY) + set(hwloc_dll_name "${CMAKE_SHARED_LIBRARY_PREFIX}hwloc${CMAKE_SHARED_LIBRARY_SUFFIX}") + find_file(HWLOC_DLL NAMES ${hwloc_dll_name} PATHS "${TBB_dir}" DOC "Path to hwloc.dll") + + if(NOT HWLOC_DLL) + message(FATAL_ERROR "Failed to find ${hwloc_dll_name} in ${TBB_dir}") + endif() + + add_library(HWLOC::hwloc_2_5 SHARED IMPORTED) + set_property(TARGET HWLOC::hwloc_2_5 APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(HWLOC::hwloc_2_5 PROPERTIES + IMPORTED_LOCATION_RELEASE "${HWLOC_DLL}") + endif() endif() if(NOT TBB_FOUND) diff --git a/src/cmake/openvino.cmake b/src/cmake/openvino.cmake index a3477788c15887..7870e2963e3c59 100644 --- a/src/cmake/openvino.cmake +++ b/src/cmake/openvino.cmake @@ -29,7 +29,6 @@ add_library(openvino::runtime ALIAS ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME runtime) ie_add_vs_version_file(NAME ${TARGET_NAME} FILEDESCRIPTION "OpenVINO runtime library") -ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) target_include_directories(${TARGET_NAME} PUBLIC $ @@ -65,6 +64,9 @@ endif() set_ie_threading_interface_for(${TARGET_NAME}) ie_mark_target_as_cc(${TARGET_NAME}) +# must be called after all target_link_libraries +ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME} 
EXTRA ${TBB_IMPORTED_TARGETS}) + # LTO set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/common/preprocessing/src/CMakeLists.txt b/src/common/preprocessing/src/CMakeLists.txt index 11fa0eadb7ab25..3e8a70b9e61151 100644 --- a/src/common/preprocessing/src/CMakeLists.txt +++ b/src/common/preprocessing/src/CMakeLists.txt @@ -167,6 +167,7 @@ if(ENABLE_GAPI_PREPROCESSING) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) endif() + # must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) ie_add_vs_version_file(NAME ${TARGET_NAME} diff --git a/src/common/snippets/src/pass/convert_constants.cpp b/src/common/snippets/src/pass/convert_constants.cpp index c3d2318b49e8c2..951f51825c8f5f 100644 --- a/src/common/snippets/src/pass/convert_constants.cpp +++ b/src/common/snippets/src/pass/convert_constants.cpp @@ -32,5 +32,5 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() { ngraph::replace_node(constant, scalar); return true; }; - register_matcher(std::make_shared(constants), callback); + register_matcher(std::make_shared(constants, matcher_name), callback); } diff --git a/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp b/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp index 7a76f7207f7b8f..ef43e677f6f8cb 100644 --- a/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp +++ b/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp @@ -16,7 +16,7 @@ ngraph::snippets::pass::ConvertPowerToPowerStatic::ConvertPowerToPowerStatic() { is_type(n->get_input_node_shared_ptr(1)); }); ngraph::graph_rewrite_callback callback = [](ngraph::pattern::Matcher &m) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars") + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, 
"Snippets::op::ConvertPowerToPowerStatic") auto power = ov::as_type_ptr(m.get_match_root()); auto scalar = ov::as_type_ptr(power->get_input_node_shared_ptr(1)); auto value = scalar->cast_vector()[0]; diff --git a/src/common/transformations/include/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp b/src/common/transformations/include/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp index 8aeda0614990d5..780b9ad3eec812 100644 --- a/src/common/transformations/include/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp @@ -4,28 +4,25 @@ #pragma once -#include -#include -#include -#include +#include "openvino/pass/pass.hpp" +#include "transformations_visibility.hpp" namespace ov { namespace pass { -class TRANSFORMATIONS_API RemoveMultiSubGraphOpDanglingParams; +class TRANSFORMATIONS_API RemoveMultiSubGraphOpDanglingParamsResults; } // namespace pass } // namespace ov /* * @ingroup ie_transformation_common_api - * @brief RemoveMultiSubGraphOpDanglingParams transformation - * removed MultiSubGraphOp inputs which are not connected to other nodes - * in the bodies of a MultiSubGraphOp + * @brief RemoveMultiSubGraphOpDanglingParamsResults transformation removes MultiSubGraphOp inputs which are not + * connected to other nodes in the bodies of a MultiSubGraphOp and outputs that are not used in the Model */ -class ov::pass::RemoveMultiSubGraphOpDanglingParams : public ov::pass::MatcherPass { +class ov::pass::RemoveMultiSubGraphOpDanglingParamsResults : public ov::pass::ModelPass { public: - OPENVINO_RTTI("RemoveMultiSubGraphOpDanglingParams", "0"); - RemoveMultiSubGraphOpDanglingParams(); + OPENVINO_RTTI("RemoveMultiSubGraphOpDanglingParamsResults", "0"); + bool run_on_model(const std::shared_ptr& m) override; }; diff --git 
a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp index c3472ea896c0a1..f394f7d037d26c 100644 --- a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp @@ -104,21 +104,21 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr(); } - // RemoveConcatZeroDimInput and RemoveMultiSubGraphOpDanglingParams + // RemoveConcatZeroDimInput and RemoveMultiSubGraphOpDanglingParamsResults // should be performed before first ConstantFolding call. // The passes can deteach graph branches where zero dimesion is calculated. // Zero dimensions in shape causes creation empty tensors, which are incorrect during CF. // In particular, if zero dim tensor is consumed in body of MultiSubGraphOp - // RemoveConcatZeroDimInput and RemoveMultiSubGraphOpDanglingParams should be called together. + // RemoveConcatZeroDimInput and RemoveMultiSubGraphOpDanglingParamsResults should be called together. using namespace ov::pass; REGISTER_PASS(manager, EliminateScatterUpdate) REGISTER_PASS(manager, RemoveConcatZeroDimInput) REGISTER_PASS(manager, Validate) // todo: ticket 96960 - // the order EliminateDuplicateTIInputs and RemoveMultiSubGraphOpDanglingParams is important + // the order EliminateDuplicateTIInputs and RemoveMultiSubGraphOpDanglingParamsResults is important // it looks like we need to combine these transformations into one. 
REGISTER_PASS(manager, EliminateDuplicateTIInputs); - REGISTER_PASS(manager, RemoveMultiSubGraphOpDanglingParams) + REGISTER_PASS(manager, RemoveMultiSubGraphOpDanglingParamsResults) REGISTER_PASS(manager, FoldSubgraphEmptyInputs) REGISTER_PASS(manager, DisableRandomUniformConstantFolding) REGISTER_PASS(manager, PushConstantToSubgraph) diff --git a/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp b/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp index 3c25795b296862..5a88b007341d1d 100644 --- a/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp @@ -183,7 +183,7 @@ ov::pass::PullReshapeThroughReduce::PullReshapeThroughReduce() { matcher_pass_callback callback = [=](pattern::Matcher& m) { auto& pattern_map = m.get_pattern_value_map(); - const auto input_node = pattern_map.at(input).get_node_shared_ptr(); + const auto input_node = pattern_map.at(input); const auto reduce_node = std::dynamic_pointer_cast(pattern_map.at(reduce).get_node_shared_ptr()); if (!reduce_node) { @@ -194,7 +194,7 @@ ov::pass::PullReshapeThroughReduce::PullReshapeThroughReduce() { return false; } const auto unsqueeze_axes = - try_get_unsqueeze_axes_from_reshape(reshape_node->get_shape(), input_node->get_shape()); + try_get_unsqueeze_axes_from_reshape(reshape_node->get_shape(), input_node.get_shape()); if (unsqueeze_axes.empty()) { return false; } diff --git a/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp b/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp index 701b4b4441dfee..022e1283b14c11 100644 --- a/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp +++ 
b/src/common/transformations/src/transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.cpp @@ -4,35 +4,125 @@ #include "transformations/common_optimizations/remove_multi_subgraph_op_dangling_params.hpp" -#include -#include -#include -#include -#include - #include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/util/multi_subgraph_base.hpp" +#include "openvino/opsets/opset10.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" -ov::pass::RemoveMultiSubGraphOpDanglingParams::RemoveMultiSubGraphOpDanglingParams() { - MATCHER_SCOPE(RemoveMultiSubGraphOpDanglingParams); - auto multi_subgraph_op_pattern = pattern::wrap_type(); - ov::matcher_pass_callback callback = [=](pattern::Matcher& m) { - auto multi_subgraph_op = std::dynamic_pointer_cast(m.get_match_root()); - if (multi_subgraph_op == nullptr) { - return false; +using namespace ov::op::util; + +bool ov::pass::RemoveMultiSubGraphOpDanglingParamsResults::run_on_model(const std::shared_ptr& m) { + RUN_ON_MODEL_SCOPE(RemoveMultiSubGraphOpDanglingParamsResults); + bool is_changed = false; + auto ops = m->get_ordered_ops(); + // Going in reverse order + for (auto it = ops.rbegin(); it != ops.rend(); ++it) { + auto multi_subgraph_op = std::dynamic_pointer_cast(*it); + if (!multi_subgraph_op) + continue; + auto if_op = std::dynamic_pointer_cast(multi_subgraph_op); + auto loop_op = std::dynamic_pointer_cast(multi_subgraph_op); + auto ti_op = std::dynamic_pointer_cast(multi_subgraph_op); + // Only If, Loop and TensorIterator are supported + if (!if_op && !loop_op && !ti_op) + continue; + + // Shouldn't remove special output + int64_t special_out_port = -1; + if (loop_op) { + special_out_port = loop_op->get_special_body_ports().body_condition_output_idx; + } + + const auto subgraphs_size = multi_subgraph_op->get_internal_subgraphs_size(); + // Starting from outputs + std::set outputs_to_remove; + for (size_t out_idx = 0; out_idx < 
multi_subgraph_op->get_output_size(); ++out_idx) { + if (multi_subgraph_op->output(out_idx).get_target_inputs().empty()) { + outputs_to_remove.insert(out_idx); + } + } + std::vector new_op_out_desc; + for (size_t body_idx = 0; body_idx < subgraphs_size; ++body_idx) { + auto body = multi_subgraph_op->get_function(static_cast(body_idx)); + // recursive call of this transformation on each body + run_on_model(body); + // need to pay attention to merged inputs, shouldn't remove them + MultiSubGraphOp::MultiSubgraphInputDescriptionVector merged_input_descs; + for (size_t body_idx = 0; body_idx < subgraphs_size; ++body_idx) { + for (const auto& desc : multi_subgraph_op->get_input_descriptions(static_cast(body_idx))) { + if (const auto& merged_input_desc = + ov::as_type_ptr(desc)) { + merged_input_descs.push_back(desc); + } + } + } + const auto& out_desc = multi_subgraph_op->get_output_descriptions(static_cast(body_idx)); + MultiSubGraphOp::MultiSubgraphOutputDescriptionVector new_out_desc; + std::set results_idxs_to_remove; + for (const auto& odesc : out_desc) { + bool to_remove = outputs_to_remove.find(odesc->m_output_index) != outputs_to_remove.end(); + if (!to_remove) { + new_out_desc.push_back(odesc); + } else if (static_cast(odesc->m_body_value_index) == special_out_port) { + // If this is special out port, we will remove output description and output, but do not remove + // Result + to_remove = false; + } + if (to_remove) { + for (const auto& desc : merged_input_descs) { + const auto& mdesc = ov::as_type_ptr(desc); + if (mdesc && mdesc->m_body_value_index == odesc->m_body_value_index) { + // Cannot remove Result which is part of merged input + to_remove = false; + } + } + } + if (to_remove) { + results_idxs_to_remove.insert(odesc->m_body_value_index); + } + } + new_op_out_desc.push_back(new_out_desc); + auto results = body->get_results(); + // go in reverse order to first delete last result + for (auto it = results_idxs_to_remove.rbegin(); it != 
results_idxs_to_remove.rend(); ++it) { + body->remove_result(results.at(*it)); + is_changed = true; + // We need to go over output descriptors and modify them to reflect deleted result + for (auto& desc : new_out_desc) { + if (desc->m_body_value_index > *it) { + desc->m_body_value_index--; + } + } + for (auto& desc : merged_input_descs) { + const auto& mdesc = ov::as_type_ptr(desc); + if (mdesc && mdesc->m_body_value_index > *it) { + mdesc->m_body_value_index--; + } + } + if (special_out_port != -1) { + if (special_out_port > static_cast(*it)) { + special_out_port--; + } + } + } + if (special_out_port != -1) { + loop_op->set_special_body_ports( + {loop_op->get_special_body_ports().current_iteration_input_idx, special_out_port}); + } } + // Remove inputs bool pass_required = false; std::set> required_inputs; auto op_inputs = multi_subgraph_op->input_values(); std::vector> to_remove_descriptors_indexes; - const auto subgraphs_size = multi_subgraph_op->get_internal_subgraphs_size(); to_remove_descriptors_indexes.resize(subgraphs_size); for (size_t body_idx = 0; body_idx < subgraphs_size; ++body_idx) { auto& body_func = multi_subgraph_op->get_function(static_cast(body_idx)); auto& body_params = body_func->get_parameters(); auto& body_in_descriptors = multi_subgraph_op->get_input_descriptions(static_cast(body_idx)); - // collect all descriptors which should be removed and reqired inputs + // collect all descriptors which should be removed and required inputs for (size_t i = 0; i < body_in_descriptors.size(); ++i) { auto& body_param = body_params[body_in_descriptors[i]->m_body_parameter_index]; if (body_param->get_output_target_inputs(0).size() == 0) { @@ -46,6 +136,7 @@ ov::pass::RemoveMultiSubGraphOpDanglingParams::RemoveMultiSubGraphOpDanglingPara } } if (pass_required) { + is_changed = true; using DescType = op::util::MultiSubGraphOp::MultiSubgraphInputDescriptionVector; auto update_body_param_desc = [](DescType& descriptors, uint64_t removed_body_idx) { for (auto& 
desc : descriptors) { @@ -97,8 +188,47 @@ ov::pass::RemoveMultiSubGraphOpDanglingParams::RemoveMultiSubGraphOpDanglingPara } multi_subgraph_op->set_arguments(op_inputs); } - return false; - }; - auto m = std::make_shared(multi_subgraph_op_pattern, matcher_name); - this->register_matcher(m, callback); + if (!outputs_to_remove.empty()) { + // we need to reconstruct operation with new number of outputs, we cannot reduce number of outputs of + // existing op + std::shared_ptr new_op; + if (if_op) { + new_op = std::make_shared(); + } else if (loop_op) { + auto new_loop_op = std::make_shared(); + new_loop_op->set_special_body_ports(loop_op->get_special_body_ports()); + new_op = new_loop_op; + } else if (ti_op) { + new_op = std::make_shared(); + } + new_op->set_arguments(multi_subgraph_op->input_values()); + new_op->set_friendly_name(multi_subgraph_op->get_friendly_name()); + copy_runtime_info(multi_subgraph_op, new_op); + for (int body_idx = 0; static_cast(body_idx) < subgraphs_size; ++body_idx) { + new_op->set_function(body_idx, multi_subgraph_op->get_function(body_idx)); + new_op->set_input_descriptions(body_idx, multi_subgraph_op->get_input_descriptions(body_idx)); + new_op->set_output_descriptions(body_idx, new_op_out_desc.at(body_idx)); + } + size_t removed_outs_counter = 0; + new_op->set_output_size(multi_subgraph_op->get_output_size() - outputs_to_remove.size()); + for (size_t out_idx = 0; out_idx < multi_subgraph_op->get_output_size(); ++out_idx) { + if (outputs_to_remove.find(out_idx) != outputs_to_remove.end()) { + // Need to go through all output descriptors to reflect deleted output + for (int body_idx = 0; static_cast(body_idx) < subgraphs_size; ++body_idx) { + for (auto& odesc : new_op->get_output_descriptions(body_idx)) { + if (odesc->m_output_index > out_idx - removed_outs_counter) { + odesc->m_output_index--; + } + } + } + ++removed_outs_counter; + } else { + // replace output with new one + 
multi_subgraph_op->output(out_idx).replace(new_op->output(out_idx - removed_outs_counter)); + } + } + new_op->validate_and_infer_types(); + } + } + return is_changed; } diff --git a/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp b/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp index 839ba82cb49f23..25dc8300f674e9 100644 --- a/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp @@ -13,7 +13,9 @@ #include "itt.hpp" +using namespace std; using namespace ov; +using namespace ov::opset7; static bool can_propagate_conv_stride(const std::shared_ptr& conv) { const auto& kernel_shape = conv->input_value(1).get_shape(); @@ -39,40 +41,36 @@ static std::tuple check_next_ops(const std::vector& first, - ngraph::Input& second, - const ngraph::Strides& strides) { +static void insert_pooling(const Output& first, Input& second, const Strides& strides) { + pass::NodeRegistry rg; auto first_node = first.get_node_shared_ptr(); - auto rank = first.get_partial_shape().rank(); - bool do_reshape = rank.is_static() && static_cast(rank.get_length()) < strides.size() + 2; + const auto rank = first.get_partial_shape().rank(); + const bool do_reshape = rank.is_static() && static_cast(rank.get_length()) < strides.size() + 2; if (do_reshape) { - size_t diff = strides.size() + 2 - static_cast(rank.get_length()); - auto ones = opset7::Constant::create(ngraph::element::i64, ngraph::Shape{diff}, std::vector(diff, 1)); - auto current_shape = std::make_shared(first); - std::shared_ptr new_shape = - std::make_shared(ngraph::OutputVector{ones, current_shape}, 0); - std::shared_ptr constant_new_shape = get_constant_from_source(new_shape); - if (constant_new_shape) + const size_t diff = strides.size() + 2 - static_cast(rank.get_length()); + const auto ones = 
rg.make(element::i64, Shape{diff}, vector(diff, 1)); + const auto current_shape = rg.make(first); + shared_ptr new_shape = rg.make(OutputVector{ones, current_shape}, 0); + if (const auto constant_new_shape = get_constant_from_source(new_shape)) { + rg.add(constant_new_shape); new_shape = constant_new_shape; - first_node = std::make_shared(first_node, new_shape, false); + } + first_node = rg.make(first_node, new_shape, false); } - std::shared_ptr new_node = std::make_shared(first_node, - strides, - ngraph::Shape{}, - ngraph::Shape{}, - ngraph::Shape(strides.size(), 1)); + shared_ptr new_node = rg.make(first_node, strides, Shape{}, Shape{}, Shape(strides.size(), 1)); if (do_reshape) { // squeeze dimensions back - size_t diff = strides.size() + 2 - static_cast(rank.get_length()); - std::vector axes(diff); - std::iota(axes.begin(), axes.end(), 0); - new_node = std::make_shared( - new_node, - opset7::Constant::create(ngraph::element::u64, ngraph::Shape{diff}, axes)); + const size_t diff = strides.size() + 2 - static_cast(rank.get_length()); + vector axes(diff); + iota(axes.begin(), axes.end(), 0); + new_node = rg.make(new_node, rg.make(element::u64, Shape{diff}, axes)); } - std::shared_ptr constant_new_node = get_constant_from_source(new_node); - if (constant_new_node) + if (const auto constant_new_node = get_constant_from_source(new_node)) { + rg.add(constant_new_node); new_node = constant_new_node; + } + + copy_runtime_info(as_node_vector({second.get_source_output()}), rg.get()); second.replace_source_output(new_node); } diff --git a/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp b/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp index f9d9db0beae4bd..349f5b39a03232 100644 --- a/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp +++ b/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp @@ -168,6 +168,27 @@ 
INSTANTIATE_TEST_SUITE_P(PullUnsqueezeThroughReduceLogicalOr, PullUnsqueezeThroughReduceLogicalOr, ValuesIn(reduce_logical_or_params)); +TEST_F(TransformationTestsF, PullUnsqueezeThroughReduceMeanInputHasMoreThanOneOutput) { + const auto input = std::make_shared(element::f32, PartialShape{10, 10, 15}); + const auto split = std::make_shared(input, Constant::create(element::i64, Shape{}, {0}), 2); + const auto unsqueeze_axes = Constant::create(element::i64, Shape{1}, {0}); + { + const auto unsqueeze = std::make_shared(split->output(0), unsqueeze_axes); + const auto reduce_axes = Constant::create(element::i64, Shape{}, {1}); + const auto reduce_mean = std::make_shared(unsqueeze, reduce_axes); + + model = std::make_shared(OutputVector{reduce_mean, split->output(1)}, ParameterVector{input}); + manager.register_pass(); + } + { + const auto reduce_axes = Constant::create(element::i64, Shape{}, {0}); + const auto reduce_mean = std::make_shared(split->output(0), reduce_axes); + const auto unsqueeze = std::make_shared(reduce_mean, unsqueeze_axes); + + model_ref = std::make_shared(OutputVector{unsqueeze, split->output(1)}, ParameterVector{input}); + } +} + TEST_F(TransformationTestsF, PullUnsqueezeThroughReduceSkipIfTheSameAxes) { model = generate_unsqueeze_model(element::f32, {5, 10, 15}, {0, 1}, {1, 2}); manager.register_pass(); @@ -296,6 +317,28 @@ INSTANTIATE_TEST_SUITE_P(PullReshapeThroughReduceLogicalOr, PullReshapeThroughReduceLogicalOr, ValuesIn(reduce_logical_or_reshape_params)); +TEST_F(TransformationTestsF, PullReshapeThroughReduceMeanInputHasMoreThanOneOutput) { + const auto input = std::make_shared(element::f32, PartialShape{10, 10, 15}); + const auto split = std::make_shared(input, Constant::create(element::i64, Shape{}, {0}), 2); + { + const auto target_shape = Constant::create(element::i64, Shape{4}, {1, 5, 10, 15}); + const auto reshape = std::make_shared(split->output(0), target_shape, false); + const auto reduce_axes = Constant::create(element::i64, 
Shape{}, {1}); + const auto reduce_mean = std::make_shared(reshape, reduce_axes); + + model = std::make_shared(OutputVector{reduce_mean, split->output(1)}, ParameterVector{input}); + manager.register_pass(); + } + { + const auto reduce_axes = Constant::create(element::i64, Shape{}, {0}); + const auto reduce_mean = std::make_shared(split->output(0), reduce_axes); + const auto target_shape = Constant::create(element::i64, Shape{3}, {1, 10, 15}); + const auto reshape = std::make_shared(reduce_mean, target_shape, false); + + model_ref = std::make_shared(OutputVector{reshape, split->output(1)}, ParameterVector{input}); + } +} + TEST_F(TransformationTestsF, PullReshapeThroughReduceMeanSkipIfDynamicInput) { model = generate_reshape_model(element::f32, {5, Dimension::dynamic(), 15}, {1, 5, 10, 15}, {2}); manager.register_pass(); diff --git a/src/common/transformations/tests/common_optimizations/remove_multi_subgraph_op_dangling_params_tests.cpp b/src/common/transformations/tests/common_optimizations/remove_multi_subgraph_op_dangling_params_tests.cpp index eb87973f9bf238..bbfe28a67a777f 100644 --- a/src/common/transformations/tests/common_optimizations/remove_multi_subgraph_op_dangling_params_tests.cpp +++ b/src/common/transformations/tests/common_optimizations/remove_multi_subgraph_op_dangling_params_tests.cpp @@ -42,7 +42,7 @@ TEST_F(TransformationTestsF, RemoveLoopDanglingParameters) { auto loop_res = std::make_shared(loop->get_iter_value(abs)); function = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); - manager.register_pass(); + manager.register_pass(); } { auto body = std::make_shared(OutputVector{condition, abs}, ParameterVector{bi}); @@ -81,7 +81,7 @@ TEST_F(TransformationTestsF, RemoveLoopManyDanglingParameters) { auto loop_res = std::make_shared(loop->get_iter_value(abs)); function = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b, c}); - manager.register_pass(); + manager.register_pass(); } { auto body = 
std::make_shared(OutputVector{condition, abs}, ParameterVector{bi}); @@ -124,7 +124,7 @@ TEST_F(TransformationTestsF, RemoveLoopManyDanglingParameters2) { auto loop_res = std::make_shared(loop->get_iter_value(abs)); function = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b, c, d}); - manager.register_pass(); + manager.register_pass(); } { auto body = std::make_shared(OutputVector{condition, abs}, ParameterVector{bi, di}); @@ -160,7 +160,7 @@ TEST_F(TransformationTestsF, RemoveLoopDanglingParametersIfConcatEmptyTensor) { function = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); } { auto concat = std::make_shared(NodeVector{ai}, 0); @@ -199,7 +199,7 @@ TEST_F(TransformationTestsF, RemoveIfDanglingParametersFromBodiesAndInputs) { auto res = if_op->set_output(then_op_res, else_op_res); function = std::make_shared(OutputVector{res}, ParameterVector{X, Y}); - manager.register_pass(); + manager.register_pass(); } { auto then_body = std::make_shared(OutputVector{then_op_res}, ParameterVector{Xte}); @@ -240,7 +240,7 @@ TEST_F(TransformationTestsF, RemoveIfDanglingParametersOnlyFromBodies) { auto res = if_op->set_output(then_op_res, else_op_res); function = std::make_shared(OutputVector{res}, ParameterVector{X, Y}); - manager.register_pass(); + manager.register_pass(); } { auto then_body = std::make_shared(OutputVector{then_op_res}, ParameterVector{Xt}); @@ -286,7 +286,7 @@ TEST_F(TransformationTestsF, RemoveIfManyDanglingParameters) { auto res = if_op->set_output(then_op_res, else_op_res); function = std::make_shared(OutputVector{res}, ParameterVector{X, Y, Z}); - manager.register_pass(); + manager.register_pass(); } { auto then_body = std::make_shared(OutputVector{then_op_res}, ParameterVector{Xt, Zt}); @@ -332,7 +332,7 @@ TEST_F(TransformationTestsF, RemoveIfDanglingParamFromOneBodyAndUpdateAllDescrip auto res = if_op->set_output(then_op_res, else_op_res); 
function = std::make_shared(OutputVector{res}, ParameterVector{X, Y, Z}); - manager.register_pass(); + manager.register_pass(); } { auto then_body = std::make_shared(OutputVector{then_op_res}, ParameterVector{Zt}); @@ -368,7 +368,7 @@ TEST_F(TransformationTestsF, RemoveTensorIteratorDanglingParameter) { auto res = std::make_shared(out); function = std::make_shared(OutputVector{res}, ParameterVector{X, Y, M}); - manager.register_pass(); + manager.register_pass(); } { auto body = std::make_shared(OutputVector{Zo}, ParameterVector{Xi, Yi}); @@ -407,7 +407,7 @@ TEST_F(TransformationTestsF, RemoveTensorIteratorManyDanglingParameters) { auto res = std::make_shared(out); function = std::make_shared(OutputVector{res}, ParameterVector{X, Y, Z, M}); - manager.register_pass(); + manager.register_pass(); } { auto body = std::make_shared(OutputVector{Zo}, ParameterVector{Xi, Zi}); @@ -421,3 +421,338 @@ TEST_F(TransformationTestsF, RemoveTensorIteratorManyDanglingParameters) { function_ref = std::make_shared(OutputVector{res}, ParameterVector{X, Y, Z, M}); } } + +TEST_F(TransformationTestsF, RemoveIfDanglingResult) { + auto X = std::make_shared(element::f32, Shape{2, 4, 1}); + auto Y = std::make_shared(element::f32, Shape{2, 4, 1}); + auto cond = std::make_shared(element::boolean, Shape{1}, true); + + auto Xt = std::make_shared(element::f32, PartialShape::dynamic()); + auto Yt = std::make_shared(element::f32, PartialShape::dynamic()); + + auto then_op1 = std::make_shared(Xt, Yt); + auto then_op1_res = std::make_shared(then_op1); + auto then_op2 = std::make_shared(Xt, Yt); + auto then_op2_res = std::make_shared(then_op2); + + auto Xe = std::make_shared(element::f32, PartialShape::dynamic()); + + auto else_op1 = std::make_shared(Xe, Xe); + auto else_op1_res = std::make_shared(else_op1); + auto else_op2 = std::make_shared(Xe, Xe); + auto else_op2_res = std::make_shared(else_op2); + { + auto then_body = std::make_shared(OutputVector{then_op1_res, then_op2_res}, ParameterVector{Xt, 
Yt}); + auto else_body = std::make_shared(OutputVector{else_op1_res, else_op2_res}, ParameterVector{Xe}); + auto if_op = std::make_shared(cond); + if_op->set_then_body(then_body); + if_op->set_else_body(else_body); + if_op->set_input(X, Xt, Xe); + if_op->set_input(Y, Yt, nullptr); + auto res1 = if_op->set_output(then_op1_res, else_op1_res); + auto res2 = if_op->set_output(then_op2_res, else_op2_res); + // Not using res2 output + model = std::make_shared(OutputVector{res1}, ParameterVector{X, Y}); + + manager.register_pass(); + } + { + auto then_body = std::make_shared(OutputVector{then_op1_res}, ParameterVector{Xt, Yt}); + auto else_body = std::make_shared(OutputVector{else_op1_res}, ParameterVector{Xe}); + auto if_op = std::make_shared(cond); + if_op->set_then_body(then_body); + if_op->set_else_body(else_body); + if_op->set_input(X, Xt, Xe); + if_op->set_input(Y, Yt, nullptr); + auto res1 = if_op->set_output(then_op1_res, else_op1_res); + model_ref = std::make_shared(OutputVector{res1}, ParameterVector{X, Y}); + } +} + +TEST_F(TransformationTestsF, RemoveLoopDanglingResults) { + auto trip_count = std::make_shared(element::i64, Shape{}, 10); + auto condition = std::make_shared(element::boolean, Shape{}, true); + + auto a = std::make_shared(element::f32, Shape{2, 2}); + auto ai = std::make_shared(element::f32, Shape{2, 2}); + auto b = std::make_shared(element::f32, Shape{2, 2}); + auto bi = std::make_shared(element::f32, Shape{2, 2}); + + auto mul = std::make_shared(ai, bi); + auto abs1 = std::make_shared(mul); + auto add = std::make_shared(ai, bi); + auto abs2 = std::make_shared(add); + { + auto body = std::make_shared(OutputVector{condition, abs1, abs2}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + 
loop->get_iter_value(abs2); + // abs2 result is unused + model = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{condition, abs1}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + model_ref = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + } +} + +TEST_F(TransformationTestsF, RemoveLoopDanglingParamsAndResults) { + auto trip_count = std::make_shared(element::i64, Shape{}, 10); + auto condition = std::make_shared(element::boolean, Shape{}, true); + + auto a = std::make_shared(element::f32, Shape{2, 2}); + auto ai = std::make_shared(element::f32, Shape{2, 2}); + auto b = std::make_shared(element::f32, Shape{2, 2}); + auto bi = std::make_shared(element::f32, Shape{2, 2}); + + auto mul = std::make_shared(ai, ai); + auto abs1 = std::make_shared(mul); + auto add = std::make_shared(bi, bi); + auto abs2 = std::make_shared(add); + { + auto body = std::make_shared(OutputVector{condition, abs1, abs2}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + loop->get_iter_value(abs2); + // abs2 result is unused + model = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{condition, abs1}, ParameterVector{ai}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + + auto loop_res = 
std::make_shared(loop->get_iter_value(abs1)); + model_ref = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + } +} + +TEST_F(TransformationTestsF, RemoveLoopMultipleDanglingResults) { + auto trip_count = std::make_shared(element::i64, Shape{}, 10); + auto condition = std::make_shared(element::boolean, Shape{}, true); + + auto a = std::make_shared(element::f32, Shape{2, 2}); + auto ai = std::make_shared(element::f32, Shape{2, 2}); + auto b = std::make_shared(element::f32, Shape{2, 2}); + auto bi = std::make_shared(element::f32, Shape{2, 2}); + + auto mul = std::make_shared(ai, bi); + auto abs1 = std::make_shared(mul); + auto add = std::make_shared(ai, bi); + auto abs2 = std::make_shared(add); + auto sub = std::make_shared(ai, bi); + auto abs3 = std::make_shared(sub); + auto div = std::make_shared(ai, bi); + auto abs4 = std::make_shared(div); + { + auto body = std::make_shared(OutputVector{condition, abs1, abs2, abs3, abs4}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + loop->get_iter_value(abs2); + auto loop_res2 = std::make_shared(loop->get_iter_value(abs3)); + loop->get_iter_value(abs4); + // abs2 and abs4 result is unused + model = std::make_shared(OutputVector{loop_res, loop_res2}, ParameterVector{a, b}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{condition, abs1, abs3}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + auto loop_res2 = std::make_shared(loop->get_iter_value(abs3)); + model_ref = 
std::make_shared(OutputVector{loop_res, loop_res2}, ParameterVector{a, b}); + } +} + +TEST_F(TransformationTestsF, RemoveLoopDanglingResultsSpecialOutPortMoved) { + auto trip_count = std::make_shared(element::i64, Shape{}, 10); + auto condition = std::make_shared(element::boolean, Shape{}, true); + + auto a = std::make_shared(element::f32, Shape{2, 2}); + auto ai = std::make_shared(element::f32, Shape{2, 2}); + auto b = std::make_shared(element::f32, Shape{2, 2}); + auto bi = std::make_shared(element::f32, Shape{2, 2}); + + auto mul = std::make_shared(ai, bi); + auto abs1 = std::make_shared(mul); + auto add = std::make_shared(ai, bi); + auto abs2 = std::make_shared(add); + { + auto body = std::make_shared(OutputVector{abs1, abs2, condition}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 2}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + loop->get_iter_value(abs2); + // abs2 result is unused + model = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{abs1, condition}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 1}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_invariant_input(bi, b); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + model_ref = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + } +} + +TEST_F(TransformationTestsF, RemoveTensorIteratorDanglingResult) { + auto X = std::make_shared(element::f32, Shape{32, 40, 10}); + auto Y = std::make_shared(element::f32, Shape{32, 40, 10}); + + auto Xi = std::make_shared(element::f32, Shape{32, 2, 10}); + auto Yi = std::make_shared(element::f32, Shape{32, 2, 10}); + auto Zo = 
std::make_shared(std::make_shared(Xi, Yi)); + auto Zo2 = std::make_shared(std::make_shared(Xi, Yi)); + { + auto body = std::make_shared(OutputVector{Zo, Zo2}, ParameterVector{Xi, Yi}); + auto tensor_iterator = std::make_shared(); + tensor_iterator->set_body(body); + tensor_iterator->set_sliced_input(Xi, X, 0, 2, 2, 39, 1); + tensor_iterator->set_sliced_input(Yi, Y, 0, 2, 2, -1, 1); + + auto out = tensor_iterator->get_iter_value(Zo, -1); + auto out2 = tensor_iterator->get_iter_value(Zo2, -1); + auto res = std::make_shared(out); + // out2 is not used + model = std::make_shared(OutputVector{res}, ParameterVector{X, Y}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{Zo}, ParameterVector{Xi, Yi}); + auto tensor_iterator = std::make_shared(); + tensor_iterator->set_body(body); + tensor_iterator->set_sliced_input(Xi, X, 0, 2, 2, 39, 1); + tensor_iterator->set_sliced_input(Yi, Y, 0, 2, 2, -1, 1); + + auto out = tensor_iterator->get_iter_value(Zo, -1); + auto res = std::make_shared(out); + model_ref = std::make_shared(OutputVector{res}, ParameterVector{X, Y}); + } +} + +TEST_F(TransformationTestsF, RemoveTensorIteratorMultipleDanglingResult) { + auto X = std::make_shared(element::f32, Shape{32, 40, 10}); + auto Y = std::make_shared(element::f32, Shape{32, 40, 10}); + + auto Xi = std::make_shared(element::f32, Shape{32, 2, 10}); + auto Yi = std::make_shared(element::f32, Shape{32, 2, 10}); + auto Zo1 = std::make_shared(std::make_shared(Xi, Yi)); + auto Zo2 = std::make_shared(std::make_shared(Xi, Yi)); + auto Zo3 = std::make_shared(std::make_shared(Xi, Yi)); + auto Zo4 = std::make_shared(std::make_shared(Xi, Yi)); + { + auto body = std::make_shared(OutputVector{Zo1, Zo2, Zo3, Zo4}, ParameterVector{Xi, Yi}); + auto tensor_iterator = std::make_shared(); + tensor_iterator->set_body(body); + tensor_iterator->set_sliced_input(Xi, X, 0, 2, 2, 39, 1); + tensor_iterator->set_sliced_input(Yi, Y, 0, 2, 2, -1, 1); + + auto out1 = 
tensor_iterator->get_iter_value(Zo1, -1); + auto out2 = tensor_iterator->get_iter_value(Zo2, -1); + auto out3 = tensor_iterator->get_iter_value(Zo3, -1); + auto out4 = tensor_iterator->get_iter_value(Zo4, -1); + // out1 and out3 is not used + model = std::make_shared(OutputVector{out2, out4}, ParameterVector{X, Y}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{Zo2, Zo4}, ParameterVector{Xi, Yi}); + auto tensor_iterator = std::make_shared(); + tensor_iterator->set_body(body); + tensor_iterator->set_sliced_input(Xi, X, 0, 2, 2, 39, 1); + tensor_iterator->set_sliced_input(Yi, Y, 0, 2, 2, -1, 1); + + auto out2 = tensor_iterator->get_iter_value(Zo2, -1); + auto out4 = tensor_iterator->get_iter_value(Zo4, -1); + model_ref = std::make_shared(OutputVector{out2, out4}, ParameterVector{X, Y}); + } +} + +TEST_F(TransformationTestsF, RemoveLoopDanglingResultsPreserveMerged) { + auto trip_count = std::make_shared(element::i64, Shape{}, 10); + auto condition = std::make_shared(element::boolean, Shape{}, true); + + auto a = std::make_shared(element::f32, Shape{2, 2}); + auto ai = std::make_shared(element::f32, Shape{2, 2}); + auto b = std::make_shared(element::f32, Shape{2, 2}); + auto bi = std::make_shared(element::f32, Shape{2, 2}); + + auto mul = std::make_shared(ai, bi); + auto abs1 = std::make_shared(mul); + auto add = std::make_shared(ai, bi); + auto abs2 = std::make_shared(add); + auto sub = std::make_shared(ai, bi); + auto abs3 = std::make_shared(sub); + { + auto body = std::make_shared(OutputVector{condition, abs1, abs2, abs3}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_merged_input(bi, b, abs3); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + loop->get_iter_value(abs2); + // abs2 result is unused + model = std::make_shared(OutputVector{loop_res}, 
ParameterVector{a, b}); + + manager.register_pass(); + } + { + auto body = std::make_shared(OutputVector{condition, abs1, abs3}, ParameterVector{ai, bi}); + auto loop = std::make_shared(trip_count, condition); + loop->set_special_body_ports({-1, 0}); + loop->set_function(body); + loop->set_invariant_input(ai, a); + loop->set_merged_input(bi, b, abs3); + + auto loop_res = std::make_shared(loop->get_iter_value(abs1)); + model_ref = std::make_shared(OutputVector{loop_res}, ParameterVector{a, b}); + } +} diff --git a/src/common/transformations/tests/common_optimizations/strides_optimization.cpp b/src/common/transformations/tests/common_optimizations/strides_optimization.cpp index c6d546101db2a6..0e598fe3112d22 100644 --- a/src/common/transformations/tests/common_optimizations/strides_optimization.cpp +++ b/src/common/transformations/tests/common_optimizations/strides_optimization.cpp @@ -264,9 +264,6 @@ TEST_F(TransformationTestsF, StridesOptimization5) { function_ref = std::make_shared(ngraph::NodeVector{conv_2}, ngraph::ParameterVector{data}); } - - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } // Pl->Conv(1x1,1x1)->Conv(1x1,2x2)->Conv(3x3,1x1)->Conv(1x1,2x2) @@ -424,8 +421,6 @@ TEST_F(TransformationTestsF, StridesOptimization7) { function_ref = std::make_shared(ngraph::NodeVector{conv_3, conv_4}, ngraph::ParameterVector{data}); } - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } // Pl--->Conv(1x1,1x1)->ReLU--->Eltwise-->Conv(1x1,2x2)-->Eltwise-->Conv(1x1, 2x2) @@ -517,8 +512,6 @@ TEST_F(TransformationTestsF, StridesOptimization8) { function_ref = std::make_shared(ngraph::NodeVector{conv_3}, ngraph::ParameterVector{data, data_2}); } - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } // Pl------->Conv(1x1,1x1)------>Eltwise------>Conv(1x1,2x2)---->Eltwise-->Conv(1x1, 2x2) @@ -636,6 +629,4 @@ TEST_F(TransformationTestsF, 
StridesOptimization9) { function_ref = std::make_shared(ngraph::NodeVector{conv_3}, ngraph::ParameterVector{data, data_2, data_3}); } - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } diff --git a/src/common/transformations/tests/offline_transformations/pruning_test.cpp b/src/common/transformations/tests/offline_transformations/pruning_test.cpp index 2d6e1cc2fbce4e..ec47d1c8eda1be 100644 --- a/src/common/transformations/tests/offline_transformations/pruning_test.cpp +++ b/src/common/transformations/tests/offline_transformations/pruning_test.cpp @@ -287,7 +287,6 @@ TEST_F(TransformationTestsF, PropagateMasksBasic) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -371,7 +370,6 @@ TEST_F(TransformationTestsF, PropagateMasksDynamicConvolution) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -599,7 +597,6 @@ TEST_F(TransformationTestsF, PropagateMaskPassThrough) { compare_masks(*getMask(max_pool->output(0)), Mask({{}, {1, 2, 3}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -768,7 +765,6 @@ TEST_F(TransformationTestsF, PropagateMasksHardDependencies) { // compare_masks(*getMask(conv2), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -915,7 +911,6 @@ TEST_F(TransformationTestsF, PropagateMasksQuantizedGroupConvolution) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1084,7 +1079,6 @@ TEST_F(TransformationTestsF, 
PropagateMasksQuantizedGroupConvolutionWithShapeOf) compare_masks(*getMask(weights_2->output(0)), Mask({{}, {0, 1, 2, 3}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1222,7 +1216,6 @@ TEST_F(TransformationTestsF, PropagateMasksFakeQuantizePerTensor) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1427,7 +1420,6 @@ TEST_F(TransformationTestsF, PropagateMasksFakeQuantizePerChannel) { compare_masks(*getMask(fq->input(4).get_source_output()), Mask({{}, {0, 1, 2, 3, 4}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1559,7 +1551,6 @@ TEST_F(TransformationTestsF, TestConcatMaskPropagation) { Mask({{}, {0, 1, 2, 3, 15, 16, 17, 18, 28, 29, 30, 31}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1707,7 +1698,6 @@ TEST_F(TransformationTestsF, TestConcatMaskPropagationUp) { Mask({{}, {0, 1, 2, 3, 15, 16, 17, 18, 28, 29, 30, 31}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1878,7 +1868,6 @@ TEST_F(TransformationTestsF, PruneConvIsClosingAndInGroup) { compare_masks(*getMask(end_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2070,7 +2059,6 @@ TEST_F(TransformationTestsF, PruneReducelayerUp) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2174,7 +2162,6 @@ TEST_F(TransformationTestsF, PruneReduceLayerDown) { compare_masks(*getMask(end_conv->output(0)), 
Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2354,7 +2341,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUp) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2467,7 +2453,6 @@ TEST_P(TransformationTestsBoolParamF, MaskPropagationReshapeUpWithShapeOf) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2579,7 +2564,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUpShapeSubGraph) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2678,7 +2662,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeExtend) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2784,7 +2767,6 @@ TEST_F(DISABLED_TransformationTestsF, MaskPropagationReshapeDownMul) { compare_masks(*getMask(last_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2889,7 +2871,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeDownAdd) { compare_masks(*getMask(last_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3054,7 +3035,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUnsqueezeUp) { compare_masks(*getMask(mul_left->output(0)), Mask({{}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); 
comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3119,7 +3099,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUnsqueezeDown) { compare_masks(*getMask(mul_left->output(0)), Mask({{}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3292,7 +3271,6 @@ TEST_F(TransformationTestsF, PruneSEBlock) { compare_masks(*getMask(end_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3395,7 +3373,6 @@ TEST_F(TransformationTestsF, PropagateMasksLinear) { compare_masks(*getMask(last_linear->output(0)), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3658,7 +3635,6 @@ TEST_F(TransformationTestsF, MaskPropagationLinearOuterDims) { compare_masks(*getMask(last_mul->output(0)), Mask({{}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3808,7 +3784,6 @@ TEST_F(TransformationTestsF, PruneMasksMatMulColsStopRowsUp) { compare_masks(*getMask(last_linear->output(0)), Mask{{}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3898,7 +3873,6 @@ TEST_F(TransformationTestsF, PruneMasksMatMulRowsStopColsUp) { compare_masks(*getMask(last_linear->output(0)), Mask{{}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4003,7 +3977,6 @@ TEST_F(TransformationTestsF, PropagateFlattenUp) { compare_masks(*getMask(linear->output(0)), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4076,7 +4049,6 @@ TEST_F(TransformationTestsF, PropagateFlattenDown) { compare_masks(*getMask(linear->output(0)), 
{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4126,7 +4098,6 @@ TEST_F(TransformationTestsF, PropagateMasksTranspose) { compare_masks(*getMask(last_mul->output(0)), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4200,7 +4171,6 @@ TEST_F(TransformationTestsF, PropagateMasksTransposeComplex) { compare_masks(*getMask(last_mul->output(0)), Mask{{}, {}, {}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4402,7 +4372,6 @@ TEST_F(DISABLED_TransformationTestsF, PropagateMasksBroadcastedEltwiseWithInputs compare_masks(*getMask(last_mul->output(0)), Mask({{}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4583,7 +4552,6 @@ TEST_F(TransformationTestsF, PropagateMasksBroadcastedEltwise) { compare_masks(*getMask(last_mul->output(0)), Mask({{}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4773,7 +4741,6 @@ TEST_F(TransformationTestsF, MaskPropagationComplexReshape) { std::string(VISUALIZE_TREE_ROOT) + "MaskPropagationComplexReshapeWithMasks.svg", modifier); } - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4966,7 +4933,6 @@ TEST_P(TransformationTestsBoolParamF, MaskPropagationReshapedPassThroughP) { manager.register_pass( std::string(VISUALIZE_TREE_ROOT) + "MaskPropagationReverseFlattenWithMasks" + postfix + ".svg", modifier); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -5032,7 +4998,6 @@ TEST_P(TransformationTestsBoolParamF, MaskPropagationBroadcastedSameRankEltwiseS compare_masks(*getMask(mul_last->output(0)), Mask{{}, {}, {}}); manager.register_pass(); - 
disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -5194,7 +5159,6 @@ TEST_F(TransformationTestsF, MaskPropagationMatMulWithSeveralOutputs) { compare_masks(*getMask(right_matmul), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } diff --git a/src/core/include/openvino/core/node_output.hpp b/src/core/include/openvino/core/node_output.hpp index 3edca19a653143..c9746e1649c5c8 100644 --- a/src/core/include/openvino/core/node_output.hpp +++ b/src/core/include/openvino/core/node_output.hpp @@ -103,6 +103,7 @@ class OPENVINO_API Output { bool operator>(const Output& other) const; bool operator<=(const Output& other) const; bool operator>=(const Output& other) const; + operator Output() const; private: std::shared_ptr m_node; diff --git a/src/core/include/openvino/op/depth_to_space.hpp b/src/core/include/openvino/op/depth_to_space.hpp index 4c60e5969b1b86..802eddbd665d4c 100644 --- a/src/core/include/openvino/op/depth_to_space.hpp +++ b/src/core/include/openvino/op/depth_to_space.hpp @@ -42,9 +42,14 @@ class OPENVINO_API DepthToSpace : public Op { DepthToSpace(const Output& data, const std::string& mode, std::size_t block_size = 1); bool visit_attributes(AttributeVisitor& visitor) override; + void set_block_size(size_t block_size); + const std::size_t& get_block_size() const { return m_blocksize; } + + void set_mode(DepthToSpaceMode mode); + DepthToSpaceMode get_mode() const { return m_mode; } diff --git a/src/core/include/openvino/op/scatter_elements_update.hpp b/src/core/include/openvino/op/scatter_elements_update.hpp index d1980f338d1c51..903b1fb9bab0cc 100644 --- a/src/core/include/openvino/op/scatter_elements_update.hpp +++ b/src/core/include/openvino/op/scatter_elements_update.hpp @@ -36,6 +36,9 @@ class OPENVINO_API ScatterElementsUpdate : public Op { bool evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const override; 
OPENVINO_SUPPRESS_DEPRECATED_END bool has_evaluate() const override; + bool evaluate_lower(TensorVector& output_values) const override; + bool evaluate_upper(TensorVector& output_values) const override; + bool evaluate_label(TensorLabelVector& output_labels) const override; private: bool evaluate_scatter_element_update(const HostTensorVector& outputs, const HostTensorVector& inputs) const; diff --git a/src/core/include/openvino/op/shuffle_channels.hpp b/src/core/include/openvino/op/shuffle_channels.hpp index 0bec03b0f36b16..0c30b85d743f62 100644 --- a/src/core/include/openvino/op/shuffle_channels.hpp +++ b/src/core/include/openvino/op/shuffle_channels.hpp @@ -35,9 +35,14 @@ class OPENVINO_API ShuffleChannels : public Op { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void set_axis(int64_t axis); + const int64_t& get_axis() const { return m_axis; } + + void set_group(int64_t group); + const int64_t& get_group() const { return m_group; } diff --git a/src/core/include/openvino/op/space_to_depth.hpp b/src/core/include/openvino/op/space_to_depth.hpp index 570b95d9b69d41..3b5515503502b6 100644 --- a/src/core/include/openvino/op/space_to_depth.hpp +++ b/src/core/include/openvino/op/space_to_depth.hpp @@ -40,9 +40,15 @@ class OPENVINO_API SpaceToDepth : public Op { SpaceToDepth(const Output& data, const std::string& mode, std::size_t block_size = 1); bool visit_attributes(AttributeVisitor& visitor) override; + + void set_block_size(size_t block_size); + const std::size_t& get_block_size() const { return m_blocksize; } + + void set_mode(SpaceToDepthMode mode); + SpaceToDepthMode get_mode() const { return m_mode; } diff --git a/src/core/include/openvino/runtime/tensor.hpp b/src/core/include/openvino/runtime/tensor.hpp index dfbf71e22db7fb..a90acfd1b66e07 100644 --- a/src/core/include/openvino/runtime/tensor.hpp +++ b/src/core/include/openvino/runtime/tensor.hpp @@ -116,6 +116,23 @@ class OPENVINO_API Tensor { */ Tensor(const 
element::Type type, const Shape& shape, void* host_ptr, const Strides& strides = {}); + /** + * @brief Constructs Tensor using port from node. Allocate internal host storage using default allocator + * @param port port from node + * @param allocator allocates memory for internal tensor storage + */ + Tensor(const ov::Output& port, const Allocator& allocator = {}); + + /** + * @brief Constructs Tensor using port from node. Wraps allocated host memory. + * @note Does not perform memory allocation internally + * @param port port from node + * @param host_ptr Pointer to pre-allocated host memory + * @param strides Optional strides parameters in bytes. Strides are supposed to be computed automatically based + * on shape and element size + */ + Tensor(const ov::Output& port, void* host_ptr, const Strides& strides = {}); + /** * @brief Constructs region of interest (ROI) tensor form another tensor. * @note Does not perform memory allocation internally @@ -143,10 +160,17 @@ class OPENVINO_API Tensor { */ Shape get_shape() const; + /** + * @brief Copy tensor, destination tensor should have the same element type and shape + * + * @param dst destination tensor + */ + void copy_to(ov::Tensor& dst) const; + /** * @brief Reports whether the tensor is continuous or not * - * @return true if blob is continuous + * @return true if tensor is continuous */ bool is_continuous() const; diff --git a/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp b/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp index f575041ba4a96f..81733924ae7b3d 100644 --- a/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp @@ -42,12 +42,12 @@ void color_convert_nv12(const T* arg_y, size_t stride_y, size_t stride_uv, ov::op::util::ConvertColorNV12Base::ColorConversion color_format) { - for (int batch = 0; batch < batch_size; batch++) { + for (size_t batch = 0; 
batch < batch_size; batch++) { T* out = out_ptr + batch * image_w * image_h * 3; auto y_ptr = arg_y + batch * stride_y; auto uv_ptr = arg_uv + batch * stride_uv; - for (int h = 0; h < image_h; h++) { - for (int w = 0; w < image_w; w++) { + for (size_t h = 0; h < image_h; h++) { + for (size_t w = 0; w < image_w; w++) { auto y_index = h * image_w + w; auto y_val = static_cast(y_ptr[y_index]); auto uv_index = (h / 2) * image_w + (w / 2) * 2; @@ -80,13 +80,13 @@ void color_convert_i420(const T* arg_y, size_t stride_y, size_t stride_uv, ov::op::util::ConvertColorI420Base::ColorConversion color_format) { - for (int batch = 0; batch < batch_size; batch++) { + for (size_t batch = 0; batch < batch_size; batch++) { T* out = out_ptr + batch * image_w * image_h * 3; auto y_ptr = arg_y + batch * stride_y; auto u_ptr = arg_u + batch * stride_uv; auto v_ptr = arg_v + batch * stride_uv; - for (int h = 0; h < image_h; h++) { - for (int w = 0; w < image_w; w++) { + for (size_t h = 0; h < image_h; h++) { + for (size_t w = 0; w < image_w; w++) { auto y_index = h * image_w + w; auto y_val = static_cast(y_ptr[y_index]); auto uv_index = (h / 2) * (image_w / 2) + (w / 2); diff --git a/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp b/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp index c4484ceab120dd..fabe70d95340b3 100644 --- a/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp @@ -46,15 +46,15 @@ void extend_with_zeros(const Strides& strides, const auto offset_batch = batch * input_size * input_shape[1]; for (size_t channel = 0; channel < input_shape[1]; ++channel) { const auto offset_channel = offset_batch + channel * input_size; - for (int i_z = 0; i_z < input_3d[0]; ++i_z) { + for (size_t i_z = 0; i_z < input_3d[0]; ++i_z) { const auto offset_i_z = i_z * input_3d[2] * input_3d[1]; - for (int i_y = 
0; i_y < input_3d[1]; ++i_y) { + for (size_t i_y = 0; i_y < input_3d[1]; ++i_y) { const auto offset_i_y = i_y * input_3d[2]; - for (int i_x = 0; i_x < input_3d[2]; ++i_x) { + for (size_t i_x = 0; i_x < input_3d[2]; ++i_x) { input_zeros.push_back(in[offset_channel + i_x + offset_i_y + offset_i_z]); if (i_x < input_3d[2] - 1) { - for (int k = 0; k < strides_3d[2] - 1; k++) { + for (size_t k = 0; k < strides_3d[2] - 1; k++) { input_zeros.push_back(0); } } diff --git a/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp b/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp index b197c110dd5bf6..76b7a6945f0617 100644 --- a/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp @@ -38,7 +38,7 @@ class referenceDetectionOutput { size_t offset; size_t numResults; size_t outTotalSize; - size_t numClasses; + int numClasses; void GetLocPredictions(const dataType* locData, std::vector& locations) { locations.resize(numImages); @@ -445,7 +445,7 @@ class referenceDetectionOutput { offset = _attrs.normalized ? 0 : 1; numPriors = priorsShape[2] / priorSize; priorsBatchSize = priorsShape[0]; - numClasses = classPredShape[1] / numPriors; + numClasses = classPredShape[1] / static_cast(numPriors); numLocClasses = _attrs.share_location ? 
1 : numClasses; numResults = outShape[2]; outTotalSize = shape_size(outShape); diff --git a/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp b/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp index cac56d76b3b4c4..3ea62e8d03304f 100644 --- a/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp @@ -109,8 +109,8 @@ void roi_align(const T* feature_maps, T sample_x = x1 + static_cast(x_bin_ind) * bin_width + sample_distance_x * (static_cast(x_sample_ind) + static_cast(0.5f)); - if (sample_x < -1.0 || sample_x > feature_map_width || sample_y < -1.0 || - sample_y > feature_map_height) { + if (sample_x < -1.0 || sample_x > static_cast(feature_map_width) || sample_y < -1.0 || + sample_y > static_cast(feature_map_height)) { // For this sample we save 4x point (0,0) with weight 0 pooling_points.insert(pooling_points.end(), 4, {0, 0}); pooling_weights.insert(pooling_weights.end(), 4, T{0}); diff --git a/src/core/shape_inference/include/batch_to_space_shape_inference.hpp b/src/core/shape_inference/include/batch_to_space_shape_inference.hpp index 5069fc23c04ce9..fb7259280796c4 100644 --- a/src/core/shape_inference/include/batch_to_space_shape_inference.hpp +++ b/src/core/shape_inference/include/batch_to_space_shape_inference.hpp @@ -15,28 +15,28 @@ namespace ov { namespace op { namespace v1 { -template -void shape_infer(const ov::op::v1::BatchToSpace* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map>& constant_data = {}) { - using ValType = typename std::iterator_traits::value_type::value_type; - NODE_VALIDATION_CHECK(op, input_shapes.size() == 4 && output_shapes.size() == 1); +template +std::vector shape_infer(const BatchToSpace* op, + const std::vector& input_shapes, + const std::map& constant_data = {}) { + using ValType = typename TShape::value_type::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 4); + 
const auto& data_shape = input_shapes[0]; const auto& block_shape = input_shapes[1]; const auto& crops_begin_shape = input_shapes[2]; const auto& crops_end_shape = input_shapes[3]; - bool got_const_data = false; auto inputs_same_ps = crops_begin_shape; - NODE_VALIDATION_CHECK(op, - T::merge_into(inputs_same_ps, crops_end_shape) && T::merge_into(inputs_same_ps, block_shape), - "block_shape, crops_begin and crops_end inputs must have the same shape. Got: ", - block_shape, - ", ", - crops_begin_shape, - " and ", - crops_end_shape); + NODE_VALIDATION_CHECK( + op, + TShape::merge_into(inputs_same_ps, crops_end_shape) && TShape::merge_into(inputs_same_ps, block_shape), + "block_shape, crops_begin and crops_end inputs must have the same shape. Got: ", + block_shape, + ", ", + crops_begin_shape, + " and ", + crops_end_shape); NODE_VALIDATION_CHECK(op, inputs_same_ps.rank().compatible(1), @@ -45,10 +45,11 @@ void shape_infer(const ov::op::v1::BatchToSpace* op, const ov::Rank data_rank = data_shape.rank(); if (data_rank.is_static()) { + constexpr size_t spatial_dim_offset = 1; NODE_VALIDATION_CHECK(op, - (data_rank.get_length() >= 2), + (data_shape.size() > spatial_dim_offset), "data input must have rank greater or equal than 2. 
Got: ", - data_rank.get_length()); + data_shape.size()); if (inputs_same_ps.is_static()) { NODE_VALIDATION_CHECK(op, data_rank.get_length() == inputs_same_ps[0].get_length(), @@ -59,60 +60,51 @@ void shape_infer(const ov::op::v1::BatchToSpace* op, data_rank); } - auto& output_shape = output_shapes[0]; - output_shape.resize(data_shape.size()); - + auto out_shape = data_shape; std::vector block_val, crops_begin_val, crops_end_val; - if (get_data_as_int64(1, op, block_val, constant_data) && - get_data_as_int64(2, op, crops_begin_val, constant_data) && - get_data_as_int64(3, op, crops_end_val, constant_data)) { - got_const_data = true; - bool block_vals_valid = std::all_of(begin(block_val), end(block_val), [](int64_t elem) { - return elem >= 1; - }); + if (get_data_as_int64(1, op, block_val, constant_data) && + get_data_as_int64(2, op, crops_begin_val, constant_data) && + get_data_as_int64(3, op, crops_end_val, constant_data)) { NODE_VALIDATION_CHECK(op, - block_vals_valid, + std::none_of(begin(block_val), end(block_val), cmp::Less(1)), "Elements of block_shape input must be greater or equal to one."); - bool crops_begin_vals_valid = std::all_of(begin(crops_begin_val), end(crops_begin_val), [](int64_t elem) { - return elem >= 0; - }); - bool crops_end_vals_valid = std::all_of(begin(crops_end_val), end(crops_end_val), [](int64_t elem) { - return elem >= 0; - }); + constexpr auto is_invalid_crop = cmp::Less(0); NODE_VALIDATION_CHECK(op, - crops_begin_vals_valid && crops_end_vals_valid, + std::none_of(begin(crops_begin_val), end(crops_begin_val), is_invalid_crop) && + std::none_of(begin(crops_end_val), end(crops_end_val), is_invalid_crop), "Elements of crops_begin and crops_end inputs must be greater or equal to zero."); - if (data_shape.is_static()) { - for (size_t idx = 0; idx < data_shape.size(); idx++) { - const bool is_valid_crops_and_shape = - crops_begin_val[idx] + crops_end_val[idx] <= block_val[idx] * data_shape[idx].get_length(); - NODE_VALIDATION_CHECK(op, - 
is_valid_crops_and_shape, - "crops_begin[i] + crops_end[i] must be less or equal to " - "block_shape[i] * input_shape[i]"); - } - } - int64_t block_prod = - std::accumulate(begin(block_val), end(block_val), int64_t(1), std::multiplies()); - const auto divisor = static_cast(block_prod); + const auto divisor = static_cast( + std::accumulate(begin(block_val), end(block_val), int64_t(1), std::multiplies())); - output_shape[0] = data_shape[0] / divisor; - check_divided_result(op, output_shape[0], data_shape[0], divisor); + out_shape[0] /= divisor; + check_divided_result(op, out_shape[0], data_shape[0], divisor); - for (size_t idx = 1; idx < data_shape.size(); idx++) { - output_shape[idx] = data_shape[idx] * static_cast(block_val[idx]) - - static_cast(crops_begin_val[idx]) - - static_cast(crops_end_val[idx]); + for (auto idx = spatial_dim_offset; idx < out_shape.size(); ++idx) { + out_shape[idx] *= static_cast(block_val[idx]); + auto crop = static_cast(crops_begin_val[idx] + crops_end_val[idx]); + NODE_VALIDATION_CHECK( + op, + out_shape[idx].is_dynamic() || crop <= out_shape[idx].get_length(), + "crops_begin[i] + crops_end[i] must be less or equal to block_shape[i] * input_shape[i]"); + + out_shape[idx] = out_shape[idx] - crop; } } + return {out_shape}; + } else { + return {PartialShape::dynamic()}; } - if (!got_const_data) - // For PartialShape, Set the output to be dynamic; - // For StaticShape, throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); +} + +template +void shape_infer(const ov::op::v1::BatchToSpace* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + output_shapes = shape_infer(op, input_shapes, constant_data); } } // namespace v1 diff --git a/src/core/shape_inference/include/depth_to_space_shape_inference.hpp b/src/core/shape_inference/include/depth_to_space_shape_inference.hpp index 
4dd03ee76e861a..bda94acd5a58ab 100644 --- a/src/core/shape_inference/include/depth_to_space_shape_inference.hpp +++ b/src/core/shape_inference/include/depth_to_space_shape_inference.hpp @@ -14,46 +14,43 @@ namespace ov { namespace op { namespace v0 { -template -void shape_infer(const ov::op::v0::DepthToSpace* op, - const std::vector& input_shapes, - std::vector& output_shapes) { - using ValType = typename std::iterator_traits::value_type::value_type; - - NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() == 1); +template +std::vector shape_infer(const DepthToSpace* op, const std::vector& input_shapes) { + using TDim = typename TShape::value_type; + using TVal = typename TDim::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1); const auto& data_shape = input_shapes[0]; - const ov::Rank data_rank = data_shape.rank(); - const auto& block_size = op->get_block_size(); - if (data_rank.is_static()) { + if (data_shape.rank().is_static()) { + static constexpr size_t spatial_dim_offset = 2; NODE_VALIDATION_CHECK(op, - data_shape.size() >= 3, + data_shape.size() > spatial_dim_offset, "The input tensor with rank lower than 3 is not supported (input rank: ", data_shape.size(), ")"); - const size_t divider = static_cast(std::pow(block_size, data_shape.size() - 2)); - NODE_VALIDATION_CHECK(op, (divider), "DepthToSpace: The divider must not be 0"); - - auto& output_shape = output_shapes[0]; - output_shape.resize(data_shape.size()); - - output_shape[0] = data_shape[0]; - const auto divisor = static_cast(divider); - output_shape[1] = data_shape[1] / divisor; - check_divided_result(op, output_shape[1], data_shape[1], divisor); - for (size_t i = 2; i < output_shape.size(); i++) { - output_shape[i] = data_shape[i] * static_cast(block_size); - } - + const auto& block_size = op->get_block_size(); + const auto divisor = static_cast(std::pow(block_size, data_shape.size() - spatial_dim_offset)); + NODE_VALIDATION_CHECK(op, divisor != 0, "DepthToSpace: 
The divisor must not be 0"); + + auto out_shape = data_shape; + out_shape[1] /= divisor; + check_divided_result(op, out_shape[1], data_shape[1], divisor); + std::for_each(out_shape.begin() + spatial_dim_offset, out_shape.end(), [&block_size](TDim& d) { + d *= static_cast(block_size); + }); + return {out_shape}; } else { - // For PartialShape, Set the output to be dynamic; - // For StaticShape, throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); + return {PartialShape::dynamic()}; } } +template +void shape_infer(const DepthToSpace* op, const std::vector& input_shapes, std::vector& output_shapes) { + output_shapes = shape_infer(op, input_shapes); +} + } // namespace v0 } // namespace op } // namespace ov diff --git a/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp b/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp index ecd9bc3f6cf13b..8db69b87edcc7a 100644 --- a/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp +++ b/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp @@ -12,33 +12,33 @@ namespace ov { namespace op { namespace v3 { -template -void shape_infer(const ScatterElementsUpdate* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map>& constant_data = {}) { - NODE_VALIDATION_CHECK(op, input_shapes.size() == 4 && output_shapes.size() == 1); +template +std::vector shape_infer(const ScatterElementsUpdate* op, + const std::vector& input_shapes, + const std::map& constant_data = {}) { + NODE_VALIDATION_CHECK(op, input_shapes.size() == 4); const auto& data_shape = input_shapes[0]; const auto& indices_shape = input_shapes[1]; const auto& updates_shape = input_shapes[2]; const auto& axis_shape = input_shapes[3]; - auto& output_shape = output_shapes[0]; - output_shape = data_shape; NODE_VALIDATION_CHECK(op, - axis_shape.compatible(T{}) || 
axis_shape.compatible(T{1}), + is_rank_compatible_any_of(axis_shape.rank(), {0, 1}), "Axis input shape are required to be scalar or 1D tensor. ", "Got: ", axis_shape); + const auto& data_rank = data_shape.rank(); + const auto& indices_rank = indices_shape.rank(); + NODE_VALIDATION_CHECK(op, - indices_shape.rank().compatible(data_shape.rank()), + indices_rank.compatible(data_rank), "Indices rank and data rank are required to be equal. ", "Got: ", - indices_shape.rank(), + indices_rank, " and: ", - data_shape.rank()); + data_rank); NODE_VALIDATION_CHECK(op, indices_shape.compatible(updates_shape), @@ -48,26 +48,20 @@ void shape_infer(const ScatterElementsUpdate* op, " and: ", updates_shape); - if (data_shape.rank().is_dynamic()) - return; - - std::vector axis_input; - if (get_data_as_int64(3, op, axis_input, constant_data)) { - auto axis = axis_input[0]; - - int64_t data_rank_length = data_shape.rank().get_length(); - NODE_VALIDATION_CHECK(op, - (-data_rank_length <= axis) && (axis <= data_rank_length - 1), - "Axis value has to be in range [-r, r-1] where r is rank of data shape. ", - " Data rank: ", - data_rank_length, - ", range:[", - -data_rank_length, - ", ", - data_rank_length - 1, - "]. 
Got axis value: ", - axis); + if (data_shape.rank().is_static()) { + if (const auto axis_input = get_input_const_data_as(op, 3, constant_data)) { + ov::normalize_axis(op, (*axis_input)[0], data_rank); + } } + return {data_shape}; +} + +template +void shape_infer(const ScatterElementsUpdate* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + output_shapes = shape_infer(op, input_shapes, constant_data); } } // namespace v3 diff --git a/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp b/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp index 31d35987a19d22..fd54069ea5c294 100644 --- a/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp +++ b/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp @@ -6,13 +6,15 @@ #include +#include "openvino/core/validation_util.hpp" + namespace ov { namespace op { namespace v0 { -template -void shape_infer(const ShuffleChannels* op, const std::vector& input_shapes, std::vector& output_shapes) { - NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() == 1); +template +std::vector shape_infer(const ShuffleChannels* op, const std::vector& input_shapes) { + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1); const auto& group = op->get_group(); NODE_VALIDATION_CHECK(op, group >= 1, "The 'group' parameter must be greater or equal to 1."); @@ -20,25 +22,31 @@ void shape_infer(const ShuffleChannels* op, const std::vector& input_shapes, const auto& input_shape = input_shapes[0]; const auto input_shape_rank = input_shape.rank(); - if (input_shape_rank.is_static()) { - const int64_t input_rank_value = static_cast(input_shape.size()); - NODE_VALIDATION_CHECK(op, input_rank_value >= 1, "The input tensor's shape is expected to be at least 1D."); + auto output_shapes = std::vector(1, input_shape); - const auto& axis = op->get_axis(); + if (input_shape_rank.is_static()) { + 
NODE_VALIDATION_CHECK(op, input_shape.size() >= 1, "The input tensor's shape is expected to be at least 1D."); + const auto axis_zb = static_cast(normalize_axis(op, op->get_axis(), input_shape_rank)); + const auto& channel_dim = input_shape[axis_zb]; NODE_VALIDATION_CHECK(op, - axis < input_rank_value && axis >= (0 - input_rank_value), - "The 'axis' parameter for ShuffleChannels has to point to one of the " - "input tensor's shape dimensions."); - size_t axis_zb = static_cast(axis >= 0 ? axis : (axis + input_rank_value)); - - if (input_shape[axis_zb].is_static()) { - const auto channel_dim_size = input_shape[axis_zb].get_length(); - NODE_VALIDATION_CHECK(op, - channel_dim_size % group == 0, - "The channel dimension size has to be a multiple of the groups parameter value."); + channel_dim.is_dynamic() || (channel_dim.get_length() % group) == 0, + "The channel dimension size has to be a multiple of the groups parameter value."); + + if (std::is_same::value) { + // overwrite channel dimension to loose label + using TDim = typename TShape::value_type; + output_shapes.front()[axis_zb] = TDim{channel_dim.get_min_length(), channel_dim.get_max_length()}; } } - output_shapes[0] = input_shape; + + return output_shapes; +} + +template +void shape_infer(const ShuffleChannels* op, + const std::vector& input_shapes, + std::vector& output_shapes) { + output_shapes = shape_infer(op, input_shapes); } } // namespace v0 diff --git a/src/core/shape_inference/include/slice_shape_inference_utils.hpp b/src/core/shape_inference/include/slice_shape_inference_utils.hpp index d78ad85a0ec40b..9b33900692b2e1 100644 --- a/src/core/shape_inference/include/slice_shape_inference_utils.hpp +++ b/src/core/shape_inference/include/slice_shape_inference_utils.hpp @@ -134,7 +134,11 @@ inline int64_t get_sliced_value(const int64_t& dim, const int64_t& start, const constexpr int64_t inf_bound = -1; const auto& norm_dim = dim == inf_bound ? 
std::numeric_limits::max() : dim; +#ifdef OPENVINO_ARCH_64_BIT const auto is_norm_dim_max = ov::internal::is_max(norm_dim); +#else + const auto is_norm_dim_max = ov::internal::is_max(size_t(norm_dim)); +#endif const int64_t lower_max = is_reverse_step ? norm_dim - 1 : norm_dim; const int64_t upper_min = is_reverse_step ? inf_bound : min_bound; diff --git a/src/core/shape_inference/include/space_to_batch_shape_inference.hpp b/src/core/shape_inference/include/space_to_batch_shape_inference.hpp index 33c9caffa22e03..792c7ddc7761d9 100644 --- a/src/core/shape_inference/include/space_to_batch_shape_inference.hpp +++ b/src/core/shape_inference/include/space_to_batch_shape_inference.hpp @@ -15,75 +15,72 @@ namespace ov { namespace op { namespace v1 { -template -void shape_infer(const ov::op::v1::SpaceToBatch* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map>& constant_data = {}) { - using ValType = typename std::iterator_traits::value_type::value_type; - NODE_VALIDATION_CHECK(op, input_shapes.size() == 4 && output_shapes.size() == 1); +template +std::vector shape_infer(const SpaceToBatch* op, + const std::vector& input_shapes, + const std::map& constant_data = {}) { + using TVal = typename TShape::value_type::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 4); const auto& data_shape = input_shapes[0]; const auto& block_shape = input_shapes[1]; const auto& pads_begin_shape = input_shapes[2]; const auto& pads_end_shape = input_shapes[3]; - const ov::Rank data_rank = data_shape.rank(); - bool got_const_data = false; auto inputs_same_ps = pads_begin_shape; - NODE_VALIDATION_CHECK(op, - T::merge_into(inputs_same_ps, pads_end_shape) && T::merge_into(inputs_same_ps, block_shape), - "block_shape, pads_begin and pads_end inputs must have the same shape. 
Got: ", - block_shape, - ", ", - pads_begin_shape, - " and ", - pads_end_shape); + NODE_VALIDATION_CHECK( + op, + TShape::merge_into(inputs_same_ps, pads_end_shape) && TShape::merge_into(inputs_same_ps, block_shape), + "block_shape, pads_begin and pads_end inputs must have the same shape. Got: ", + block_shape, + ", ", + pads_begin_shape, + " and ", + pads_end_shape); NODE_VALIDATION_CHECK(op, inputs_same_ps.rank().compatible(1), "block_shape and pads inputs must have rank 1. Got: ", inputs_same_ps.rank()); - if (data_rank.is_static()) { + if (data_shape.rank().is_static()) { + constexpr size_t spatial_dim_offset = 1; NODE_VALIDATION_CHECK(op, - (data_shape.size() >= 2), + (data_shape.size() > spatial_dim_offset), "The data tensor with rank lower than 2 is not supported (data rank: ", data_shape.size(), ")"); - std::vector block_val, pads_begin_val, pads_end_val; - - auto& output_shape = output_shapes[0]; - output_shape.resize(data_shape.size()); - if (get_data_as_int64(1, op, block_val, constant_data) && - get_data_as_int64(2, op, pads_begin_val, constant_data) && - get_data_as_int64(3, op, pads_end_val, constant_data)) { - got_const_data = true; - int64_t block_prod = - std::accumulate(begin(block_val), end(block_val), int64_t(1), std::multiplies()); + auto out_shape = data_shape; + std::vector block, pads_begin, pads_end; + if (get_data_as_int64(1, op, block, constant_data) && + get_data_as_int64(2, op, pads_begin, constant_data) && + get_data_as_int64(3, op, pads_end, constant_data)) { + TVal block_prod = std::accumulate(begin(block), end(block), 1, std::multiplies()); - output_shape[0] = data_shape[0] * static_cast(block_prod); - - for (size_t idx = 1; idx < output_shape.size(); ++idx) { - NODE_VALIDATION_CHECK(op, block_val[idx] > 0, "block_shape values must be greater than 0"); - if (data_shape[idx].is_dynamic() && data_shape[idx] == ov::Dimension::dynamic()) { - output_shape[idx] = ov::Dimension::dynamic(); - } else { - const auto divided = - 
data_shape[idx] + static_cast((pads_begin_val[idx] + pads_end_val[idx])); - const auto divisor = static_cast(block_val[idx]); - output_shape[idx] = divided / divisor; - check_divided_result(op, output_shape[idx], divided, divisor); + out_shape[0] *= block_prod; + for (auto idx = spatial_dim_offset; idx < out_shape.size(); ++idx) { + NODE_VALIDATION_CHECK(op, block[idx] > 0, "block_shape values must be greater than 0"); + if (out_shape[idx].is_static() || out_shape[idx] != Dimension::dynamic()) { + const auto padded_dim = out_shape[idx] + static_cast(pads_begin[idx] + pads_end[idx]); + const auto divisor = static_cast(block[idx]); + out_shape[idx] = padded_dim / divisor; + check_divided_result(op, out_shape[idx], padded_dim, divisor); } } } + return {out_shape}; + } else { + return {PartialShape::dynamic()}; } +} - if (!got_const_data) - // For PartialShape, Set the output to be dynamic; - // For StaticShape, throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); +template +void shape_infer(const SpaceToBatch* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + output_shapes = shape_infer(op, input_shapes, constant_data); } } // namespace v1 diff --git a/src/core/shape_inference/include/space_to_depth_shape_inference.hpp b/src/core/shape_inference/include/space_to_depth_shape_inference.hpp index 0f5b8308c27301..7fff113b4d5422 100644 --- a/src/core/shape_inference/include/space_to_depth_shape_inference.hpp +++ b/src/core/shape_inference/include/space_to_depth_shape_inference.hpp @@ -14,44 +14,44 @@ namespace ov { namespace op { namespace v0 { -template -void shape_infer(const ov::op::v0::SpaceToDepth* op, - const std::vector& input_shapes, - std::vector& output_shapes) { - using ValType = typename std::iterator_traits::value_type::value_type; - - NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() 
== 1); +template +std::vector shape_infer(const ov::op::v0::SpaceToDepth* op, const std::vector& input_shapes) { + using TVal = typename TShape::value_type::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1); const auto& data_shape = input_shapes[0]; - const ov::Rank data_rank = data_shape.rank(); - if (data_rank.is_static()) { + if (data_shape.rank().is_static()) { + static constexpr size_t spatial_dim_offset = 2; NODE_VALIDATION_CHECK(op, - !(data_shape.size() < 3), + data_shape.size() > spatial_dim_offset, "The input tensor with rank lower than 3 is not supported (input rank: ", data_shape.size(), ")"); const auto& block_size = op->get_block_size(); - NODE_VALIDATION_CHECK(op, block_size > 0, "The block size must begreater then 0 ", block_size); - const ValType multiplier = static_cast(std::pow(block_size, data_shape.size() - 2)); + NODE_VALIDATION_CHECK(op, block_size > 0, "The block size must be greater than 0 ", block_size); - auto& out_shape = output_shapes[0]; - out_shape.resize(data_shape.size()); + auto out_shape = data_shape; + out_shape[1] *= static_cast(std::pow(block_size, data_shape.size() - spatial_dim_offset)); + const auto divisor = static_cast(block_size); - out_shape[0] = data_shape[0]; - out_shape[1] = data_shape[1] * multiplier; - const auto divisor = static_cast(block_size); - for (size_t i = 2; i < out_shape.size(); i++) { - out_shape[i] = data_shape[i] / divisor; + for (auto i = spatial_dim_offset; i < out_shape.size(); ++i) { + out_shape[i] /= divisor; check_divided_result(op, out_shape[i], data_shape[i], divisor); } + return {out_shape}; } else { - // For PartialShape, Set the output to be dynamic; - // For StaticShape, will throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); + return {PartialShape::dynamic()}; } } +template +void shape_infer(const ov::op::v0::SpaceToDepth* op, + const std::vector& input_shapes, + std::vector& 
output_shapes) { + output_shapes = shape_infer(op, input_shapes); +} + } // namespace v0 } // namespace op } // namespace ov diff --git a/src/core/shape_inference/include/utils.hpp b/src/core/shape_inference/include/utils.hpp index 3c6b5a47435051..d8b85271c40943 100644 --- a/src/core/shape_inference/include/utils.hpp +++ b/src/core/shape_inference/include/utils.hpp @@ -463,17 +463,29 @@ inline bool get_data_as_shape( } } -template +/** + * @brief Check for valid quotient of dimension division. + * + * If quotient is not valid (quotient * divisor != dividend) throw NodeValidationFailure exception. + * + * @tparam TDim Type of dimension. + * + * @param op Pointer to operator. + * @param quotient Dimension result after division. + * @param dividend Original dimension. + * @param divisor Dimension divide value. + */ +template inline void check_divided_result(const ov::Node* op, - const T& res, - const T& divided, - const typename T::value_type& divisor) { + const TDim& quotient, + const TDim& dividend, + const typename TDim::value_type& divisor) { NODE_VALIDATION_CHECK(op, - res != T{}, + quotient != TDim{}, "Dimension value: [ ", - divided.get_min_length(), + dividend.get_min_length(), ", ", - divided.get_max_length(), + dividend.get_max_length(), "]", " must be a multiple of divisor: ", divisor); @@ -481,15 +493,15 @@ inline void check_divided_result(const ov::Node* op, template <> inline void check_divided_result(const ov::Node* op, - const ov::Dimension& res, - const ov::Dimension& divided, + const ov::Dimension& quotient, + const ov::Dimension& dividend, const typename ov::Dimension::value_type& divisor) { NODE_VALIDATION_CHECK(op, - !res.get_interval().empty(), + !quotient.get_interval().empty(), "Dimension value: [ ", - divided.get_min_length(), + dividend.get_min_length(), ", ", - divided.get_max_length(), + dividend.get_max_length(), "]", " must be a multiple of divisor: ", divisor); diff --git a/src/core/src/model.cpp b/src/core/src/model.cpp index 
3e3b6e047a3d0b..0621d6cfb67b8c 100644 --- a/src/core/src/model.cpp +++ b/src/core/src/model.cpp @@ -86,6 +86,12 @@ ngraph::ParameterVector auto_detect_parameters(const std::vector& verify_node(const std::shared_ptr& node) { + OPENVINO_ASSERT(node != nullptr, "Model is incorrect! Some Node equals to nullptr."); + return node; +} + } // namespace ov::Model::Model(const ResultVector& results, const ngraph::ParameterVector& parameters, const std::string& name) @@ -118,7 +124,7 @@ ov::Model::Model(const NodeVector& results, const ngraph::ParameterVector& param ov::Model::Model(const std::shared_ptr& result, const ngraph::ParameterVector& parameters, const std::string& name) - : Model(result->outputs(), parameters, name) {} + : Model(verify_node(result)->outputs(), parameters, name) {} ov::Model::Model(const ngraph::ResultVector& results, const ngraph::SinkVector& sinks, diff --git a/src/core/src/node_output.cpp b/src/core/src/node_output.cpp index d1671464a918b7..545546a3f0ac6f 100644 --- a/src/core/src/node_output.cpp +++ b/src/core/src/node_output.cpp @@ -147,6 +147,10 @@ bool Output::operator>=(const Output& other) const { return !(*this < other); } +Output::operator Output() const { + return Output(get_node(), get_index()); +} + Output::Output(const Node* node, size_t index) : m_index(index) { OPENVINO_ASSERT(node, "Cannot create ov::Output from nullptr!"); m_node = node->shared_from_this(); diff --git a/src/core/src/op/batch_to_space.cpp b/src/core/src/op/batch_to_space.cpp index 2d50c2c254860f..6541a90765b611 100644 --- a/src/core/src/op/batch_to_space.cpp +++ b/src/core/src/op/batch_to_space.cpp @@ -60,13 +60,8 @@ void op::v1::BatchToSpace::validate_and_infer_types() { "block_shape and crops inputs must have integer element type. 
Got: ", inputs_integer_et); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0), - get_input_partial_shape(1), - get_input_partial_shape(2), - get_input_partial_shape(3)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_et, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, data_et, output_shape); } std::shared_ptr ngraph::op::v1::BatchToSpace::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/op/depth_to_space.cpp b/src/core/src/op/depth_to_space.cpp index 516601a6b3e001..385824ffc56d12 100644 --- a/src/core/src/op/depth_to_space.cpp +++ b/src/core/src/op/depth_to_space.cpp @@ -14,6 +14,7 @@ #include "itt.hpp" #include "ngraph/runtime/reference/depth_to_space.hpp" #include "ngraph/shape.hpp" +#include "openvino/core/validation_util.hpp" using namespace ngraph; @@ -36,20 +37,15 @@ bool op::DepthToSpace::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr op::DepthToSpace::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v0_DepthToSpace_clone_with_new_inputs); - if (new_args.size() != 1) { - throw ngraph_error("Incorrect number of new arguments"); - } + check_new_args_count(this, new_args); return std::make_shared(new_args.at(0), m_mode, m_blocksize); } void op::DepthToSpace::validate_and_infer_types() { OV_OP_SCOPE(v0_DepthToSpace_validate_and_infer_types); - const auto& data_type = get_input_element_type(0); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, get_input_element_type(0), output_shape); } namespace { @@ -88,6 +84,14 @@ std::ostream& 
ov::operator<<(std::ostream& s, const ov::op::v0::DepthToSpace::De return s << as_string(type); } +void op::v0::DepthToSpace::set_block_size(size_t block_size) { + m_blocksize = block_size; +} + +void op::v0::DepthToSpace::set_mode(DepthToSpaceMode mode) { + m_mode = mode; +} + namespace ov { template <> NGRAPH_API EnumNames& diff --git a/src/core/src/op/scatter_elements_update.cpp b/src/core/src/op/scatter_elements_update.cpp index ff24d4a6048089..365745255332e2 100644 --- a/src/core/src/op/scatter_elements_update.cpp +++ b/src/core/src/op/scatter_elements_update.cpp @@ -6,6 +6,7 @@ #include +#include "bound_evaluate.hpp" #include "itt.hpp" #include "ngraph/op/constant.hpp" #include "ngraph/op/util/op_types.hpp" @@ -51,17 +52,9 @@ void op::v3::ScatterElementsUpdate::validate_and_infer_types() { " and: ", updates_et); - const auto& data = get_input_partial_shape(0); - const auto& indices = get_input_partial_shape(1); - const auto& updates = get_input_partial_shape(2); - const auto& axis = get_input_partial_shape(3); - - std::vector output_shapes = {ov::PartialShape()}; - std::vector input_shapes = {data, indices, updates, axis}; - - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_et, output_shapes[0]); - if (output_shapes[0].is_dynamic()) + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, data_et, output_shape); + if (output_shape.is_dynamic()) set_input_is_relevant_to_shape(0); } @@ -254,3 +247,19 @@ bool op::v3::ScatterElementsUpdate::has_evaluate() const { } return true; } + +bool op::v3::ScatterElementsUpdate::evaluate_lower(ov::TensorVector& output_values) const { + OV_OP_SCOPE(v3_ScatterNDUpdate_evaluate_lower); + return get_input_tensor(1).has_and_set_bound() && ov::default_lower_bound_evaluator(this, output_values); +} + +bool op::v3::ScatterElementsUpdate::evaluate_upper(ov::TensorVector& output_values) const { + OV_OP_SCOPE(v3_ScatterNDUpdate_evaluate_upper); + return 
get_input_tensor(1).has_and_set_bound() && ov::default_upper_bound_evaluator(this, output_values); +} + +bool op::v3::ScatterElementsUpdate::evaluate_label(TensorLabelVector& output_labels) const { + OV_OP_SCOPE(v3_ScatterNDUpdate_evaluate_label); + + return ov::default_label_evaluator(this, {0, 2}, output_labels); +} diff --git a/src/core/src/op/shuffle_channels.cpp b/src/core/src/op/shuffle_channels.cpp index 22d89c3364e458..51b057c9f7c3b3 100644 --- a/src/core/src/op/shuffle_channels.cpp +++ b/src/core/src/op/shuffle_channels.cpp @@ -15,6 +15,7 @@ #include "ngraph/runtime/reference/shuffle_channels.hpp" #include "ngraph/type/element_type.hpp" #include "ngraph/type/element_type_traits.hpp" +#include "openvino/core/validation_util.hpp" using namespace std; using namespace ngraph; @@ -34,34 +35,24 @@ bool ngraph::op::v0::ShuffleChannels::visit_attributes(AttributeVisitor& visitor } size_t op::ShuffleChannels::get_zero_based_axis() const { - if (m_axis >= 0) { - return m_axis; + const auto input_rank = get_input_partial_shape(0).rank(); + if (input_rank.is_static()) { + return ov::normalize_axis(this, m_axis, input_rank); } else { - if (!get_input_partial_shape(0).rank().is_dynamic()) { - return m_axis + get_input_partial_shape(0).rank().get_length(); - } else { - throw ngraph_error("Cannot request zero-based axis with a input of unknown rank"); - } + throw ngraph_error("Cannot request zero-based axis with a input of unknown rank"); } } void op::ShuffleChannels::validate_and_infer_types() { OV_OP_SCOPE(v0_ShuffleChannels_validate_and_infer_types); - const auto& data_type = get_input_element_type(0); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, get_input_element_type(0), output_shape); 
} shared_ptr op::ShuffleChannels::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v0_ShuffleChannels_clone_with_new_inputs); - if (new_args.size() != 1) { - throw ngraph_error("Expected 1 element in new_args for the ShuffleChannels op but got " + - std::to_string(new_args.size())); - } - + check_new_args_count(this, new_args); return make_shared(new_args.at(0), m_axis, m_group); } @@ -88,3 +79,11 @@ bool op::ShuffleChannels::has_evaluate() const { OV_OP_SCOPE(v0_ShuffleChannels_has_evaluate); return true; } + +void op::v0::ShuffleChannels::set_axis(int64_t axis) { + m_axis = axis; +} + +void op::v0::ShuffleChannels::set_group(int64_t group) { + m_group = group; +} diff --git a/src/core/src/op/space_to_batch.cpp b/src/core/src/op/space_to_batch.cpp index aeaa0c9197bf62..a36cf37c752e29 100644 --- a/src/core/src/op/space_to_batch.cpp +++ b/src/core/src/op/space_to_batch.cpp @@ -58,13 +58,8 @@ void op::v1::SpaceToBatch::validate_and_infer_types() { "pads_end must be an integral number but got (", pads_end_type, ")."); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0), - get_input_partial_shape(1), - get_input_partial_shape(2), - get_input_partial_shape(3)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, data_type, output_shape); } std::shared_ptr ngraph::op::v1::SpaceToBatch::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/op/space_to_depth.cpp b/src/core/src/op/space_to_depth.cpp index 818447e915e8c6..3faa4074f8ba27 100644 --- a/src/core/src/op/space_to_depth.cpp +++ b/src/core/src/op/space_to_depth.cpp @@ -46,11 +46,8 @@ std::shared_ptr ov::op::v0::SpaceToDepth::clone_with_new_inputs(const Outp void ngraph::op::v0::SpaceToDepth::validate_and_infer_types() { 
OV_OP_SCOPE(v0_SpaceToDepth_validate_and_infer_types); - const auto& data_type = get_input_element_type(0); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, get_input_element_type(0), output_shape); } namespace { @@ -87,6 +84,14 @@ bool ngraph::op::v0::SpaceToDepth::has_evaluate() const { return !get_input_partial_shape(0).is_dynamic(); } +void op::v0::SpaceToDepth::set_block_size(size_t block_size) { + m_blocksize = block_size; +} + +void op::v0::SpaceToDepth::set_mode(SpaceToDepthMode mode) { + m_mode = mode; +} + std::ostream& ov::operator<<(std::ostream& s, const op::v0::SpaceToDepth::SpaceToDepthMode& type) { return s << as_string(type); } diff --git a/src/core/src/runtime/ov_tensor.cpp b/src/core/src/runtime/ov_tensor.cpp index 732dfe72ed4bba..e4d46691d19e68 100644 --- a/src/core/src/runtime/ov_tensor.cpp +++ b/src/core/src/runtime/ov_tensor.cpp @@ -7,6 +7,9 @@ #include "blob_factory.hpp" // IE private header #include "ie_ngraph_utils.hpp" // IE private header #include "openvino/core/except.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/strides.hpp" +#include "openvino/runtime/remote_tensor.hpp" #include "openvino/runtime/tensor.hpp" #include "runtime/blob_allocator.hpp" #include "shape_util.hpp" @@ -94,6 +97,17 @@ Tensor::Tensor(const Tensor& owner, const Coordinate& begin, const Coordinate& e } } +Tensor::Tensor(const ov::Output& port, const Allocator& allocator) + : Tensor(port.get_element_type(), + port.get_partial_shape().is_dynamic() ? ov::Shape{0} : port.get_shape(), + allocator) {} + +Tensor::Tensor(const ov::Output& port, void* host_ptr, const Strides& byte_strides) + : Tensor(port.get_element_type(), + port.get_partial_shape().is_dynamic() ? 
ov::Shape{0} : port.get_shape(), + host_ptr, + byte_strides) {} + element::Type Tensor::get_element_type() const { OV_TENSOR_STATEMENT(return ie::details::convertPrecision(_impl->getTensorDesc().getPrecision())); } @@ -113,6 +127,128 @@ Shape Tensor::get_shape() const { OV_TENSOR_STATEMENT({ return _impl->getTensorDesc().getBlockingDesc().getBlockDims(); }); } +void Tensor::copy_to(ov::Tensor& dst) const { + OV_TENSOR_STATEMENT({ + OPENVINO_ASSERT(dst, "Destination tensor was not initialized."); + OPENVINO_ASSERT(!is(), "Default copy to doesn't support copy from remote tensor."); + OPENVINO_ASSERT(!dst.is(), "Default copy to doesn't support copy to remote tensor."); + OPENVINO_ASSERT(dst.get_element_type() == get_element_type(), + "Tensor element types are not equal. (src: ", + get_element_type(), + " != dst: ", + dst.get_element_type(), + ")"); + if (dst.get_shape() == ov::Shape{0}) + dst.set_shape(get_shape()); + OPENVINO_ASSERT(dst.get_shape() == get_shape(), + "Tensor shapes are not equal. 
(src: ", + get_shape(), + " != dst: ", + dst.get_shape(), + ")"); + const auto& shape = get_shape(); + auto* src_data = static_cast(data()); + auto* dst_data = static_cast(dst.data()); + ov::Strides src_strides{get_byte_size()}; + ov::Strides dst_strides{dst.get_byte_size()}; + ov::Shape cur_pos{0}; + ov::Shape max_pos{1}; + + if (get_element_type().bitwidth() < 8 || (get_strides() == dst.get_strides() && is_continuous())) { + // OpenVINO doesn't support strides for LP types + // or both tensors have default strides + // Strides and positions already initialized + } else { + // Tensors have default strides + const auto& type = get_element_type(); + std::vector strides(shape.size()); + if (!shape.empty()) { + strides[shape.size() - 1] = 1; + } + auto size = shape.size(); + for (size_t i = 1; i < size; i++) { + strides[size - i - 1] = strides[size - i] * shape[size - i]; + } + + ov::Strides default_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + default_strides[i] = strides[i] * type.size(); + + src_strides = get_strides(); + dst_strides = dst.get_strides(); + + ov::Strides src_str, dst_str; + + // Calculate src and dst shapes + bool found_step = false; + for (size_t i = 0; i < shape.size(); i++) { + size_t inverted_idx = shape.size() - i - 1; + if (!found_step) { + if (default_strides[inverted_idx] == src_strides[inverted_idx] && + src_strides[inverted_idx] == dst_strides[inverted_idx]) { + continue; + } else { + found_step = true; + size_t strides_size = inverted_idx + 1; + // Set right size + src_str.resize(strides_size + 1); + dst_str.resize(strides_size + 1); + max_pos.resize(strides_size + 1); + cur_pos.resize(strides_size + 1); + // In case of default continuous strides we can copy several elements + // In other case only one element + size_t dim = 1; + size_t strides = type.size(); + + if (strides_size < default_strides.size()) { + strides = default_strides[strides_size]; + dim = get_shape()[strides_size]; + } + src_str[strides_size] 
= strides; + dst_str[strides_size] = strides; + max_pos[strides_size] = dim; + cur_pos[strides_size] = 0; + } + } + src_str[inverted_idx] = src_strides[inverted_idx]; + dst_str[inverted_idx] = dst_strides[inverted_idx]; + max_pos[inverted_idx] = shape[inverted_idx]; + cur_pos[inverted_idx] = 0; + } + src_strides = src_str; + dst_strides = dst_str; + } + + const auto update_index = [](const ov::Shape& pos, const ov::Shape& shape, const ov::Strides& strides) { + size_t offset = 0; + + for (size_t i = 0; i < pos.size(); i++) { + offset += pos[i] * strides[i]; + } + return offset; + }; + + bool finish = false; + for (size_t dst_idx = 0, src_idx = 0; !finish;) { + memcpy(dst_data + dst_idx, src_data + src_idx, src_strides[src_strides.size() - 1]); + // update indexes + for (size_t i = 0; i < cur_pos.size(); i++) { + size_t inverted_idx = cur_pos.size() - i - 1; + cur_pos[inverted_idx]++; + if (cur_pos[inverted_idx] != max_pos[inverted_idx]) { + break; + } + if (inverted_idx) + cur_pos[inverted_idx] = 0; + else + finish = true; + } + src_idx = update_index(cur_pos, max_pos, src_strides); + dst_idx = update_index(cur_pos, max_pos, dst_strides); + } + }); +} + Strides Tensor::get_strides() const { OPENVINO_ASSERT(get_element_type().bitwidth() >= 8, "Could not get strides for types with bitwidths less then 8 bit. 
Tensor type: ", @@ -174,24 +310,26 @@ Tensor::operator bool() const noexcept { } bool Tensor::is_continuous() const { - if (get_element_type().bitwidth() < 8) - // OpenVINO doesn't support strides for lp types - return true; - const auto& shape = get_shape(); - const auto& type = get_element_type(); - std::vector strides(shape.size()); - if (!shape.empty()) { - strides[shape.size() - 1] = 1; - } - auto size = shape.size(); - for (size_t i = 1; i < size; i++) { - strides[size - i - 1] = strides[size - i] * shape[size - i]; - } + OV_TENSOR_STATEMENT({ + if (get_element_type().bitwidth() < 8) + // OpenVINO doesn't support strides for lp types + return true; + const auto& shape = get_shape(); + const auto& type = get_element_type(); + std::vector strides(shape.size()); + if (!shape.empty()) { + strides[shape.size() - 1] = 1; + } + auto size = shape.size(); + for (size_t i = 1; i < size; i++) { + strides[size - i - 1] = strides[size - i] * shape[size - i]; + } - ov::Strides byte_strides(strides.size()); - for (size_t i = 0; i < strides.size(); ++i) - byte_strides[i] = strides[i] * type.size(); - return byte_strides == get_strides(); + ov::Strides byte_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + byte_strides[i] = strides[i] * type.size(); + return byte_strides == get_strides(); + }); } } // namespace ov diff --git a/src/core/src/validation_util.cpp b/src/core/src/validation_util.cpp index 7b987123fd4a64..b1f96977ccce74 100644 --- a/src/core/src/validation_util.cpp +++ b/src/core/src/validation_util.cpp @@ -938,8 +938,7 @@ int64_t ov::normalize_axis(const std::string& node_description, OPENVINO_ASSERT((axis_range_min <= axis) && (axis <= axis_range_max), node_description, normalize_axis_error_msg(axis, axis_range_min, axis_range_max)); - normalize_axis_to(tensor_rank)(axis); - return axis; + return normalize(axis, tensor_rank); } void ngraph::opset1::infer_conv_backprop_auto_padding(const Shape& input_data_shape, diff --git 
a/src/core/tests/model.cpp b/src/core/tests/model.cpp index f634fef40c78ba..5bb38ead94557c 100644 --- a/src/core/tests/model.cpp +++ b/src/core/tests/model.cpp @@ -2044,6 +2044,7 @@ TEST(model, set_complex_meta_information) { TEST(model, create_model) { EXPECT_NO_THROW(ov::Model({}, "")); EXPECT_THROW(ov::Model(ov::ResultVector{nullptr}, {}, ""), ov::Exception); + EXPECT_THROW(ov::Model(nullptr, {}, ""), ov::Exception); EXPECT_NO_THROW(ov::Model(ov::ResultVector{}, ov::ParameterVector{}, "")); EXPECT_THROW(ov::Model({nullptr}, {nullptr}, {nullptr}, {nullptr}, ""), ov::Exception); EXPECT_THROW(ov::Model({nullptr}, {}, {}, {}, ""), ov::Exception); diff --git a/src/core/tests/ov_tensor_test.cpp b/src/core/tests/ov_tensor_test.cpp index 68261e854c3147..98dbf9f2383069 100644 --- a/src/core/tests/ov_tensor_test.cpp +++ b/src/core/tests/ov_tensor_test.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,7 +14,11 @@ #include "ngraph/coordinate_transform.hpp" #include "openvino/core/except.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/type/element_type_traits.hpp" +#include "openvino/op/parameter.hpp" #include "openvino/runtime/allocator.hpp" +#include "openvino/runtime/remote_tensor.hpp" #include "openvino/runtime/tensor.hpp" using OVTensorTest = ::testing::Test; @@ -40,6 +45,26 @@ TEST_F(OVTensorTest, canCreateTensor) { ASSERT_THROW(t.data(), ov::Exception); } +TEST_F(OVTensorTest, createTensorFromPort) { + auto parameter1 = std::make_shared(ov::element::f64, ov::Shape{1, 3, 2, 2}); + auto parameter2 = std::make_shared(ov::element::f32, ov::Shape{1, 3}); + auto parameter3 = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + float data[] = {5.f, 6.f, 7.f}; + ov::Tensor t1{parameter1->output(0)}; + ov::Tensor t2{parameter2->output(0), data}; + ov::Tensor t3{parameter3->output(0)}; + ov::Tensor t4{parameter3->output(0), data}; + + EXPECT_EQ(t1.get_shape(), parameter1->get_shape()); + 
EXPECT_EQ(t1.get_element_type(), parameter1->get_element_type()); + EXPECT_EQ(t2.get_shape(), parameter2->get_shape()); + EXPECT_EQ(t2.get_element_type(), parameter2->get_element_type()); + EXPECT_EQ(t3.get_shape(), ov::Shape{0}); + EXPECT_EQ(t3.get_element_type(), parameter3->get_element_type()); + EXPECT_EQ(t4.get_shape(), ov::Shape{0}); + EXPECT_EQ(t4.get_element_type(), parameter3->get_element_type()); +} + TEST_F(OVTensorTest, canAccessF16Tensor) { ov::Shape shape = {4, 3, 2}; ov::Tensor t{ov::element::f16, shape}; @@ -281,3 +306,201 @@ TEST_F(OVTensorTest, readRangeRoiBlob) { } } } + +struct TestParams { + ov::Shape src_shape; + ov::Strides src_strides; + ov::Shape dst_shape; + ov::Strides dst_strides; +}; + +struct OVTensorTestCopy : ::testing::TestWithParam> {}; + +namespace { +template +std::vector fill_data(const ov::Tensor& tensor) { + std::vector actual; + const T* data = tensor.data(); + auto strides = tensor.get_strides(); + for (auto&& c : ngraph::CoordinateTransformBasic{tensor.get_shape()}) { + actual.emplace_back( + *(data + (c[2] * strides[2] + c[1] * strides[1] + c[0] * strides[0]) / tensor.get_element_type().size())); + } + return actual; +}; +template +void compare_data(const ov::Tensor& src, const ov::Tensor& dst) { + auto source_vec = fill_data(src); + auto dest_vec = fill_data(dst); + + ASSERT_EQ(source_vec.size(), dest_vec.size()); + + for (size_t i = 0; i < source_vec.size(); i++) { + EXPECT_EQ(source_vec[i], dest_vec[i]); + } +}; + +template +void init_tensor(const ov::Tensor& tensor, bool input) { + const auto origPtr = tensor.data(); + ASSERT_NE(nullptr, origPtr); + for (size_t i = 0; i < tensor.get_size(); ++i) { + origPtr[i] = static_cast(input ? 
i : -1); + } +} + +void init_tensor(const ov::Tensor& tensor, bool input) { + switch (tensor.get_element_type()) { + case ov::element::bf16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::f16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::f32: + init_tensor::value_type>(tensor, input); + break; + case ov::element::f64: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i8: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i32: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i64: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u8: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u32: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u64: + init_tensor::value_type>(tensor, input); + break; + default: + OPENVINO_UNREACHABLE("Unsupported data type"); + } +} + +void compare_tensors(const ov::Tensor& src, const ov::Tensor& dst) { + ASSERT_EQ(src.get_byte_size(), dst.get_byte_size()); + ASSERT_EQ(src.get_shape(), dst.get_shape()); + ASSERT_EQ(src.get_element_type(), dst.get_element_type()); + switch (src.get_element_type()) { + case ov::element::bf16: + compare_data::value_type>(src, dst); + break; + case ov::element::f16: + compare_data::value_type>(src, dst); + break; + case ov::element::f32: + compare_data::value_type>(src, dst); + break; + case ov::element::f64: + compare_data::value_type>(src, dst); + break; + case ov::element::i8: + compare_data::value_type>(src, dst); + break; + case ov::element::i16: + compare_data::value_type>(src, dst); + break; + case ov::element::i32: + compare_data::value_type>(src, dst); + break; + case ov::element::i64: + compare_data::value_type>(src, dst); + break; + case ov::element::u8: + 
compare_data::value_type>(src, dst); + break; + case ov::element::u16: + compare_data::value_type>(src, dst); + break; + case ov::element::u32: + compare_data::value_type>(src, dst); + break; + case ov::element::u64: + compare_data::value_type>(src, dst); + break; + default: + OPENVINO_UNREACHABLE("Unsupported data type"); + } +} +} // namespace + +TEST_P(OVTensorTestCopy, copy_to) { + ov::element::Type type; + TestParams p; + std::tie(type, p) = GetParam(); + // Source tensors + ov::Tensor full_src_tensor; + ov::Tensor src_tensor; + if (!p.src_strides.empty()) { + full_src_tensor = ov::Tensor(type, ov::Shape{p.src_shape[0] * p.src_strides[0]}); + src_tensor = ov::Tensor(type, p.src_shape, full_src_tensor.data(), p.src_strides); + } else { + src_tensor = full_src_tensor = ov::Tensor(type, p.src_shape); + } + init_tensor(full_src_tensor, true); + + ov::Tensor full_dst_tensor; + ov::Tensor dst_tensor; + if (!p.dst_strides.empty()) { + full_dst_tensor = ov::Tensor(type, ov::Shape{p.dst_shape[0] * p.dst_strides[0]}); + dst_tensor = ov::Tensor(type, p.dst_shape, full_dst_tensor.data(), p.dst_strides); + } else { + dst_tensor = full_dst_tensor = ov::Tensor(type, p.dst_shape); + } + init_tensor(full_src_tensor, false); + + src_tensor.copy_to(dst_tensor); + compare_tensors(src_tensor, dst_tensor); +} + +// clang-format off +INSTANTIATE_TEST_SUITE_P(copy_tests, + OVTensorTestCopy, + ::testing::Combine(::testing::Values( + ov::element::bf16, + ov::element::f16, + ov::element::f32, + ov::element::f64, + ov::element::i8, + ov::element::i16, + ov::element::i32, + ov::element::i64, + ov::element::u8, + ov::element::u16, + ov::element::u32, + ov::element::u64 + ), + ::testing::Values( + TestParams { + ov::Shape{1, 3, 4, 8}, {}, + {0}, {} + }, + TestParams { + ov::Shape{3, 2, 2}, {}, + ov::Shape{3, 2, 2}, ov::Strides{128, 24, 8} + }, + TestParams { + ov::Shape{3, 2, 2}, ov::Strides{64, 16, 8}, + ov::Shape{3, 2, 2}, ov::Strides{} + }, + TestParams { + ov::Shape{3, 2, 2}, 
ov::Strides{64, 16, 8}, + ov::Shape{3, 2, 2}, ov::Strides{128, 24, 8} + } + ))); +// clang-format on diff --git a/src/core/tests/type_prop/batch_to_space.cpp b/src/core/tests/type_prop/batch_to_space.cpp index 5870283161ae7c..97a9fd57b883a2 100644 --- a/src/core/tests/type_prop/batch_to_space.cpp +++ b/src/core/tests/type_prop/batch_to_space.cpp @@ -10,7 +10,7 @@ using namespace std; using namespace ngraph; -#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +using namespace testing; namespace { constexpr size_t data_input_idx = 0; @@ -339,7 +339,7 @@ TEST(type_prop, batch_to_space_output_shape_5D) { ASSERT_EQ(batch_to_space->get_shape(), (Shape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); } -TEST(type_prop, batch_to_space_output_dynamicshape_5D_when_batch_is_static) { +TEST(type_prop, batch_to_space_output_dynamic_shape_5D_when_batch_is_static) { auto data = make_shared(element::f32, PartialShape{960, {2, 20}, {12, 14}, {100, 150}, {10, 20}}); auto block_shape = make_shared(element::i32, Shape{5}, vector{1, 6, 5, 1, 16}); auto crops_begin = make_shared(element::i32, Shape{5}, vector{0, 2, 0, 0, 0}); @@ -354,20 +354,37 @@ TEST(type_prop, batch_to_space_output_dynamicshape_5D_when_batch_is_static) { {10 * 16, 20 * 16}})); } -TEST(type_prop, batch_to_space_output_dynamicshape_5D_when_batch_is_dynamic) { - auto data = - make_shared(element::f32, PartialShape{{959, 962}, {2, 34}, {9, 21}, {100, 162}, {1, 1999}}); +TEST(type_prop, batch_to_space_output_dynamic_shape_5D_when_batch_is_dynamic) { + auto data_shape = PartialShape{{959, 962}, {2, 34}, {9, 21}, {100, 162}, {1, 1999}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); auto block_shape = make_shared(element::i32, Shape{5}, vector{1, 6, 5, 1, 16}); auto crops_begin = make_shared(element::i32, Shape{5}, vector{0, 2, 0, 0, 0}); auto crops_end = make_shared(element::i32, Shape{5}, vector{0, 2, 1, 0, 0}); auto batch_to_space = make_shared(data, block_shape, 
crops_begin, crops_end); - ASSERT_EQ(batch_to_space->get_output_partial_shape(0), - (PartialShape{{DIV_ROUND_UP(959, (6 * 5 * 16)), 962 / (6 * 5 * 16)}, + EXPECT_EQ(batch_to_space->get_output_partial_shape(0), + (PartialShape{{ceil_div(959, (6 * 5 * 16)), 962 / (6 * 5 * 16)}, {2 * 6 - 2 - 2, 34 * 6 - 2 - 2}, {9 * 5 - 1, 21 * 5 - 1}, {100, 162}, {1 * 16, 1999 * 16}})); + EXPECT_THAT(get_shape_labels(batch_to_space->get_output_partial_shape(0)), + ElementsAre(ov::no_label, ov::no_label, ov::no_label, 13, ov::no_label)); +} + +TEST(type_prop, batch_to_space_input_interval_shape_block_one) { + auto data_shape = PartialShape{{959, 962}, {2, 34}, {9, 21}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); + auto block_shape = make_shared(element::i32, Shape{3}, vector{1, 1, 1}); + auto crops_begin = make_shared(element::i32, Shape{3}, vector{0, 0, 0}); + auto crops_end = make_shared(element::i32, Shape{3}, vector{0, 0, 1}); + auto batch_to_space = make_shared(data, block_shape, crops_begin, crops_end); + + EXPECT_EQ(batch_to_space->get_output_partial_shape(0), + PartialShape({{959, 962}, {2, 34}, {9 * 1 - 1, 21 * 1 - 1}})); + EXPECT_THAT(get_shape_labels(batch_to_space->get_output_partial_shape(0)), ElementsAre(10, 11, ov::no_label)); } TEST(type_prop, batch_to_space_and_space_to_batch) { @@ -407,3 +424,20 @@ TEST(type_prop, batch_to_space_dynamic_shape_dynamic_rank) { ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_output_partial_shape(0), PartialShape::dynamic()); } + +TEST(type_prop, batch_to_space_default_ctor) { + auto data = make_shared(element::i16, Shape{100, 7, 13, 3}); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 10, 5, 1}); + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 1, 0}); + auto crops_end = make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); + + auto batch_to_space = make_shared(); + + 
batch_to_space->set_arguments(OutputVector{data, block_shape, crops_begin, crops_end}); + batch_to_space->validate_and_infer_types(); + + EXPECT_EQ(batch_to_space->get_input_size(), 4); + EXPECT_EQ(batch_to_space->get_output_size(), 1); + EXPECT_EQ(batch_to_space->get_element_type(), element::i16); + EXPECT_EQ(batch_to_space->get_shape(), (Shape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); +} diff --git a/src/core/tests/type_prop/depth_to_space.cpp b/src/core/tests/type_prop/depth_to_space.cpp index 6101fdd731392e..935730a78b9b10 100644 --- a/src/core/tests/type_prop/depth_to_space.cpp +++ b/src/core/tests/type_prop/depth_to_space.cpp @@ -8,74 +8,86 @@ using namespace std; using namespace ngraph; +using namespace testing; -#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +TEST(type_prop, depth_to_space_input_interval_shape_block_first_5D_when_depth_is_static) { + auto a_shape = PartialShape{{2, 10}, 24, {3, 7}, {423, 3000}, {235, 1345}}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); -TEST(type_prop, depth_to_space_output_dynamicshape_block_first_5D_when_depth_is_static) { - auto A = make_shared(element::f32, PartialShape{{2, 10}, 24, {3, 7}, {423, 3000}, {235, 1345}}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); - - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), + EXPECT_EQ(depth_to_space->get_output_element_type(0), element::f32); + EXPECT_EQ(depth_to_space->get_output_partial_shape(0), (PartialShape{{2, 10}, 3, {3 * 2, 7 * 2}, {423 * 2, 3000 * 2}, {235 * 2, 1345 * 2}})); + EXPECT_THAT(get_shape_labels(depth_to_space->get_output_partial_shape(0)), + ElementsAre(10, ov::no_label, ov::no_label, ov::no_label, ov::no_label)); +} + +TEST(type_prop, depth_to_space_input_interval_shape_default_block_size) { + auto a_shape = PartialShape{{2, 10}, 24, {3, 7}, {423, 3000}, {235, 1345}}; + 
set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST); + + EXPECT_EQ(depth_to_space->get_output_element_type(0), element::f32); + EXPECT_EQ(depth_to_space->get_output_partial_shape(0), a_shape); + EXPECT_THAT(get_shape_labels(depth_to_space->get_output_partial_shape(0)), ElementsAre(10, 11, 12, 13, 14)); } TEST(type_prop, depth_to_space_output_dynamicshape_block_first_5D_when_depth_is_dynamic) { auto A = make_shared(element::f32, PartialShape{{2, 10}, {81, 82}, {3, 7}, {423, 3000}, {235, 1345}}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 3); - - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), - (PartialShape{{2, 10}, - {DIV_ROUND_UP(81, 27), 82 / 27}, - {3 * 3, 7 * 3}, - {423 * 3, 3000 * 3}, - {235 * 3, 1345 * 3}})); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 3); + + ASSERT_EQ( + depth_to_space->get_output_partial_shape(0), + (PartialShape{{2, 10}, {ceil_div(81, 27), 82 / 27}, {3 * 3, 7 * 3}, {423 * 3, 3000 * 3}, {235 * 3, 1345 * 3}})); } TEST(type_prop, depth_to_space_output_shape_block_first_4D) { auto A = make_shared(element::f32, Shape{1, 128, 8, 8}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 8); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 8); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 2, 64, 64})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 2, 64, 64})); } TEST(type_prop, depth_to_space_output_shape_block_first_4D_2) { auto A = make_shared(element::f32, Shape{1, 12, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); + auto depth_to_space = make_shared(A, 
op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_output_shape_block_first_5D) { auto A = make_shared(element::f32, Shape{1, 16, 3, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_output_shape_depth_first_4D) { auto A = make_shared(element::f32, Shape{1, 12, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_output_shape_depth_first_5D) { auto A = make_shared(element::f32, Shape{1, 16, 3, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); + 
ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_input_rank_not_supported) { auto A = make_shared(element::f32, Shape{1, 8}); try { - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); FAIL() << "Not supported input shape for DepthToSpace exception not thrown"; } catch (const ngraph_error& error) { EXPECT_HAS_SUBSTRING(error.what(), "The input tensor with rank lower than 3 is not supported (input rank: 2)"); @@ -87,7 +99,7 @@ TEST(type_prop, depth_to_space_input_rank_not_supported) { TEST(type_prop, depth_to_space_blocksize_not_matched) { auto A = make_shared(element::f32, Shape{1, 7, 4, 4}); try { - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); FAIL() << "Not matched blocksize for DepthToSpace exception not thrown"; } catch (const ngraph_error& error) { EXPECT_HAS_SUBSTRING(error.what(), "Dimension value: [ 7, 7] must be a multiple of divisor: 4"); @@ -98,16 +110,34 @@ TEST(type_prop, depth_to_space_blocksize_not_matched) { TEST(type_prop, depth_to_space_dynamic_shape_static_rank) { auto A = make_shared(element::f32, PartialShape::dynamic(4)); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), PartialShape::dynamic(4)); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_output_partial_shape(0), PartialShape::dynamic(4)); } TEST(type_prop, depth_to_space_dynamic_shape_dynamic_rank) { 
auto A = make_shared(element::f32, PartialShape::dynamic()); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, "depth_first", 2); + + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_output_partial_shape(0), PartialShape::dynamic()); +} - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), PartialShape::dynamic()); +TEST(type_prop, depth_to_space_default_ctor) { + const auto a_shape = PartialShape{{2, 10}, 27, {0, 54}, {9, -1}}; + const auto A = make_shared(element::u32, a_shape); + + const auto depth_to_space = make_shared(); + depth_to_space->set_block_size(3); + depth_to_space->set_mode(op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST); + depth_to_space->set_argument(0, A); + depth_to_space->validate_and_infer_types(); + + EXPECT_EQ(depth_to_space->get_block_size(), 3); + EXPECT_EQ(depth_to_space->get_mode(), op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST); + EXPECT_EQ(depth_to_space->get_input_size(), 1); + EXPECT_EQ(depth_to_space->get_output_size(), 1); + EXPECT_EQ(depth_to_space->get_output_element_type(0), element::u32); + EXPECT_EQ(depth_to_space->get_output_partial_shape(0), (PartialShape{{2, 10}, 3, {0 * 3, 54 * 3}, {9 * 3, -1}})); } diff --git a/src/core/tests/type_prop/scatter_elements_update.cpp b/src/core/tests/type_prop/scatter_elements_update.cpp index 3e2d031242cca9..269d06de74ea16 100644 --- a/src/core/tests/type_prop/scatter_elements_update.cpp +++ b/src/core/tests/type_prop/scatter_elements_update.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common_test_utils/test_assertions.hpp" #include "gtest/gtest.h" #include "ngraph/ngraph.hpp" #include "util/type_prop.hpp" using namespace std; using namespace ngraph; +using namespace testing; TEST(type_prop, scatter_elements_update_output_shape) { Shape data_shape{2, 4, 5, 7}; @@ -28,18 
+30,37 @@ TEST(type_prop, scatter_elements_update_output_shape) { TEST(type_prop, scatter_elements_update_output_partial_dyn_shape) { PartialShape data_shape{2, Dimension::dynamic(), 5}; + set_shape_labels(data_shape, 10); PartialShape indices_shape{Dimension::dynamic(), 2, 2}; PartialShape updates_shape{2, 2, Dimension::dynamic()}; PartialShape axis_shape = PartialShape::dynamic(); - auto data = make_shared(element::f32, data_shape); + auto data = make_shared(element::f64, data_shape); auto indices = make_shared(element::i16, indices_shape); - auto updates = make_shared(element::f32, updates_shape); + auto updates = make_shared(element::f64, updates_shape); auto axis = make_shared(element::i16, axis_shape); auto scatter = make_shared(data, indices, updates, axis); - EXPECT_TRUE(scatter->get_output_partial_shape(0).same_scheme(data_shape)); + EXPECT_EQ(scatter->get_output_element_type(0), element::f64); + EXPECT_EQ(scatter->get_output_partial_shape(0), data_shape); + EXPECT_THAT(get_shape_labels(scatter->get_output_partial_shape(0)), ElementsAre(10, 11, 12)); +} + +TEST(type_prop, scatter_elements_update_data_has_interval_dimensions) { + PartialShape data_shape{{5, 10}, -1, {-1, 3}, {8, -1}}; + set_shape_labels(data_shape, 10); + + const auto data = make_shared(element::i64, data_shape); + const auto indices = make_shared(element::i16, PartialShape{1, 2, 2, {2, 3}}); + const auto updates = make_shared(element::i64, PartialShape{{0, 2}, -1, 2, -1}); + const auto axis = make_shared(element::i16, PartialShape::dynamic()); + + const auto scatter = make_shared(data, indices, updates, axis); + + EXPECT_EQ(scatter->get_output_element_type(0), element::i64); + EXPECT_EQ(scatter->get_output_partial_shape(0), data_shape); + EXPECT_THAT(get_shape_labels(scatter->get_output_partial_shape(0)), ElementsAre(10, 11, 12, 13)); } TEST(type_prop, scatter_elements_update_output_full_dyn_shape) { @@ -55,7 +76,42 @@ TEST(type_prop, scatter_elements_update_output_full_dyn_shape) { auto 
scatter = make_shared(data, indices, updates, axis); - EXPECT_TRUE(scatter->get_output_partial_shape(0).same_scheme(data_shape)); + EXPECT_EQ(scatter->get_output_element_type(0), element::f32); + EXPECT_EQ(scatter->get_output_partial_shape(0), data_shape); +} + +TEST(type_prop, scatter_elements_update_default_ctor) { + const auto data = make_shared(element::f32, PartialShape{2, 5, 5, 6}); + const auto indices = make_shared(element::i16, PartialShape{1, 2, 1, 3}); + const auto updates = make_shared(element::f32, PartialShape{1, 2, 1, 3}); + const auto axis = make_shared(element::i16, Shape{}, -4); + + const auto scatter = make_shared(data, indices, updates, axis); + scatter->set_arguments(OutputVector{data, indices, updates, axis}); + scatter->validate_and_infer_types(); + + EXPECT_EQ(scatter->get_input_size(), 4); + EXPECT_EQ(scatter->get_output_size(), 1); + EXPECT_EQ(scatter->get_output_element_type(0), element::f32); + EXPECT_EQ(scatter->get_output_partial_shape(0), PartialShape({2, 5, 5, 6})); + EXPECT_THAT(get_shape_labels(scatter->get_output_partial_shape(0)), Each(ov::no_label)); +} + +TEST(type_prop, scatter_elements_update_preserve_partial_values_and_labels_via_evaluates_bounds) { + const auto data = op::Constant::create(element::i64, Shape{4}, {2, 3, 15, 4}); + const auto indices = op::Constant::create(element::i64, Shape{2}, {3, 0}); + auto updates_shape = PartialShape{{10, 20}, {3, 4}}; + set_shape_labels(updates_shape, 20); + const auto axis = make_shared(element::i16, Shape{}, 0); + + const auto shape_of_u = std::make_shared(std::make_shared(element::i64, updates_shape)); + const auto scatter = make_shared(data, indices, shape_of_u, axis); + + auto param = std::make_shared(element::f32, PartialShape{1}); + auto bc = std::make_shared(param, scatter, op::BroadcastType::BIDIRECTIONAL); + + EXPECT_EQ(bc->get_output_partial_shape(0), PartialShape({{3, 4}, 3, 15, {10, 20}})); + EXPECT_THAT(get_shape_labels(bc->get_output_partial_shape(0)), ElementsAre(21, 
ov::no_label, ov::no_label, 20)); } TEST(type_prop, scatter_elements_update_axis_validation) { @@ -69,14 +125,9 @@ TEST(type_prop, scatter_elements_update_axis_validation) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{8}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected axis with value out of the range"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Axis value has to be in range")); - } catch (...) { - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + ov::AssertFailure, + HasSubstr("Parameter axis 8 out of the tensor rank range [-4, 3]")); } TEST(type_prop, scatter_elements_updates_indices_shape) { @@ -90,14 +141,9 @@ TEST(type_prop, scatter_elements_updates_indices_shape) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{1}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected incompatibile indices and updates shape"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Indices and updates input shapes are required to be equal")); - } catch (...) 
{ - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + NodeValidationFailure, + HasSubstr("Indices and updates input shapes are required to be equal")); } TEST(type_prop, scatter_elements_updates_indices_rank) { @@ -111,14 +157,9 @@ TEST(type_prop, scatter_elements_updates_indices_rank) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{1}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected incompatibile indices and updates shape"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Indices and updates input shapes are required to be equal")); - } catch (...) { - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + NodeValidationFailure, + HasSubstr("Indices and updates input shapes are required to be equal")); } TEST(type_prop, scatter_elements_data_indices_rank) { @@ -132,12 +173,7 @@ TEST(type_prop, scatter_elements_data_indices_rank) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{1}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected incompatibile indices and data rank"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Indices rank and data rank are required to be equal")); - } catch (...) 
{ - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + NodeValidationFailure, + HasSubstr("Indices rank and data rank are required to be equal")); } diff --git a/src/core/tests/type_prop/shuffle_channels.cpp b/src/core/tests/type_prop/shuffle_channels.cpp index ce54933ad243a1..4bd5a8bf28c380 100644 --- a/src/core/tests/type_prop/shuffle_channels.cpp +++ b/src/core/tests/type_prop/shuffle_channels.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "gtest/gtest.h" +#include "common_test_utils/test_assertions.hpp" +#include "gmock/gmock.h" #include "ngraph/ngraph.hpp" #include "util/type_prop.hpp" using namespace std; using namespace ngraph; +using namespace testing; TEST(type_prop, shuffle_channels_default_4D) { const auto data_input_shape = Shape{3, 9, 4, 5}; @@ -30,7 +32,8 @@ TEST(type_prop, shuffle_channels_basic_4D) { } TEST(type_prop, shuffle_channels_dynamic_4D) { - const auto data_input_shape = PartialShape{Dimension::dynamic(), Dimension(3, 9), 4, Dimension(4, 15)}; + auto data_input_shape = PartialShape{Dimension::dynamic(), Dimension(3, 9), 4, Dimension(4, 15)}; + set_shape_labels(data_input_shape, 10); const auto data = make_shared(element::f32, data_input_shape); const auto axis = 1; const auto group = 3; @@ -38,6 +41,7 @@ TEST(type_prop, shuffle_channels_dynamic_4D) { EXPECT_EQ(shuffle_channels->get_element_type(), element::f32); EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_input_shape); + EXPECT_THAT(get_shape_labels(shuffle_channels->get_output_partial_shape(0)), ElementsAre(10, ov::no_label, 12, 13)); } TEST(type_prop, shuffle_channels_dynamic_fully) { @@ -108,16 +112,11 @@ TEST(type_prop, shuffle_channels_ND_smaller) { } TEST(type_prop, shuffle_channels_axis_validation) { - try { - const auto data = make_shared(element::f64, Shape{1, 2, 3, 4}); - const auto shuffle_channels = make_shared(data, -5, 5); - FAIL() << 
"ShuffleChannels validation did not work. Op node was created with incorrect " - "params."; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "The 'axis' parameter for ShuffleChannels has to point to one of the " - "input tensor's shape dimensions"); - } + const auto data = make_shared(element::f64, Shape{1, 2, 3, 4}); + + OV_EXPECT_THROW(const auto op = make_shared(data, -5, 5), + ov::AssertFailure, + HasSubstr("ShuffleChannels Parameter axis -5 out of the tensor rank range [-4, 3]")); } TEST(type_prop, shuffle_channels_negative_axis_calculation) { @@ -155,24 +154,36 @@ TEST(type_prop, shuffle_channels_infer_shape_with_negative_axis_calculation) { } TEST(type_prop, shuffle_channels_invalid_input_shape) { - try { - const auto data = make_shared(element::f64, Shape{}); - const auto shuffle_channels = make_shared(data, 0, 1); - FAIL() << "ShuffleChannels validation did not work. Op node was created with incorrect " - "params."; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), "The input tensor's shape is expected to be at least 1D."); - } + const auto data = make_shared(element::f64, Shape{}); + + OV_EXPECT_THROW(const auto op = make_shared(data, 0, 1), + NodeValidationFailure, + HasSubstr("The input tensor's shape is expected to be at least 1D.")); } TEST(type_prop, shuffle_channels_invalid_groups_value) { - try { - const auto data = make_shared(element::f64, Shape{1, 2, 3, 15}); - const auto shuffle_channels = make_shared(data, -1, 2); - FAIL() << "ShuffleChannels validation did not work. 
Op node was created with incorrect " - "params."; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "The channel dimension size has to be a multiple of the groups parameter value."); - } + const auto data = make_shared(element::f64, Shape{1, 2, 3, 15}); + + OV_EXPECT_THROW(const auto op = make_shared(data, -1, 2), + NodeValidationFailure, + HasSubstr("The channel dimension size has to be a multiple of the groups parameter value.")); +} + +TEST(type_prop, shuffle_channels_default_ctor) { + const auto data_shape = PartialShape{{2, 5}, {0, 2}, 3, {2, -1}}; + const auto data = make_shared(element::i32, data_shape); + + const auto shuffle_channels = make_shared(); + shuffle_channels->set_axis(-3); + shuffle_channels->set_group(3); + shuffle_channels->set_argument(0, data); + shuffle_channels->validate_and_infer_types(); + + EXPECT_EQ(shuffle_channels->get_axis(), -3); + EXPECT_EQ(shuffle_channels->get_zero_based_axis(), 1); + EXPECT_EQ(shuffle_channels->get_group(), 3); + EXPECT_EQ(shuffle_channels->get_input_size(), 1); + EXPECT_EQ(shuffle_channels->get_output_size(), 1); + EXPECT_EQ(shuffle_channels->get_element_type(), element::i32); + EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_shape); } diff --git a/src/core/tests/type_prop/space_to_batch.cpp b/src/core/tests/type_prop/space_to_batch.cpp index 682d71363cedce..b3d5b9bcbb2a89 100644 --- a/src/core/tests/type_prop/space_to_batch.cpp +++ b/src/core/tests/type_prop/space_to_batch.cpp @@ -2,12 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "gtest/gtest.h" +#include "gmock/gmock.h" #include "ngraph/ngraph.hpp" #include "util/type_prop.hpp" using namespace std; using namespace ngraph; +using namespace testing; #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) @@ -65,31 +66,52 @@ TEST(type_prop, space_to_batch_and_batch_to_space) { } TEST(type_prop, space_to_batch_when_space_is_static) { - auto data = make_shared(element::f32, PartialShape{{2, 5}, 100, 
1024, 3}); + auto data_shape = PartialShape{{2, 5}, 100, 1024, 3}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 12, 100, 2}); auto pads_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 38, 1}); auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 5, 38, 0}); auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); - ASSERT_EQ( + EXPECT_EQ( space_to_batch->get_output_partial_shape(0), (PartialShape{{2 * 12 * 100 * 2, 5 * 12 * 100 * 2}, (100 + 3 + 5) / 12, (1024 + 38 + 38) / 100, (3 + 1) / 2})); + EXPECT_THAT(get_shape_labels(space_to_batch->get_output_partial_shape(0)), Each(ov::no_label)); +} + +TEST(type_prop, space_to_batch_when_data_dynamic_) { + auto data_shape = PartialShape{{2, 5}, {5, 100}, {100, 1024}, {3, 10}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 1, 1, 1}); + auto pads_begin = make_shared(element::i64, Shape{4}, vector{1, 0, 2, 0}); + auto pads_end = make_shared(element::i64, Shape{4}, vector{1, 0, 3, 0}); + + auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); + + EXPECT_EQ(space_to_batch->get_output_partial_shape(0), + PartialShape({{2, 5}, {5, 100}, {(100 + 2 + 3) / 1, (1024 + 2 + 3) / 1}, {3, 10}})); + EXPECT_THAT(get_shape_labels(space_to_batch->get_output_partial_shape(0)), ElementsAre(10, 11, ov::no_label, 13)); } TEST(type_prop, space_to_batch_when_space_is_dynamic) { - auto data = make_shared(element::f32, PartialShape{{2, 5}, {5, 100}, {100, 1024}, {3, 10}}); + auto data_shape = PartialShape{{2, 5}, {5, 100}, {100, 1024}, {3, 10}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 12, 100, 2}); auto pads_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 
38, 1}); auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 5, 38, 0}); auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); - ASSERT_EQ(space_to_batch->get_output_partial_shape(0), + EXPECT_EQ(space_to_batch->get_output_partial_shape(0), (PartialShape{{2 * 12 * 100 * 2, 5 * 12 * 100 * 2}, {DIV_ROUND_UP((5 + 5 + 3), 12), (100 + 5 + 3) / 12}, {DIV_ROUND_UP((100 + 38 + 38), 100), (1024 + 38 + 38) / 100}, {DIV_ROUND_UP((3 + 1), 2), (10 + 1) / 2}})); + EXPECT_THAT(get_shape_labels(space_to_batch->get_output_partial_shape(0)), Each(ov::no_label)); } TEST(type_prop, space_to_batch_dynamic_shape_static_rank) { @@ -116,6 +138,35 @@ TEST(type_prop, space_to_batch_dynamic_shape_dynamic_rank) { ASSERT_EQ(space_to_batch->get_output_partial_shape(0), PartialShape::dynamic()); } +TEST(type_prop, space_to_batch_dynamic_rank_shape_block_and_pads_not_const) { + auto data = make_shared(element::f32, PartialShape::dynamic()); + auto block_shape = make_shared(element::i64, Shape{4}); + auto pads_begin = make_shared(element::i64, Shape{4}); + auto pads_end = make_shared(element::i64, Shape{4}); + + auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); + + ASSERT_EQ(space_to_batch->get_element_type(), element::f32); + ASSERT_EQ(space_to_batch->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(type_prop, space_to_batch_default_ctor) { + auto data = make_shared(element::f32, PartialShape{{2, 5}, 100, {100, 1024}, 3}); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 2, 4, 1}); + auto pads_begin = make_shared(element::i64, Shape{4}, vector{1, 1, 2, 0}); + auto pads_end = make_shared(element::i64, Shape{4}, vector{1, 1, 6, 0}); + + auto space_to_batch = make_shared(); + space_to_batch->set_arguments(OutputVector{data, block_shape, pads_begin, pads_end}); + space_to_batch->validate_and_infer_types(); + + EXPECT_EQ(space_to_batch->get_input_size(), 4); + EXPECT_EQ(space_to_batch->get_output_size(), 1); + 
EXPECT_EQ(space_to_batch->get_output_element_type(0), element::f32); + EXPECT_EQ(space_to_batch->get_output_partial_shape(0), + PartialShape({{2 * 2 * 4, 5 * 2 * 4}, (100 + 2) / 2, {(100 + 2 + 6) / 4, (1024 + 2 + 6) / 4}, 3})); +} + TEST(type_prop, space_to_batch_invalid_element_type_block_shape) { auto data = make_shared(element::f32, Shape{2, 128}); auto block_shape = make_shared(element::f32, Shape{2}, vector{1, 5}); diff --git a/src/core/tests/type_prop/space_to_depth.cpp b/src/core/tests/type_prop/space_to_depth.cpp index e20131500b321f..c190356a9c855f 100644 --- a/src/core/tests/type_prop/space_to_depth.cpp +++ b/src/core/tests/type_prop/space_to_depth.cpp @@ -8,6 +8,7 @@ using namespace std; using namespace ngraph; +using namespace testing; #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) @@ -47,25 +48,45 @@ TEST(type_prop, space_to_depth_output_shape_depth_first_5D) { ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); } +TEST(type_prop, space_to_depth_output_shape_depth_first_5D_1) { + auto a_shape = PartialShape{{1, 4}, {12, 36}, 1080, 1616}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); + const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; + auto space_to_depth = make_shared(A, mode, 1); + + EXPECT_EQ(space_to_depth->get_element_type(), element::f32); + EXPECT_EQ(space_to_depth->get_output_partial_shape(0), a_shape); + EXPECT_THAT(get_shape_labels(space_to_depth->get_output_partial_shape(0)), ElementsAre(10, 11, 12, 13)); +} + TEST(type_prop, space_to_depth_output_shape_when_space_is_static) { - auto A = make_shared(element::f32, PartialShape{{1, 4}, {12, 36}, 1080, 1616}); + auto a_shape = PartialShape{{1, 4}, {12, 36}, 1080, 1616}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; auto space_to_depth = make_shared(A, mode, 2); - 
ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), + EXPECT_EQ(space_to_depth->get_element_type(), element::f32); + EXPECT_EQ(space_to_depth->get_output_partial_shape(0), (PartialShape{{1, 4}, {12 * 4, 36 * 4}, 1080 / 2, 1616 / 2})); + EXPECT_THAT(get_shape_labels(space_to_depth->get_output_partial_shape(0)), + ElementsAre(10, ov::no_label, ov::no_label, ov::no_label)); } TEST(type_prop, space_to_depth_output_shape_when_space_is_dynamic) { - auto A = make_shared(element::f32, PartialShape{{1, 4}, {12, 36}, {100, 1081}, {99, 1616}}); + auto a_shape = PartialShape{{1, 4}, {12, 36}, {100, 1081}, {99, 1616}}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; auto space_to_depth = make_shared(A, mode, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ( + EXPECT_EQ(space_to_depth->get_element_type(), element::f32); + EXPECT_EQ( space_to_depth->get_output_partial_shape(0), (PartialShape{{1, 4}, {12 * 4, 36 * 4}, {DIV_ROUND_UP(100, 2), 1081 / 2}, {DIV_ROUND_UP(99, 2), 1616 / 2}})); + EXPECT_THAT(get_shape_labels(space_to_depth->get_output_partial_shape(0)), + ElementsAre(10, ov::no_label, ov::no_label, ov::no_label)); } TEST(type_prop, space_to_depth_dynamic_shape_static_rank) { @@ -86,6 +107,23 @@ TEST(type_prop, space_to_depth_dynamic_shape_dynamic_rank) { ASSERT_EQ(space_to_depth->get_output_partial_shape(0), PartialShape::dynamic()); } +TEST(type_prop, space_to_depth_default_ctor) { + auto A = make_shared(element::f64, PartialShape{{1, 4}, {12, 36}, 900, 3}); + + const auto space_to_depth = make_shared(); + space_to_depth->set_block_size(3); + space_to_depth->set_mode(op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST); + space_to_depth->set_argument(0, A); + space_to_depth->validate_and_infer_types(); + + EXPECT_EQ(space_to_depth->get_block_size(), 3); + 
EXPECT_EQ(space_to_depth->get_mode(), op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST); + EXPECT_EQ(space_to_depth->get_input_size(), 1); + EXPECT_EQ(space_to_depth->get_output_size(), 1); + EXPECT_EQ(space_to_depth->get_element_type(), element::f64); + EXPECT_EQ(space_to_depth->get_output_partial_shape(0), (PartialShape{{1, 4}, {12 * 9, 36 * 9}, 900 / 3, 3 / 3})); +} + TEST(type_prop, space_to_depth_input_rank_not_supported) { auto A = make_shared(element::f32, Shape{1, 8}); try { diff --git a/src/core/tests/type_prop/tile.cpp b/src/core/tests/type_prop/tile.cpp index db73b27a37f226..bb97e30ef52e02 100644 --- a/src/core/tests/type_prop/tile.cpp +++ b/src/core/tests/type_prop/tile.cpp @@ -152,12 +152,11 @@ class TileTest : public TypePropTileTest, public WithParamInterface= 0) { - repeats.insert(repeats.begin(), size_diff, 1); + if (labels.size() > repeats.size()) { + repeats.insert(repeats.begin(), labels.size() - repeats.size(), 1); } else { - labels.insert(labels.begin(), -size_diff, ov::no_label); + labels.insert(labels.begin(), repeats.size() - labels.size(), ov::no_label); } std::transform(labels.begin(), diff --git a/src/frontends/CMakeLists.txt b/src/frontends/CMakeLists.txt index b1ace92f3279b2..41902d02d24b4e 100644 --- a/src/frontends/CMakeLists.txt +++ b/src/frontends/CMakeLists.txt @@ -34,4 +34,4 @@ endif() if (ENABLE_OV_TF_LITE_FRONTEND) add_subdirectory(tensorflow_lite) -endif() \ No newline at end of file +endif() diff --git a/src/frontends/common/include/openvino/frontend/manager.hpp b/src/frontends/common/include/openvino/frontend/manager.hpp index 161d37ced06103..4968ef8bbf62e4 100644 --- a/src/frontends/common/include/openvino/frontend/manager.hpp +++ b/src/frontends/common/include/openvino/frontend/manager.hpp @@ -14,7 +14,7 @@ namespace ov { // Forward declaration -void FRONTEND_API shutdown(); +FRONTEND_API void shutdown(); namespace frontend { // -------------- FrontEndManager ----------------- using FrontEndFactory = std::function; diff 
--git a/src/frontends/paddle/include/openvino/frontend/paddle/frontend.hpp b/src/frontends/paddle/include/openvino/frontend/paddle/frontend.hpp index 103ad5b73228ec..6a804c2b24b012 100644 --- a/src/frontends/paddle/include/openvino/frontend/paddle/frontend.hpp +++ b/src/frontends/paddle/include/openvino/frontend/paddle/frontend.hpp @@ -74,6 +74,7 @@ class PADDLE_API FrontEnd : public ov::frontend::FrontEnd { protected: void try_remove_internal_ops(const std::vector>& models) const; + void fuse_fakequantize_ops(const std::vector>& models) const; static std::vector> convert_each_node( const std::shared_ptr& frontend_model, diff --git a/src/frontends/paddle/src/frontend.cpp b/src/frontends/paddle/src/frontend.cpp index 2a7f22e36fe7e5..ecfe370f9b233d 100644 --- a/src/frontends/paddle/src/frontend.cpp +++ b/src/frontends/paddle/src/frontend.cpp @@ -15,6 +15,7 @@ #include "default_opset.hpp" #include "framework.pb.h" #include "input_model.hpp" +#include "internal/pass/transform_fakequantize.hpp" #include "internal/pass/transform_if.hpp" #include "internal/pass/transform_tensorarray.hpp" #include "internal/pass/transform_while.hpp" @@ -336,6 +337,18 @@ void FrontEnd::try_remove_internal_ops(const std::vector> } } +void FrontEnd::fuse_fakequantize_ops(const std::vector>& models) const { + for (auto& model : models) { + ov::pass::Manager manager; + manager.register_pass(); + manager.run_passes(model); + } + if (models.size() > 0) { + // revalidate as child models are transformed after parent models. + models[0]->validate_nodes_and_infer_types(); + } +} + bool FrontEnd::supported_impl(const std::vector& variants) const { // FrontEnd can only load model specified by one path, one file or two files. 
if (variants.empty() || variants.size() > 2) @@ -430,6 +443,7 @@ std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) const return paddle::make_ng_node(nodes_dict, op_place, m_op_translators); }); + fuse_fakequantize_ops(f); try_remove_internal_ops(f); return f[0]; } @@ -444,6 +458,7 @@ void FrontEnd::convert(const std::shared_ptr& partiallyConverted) con result->validate_and_infer_types(); } + fuse_fakequantize_ops({partiallyConverted}); try_remove_internal_ops({partiallyConverted}); } @@ -475,6 +490,7 @@ std::shared_ptr FrontEnd::convert_partially(const InputModel::Ptr& mo return named_outputs; }); + fuse_fakequantize_ops(f); try_remove_internal_ops(f); return f[0]; diff --git a/src/frontends/paddle/src/input_model.cpp b/src/frontends/paddle/src/input_model.cpp index 2600001f222e37..9cd0d64702112f 100644 --- a/src/frontends/paddle/src/input_model.cpp +++ b/src/frontends/paddle/src/input_model.cpp @@ -156,12 +156,6 @@ void InputModel::InputModelImpl::loadPlaces() { namespace { bool read_tensor(std::istream& is, char* data, size_t len) { - std::vector header(16); - is.read(&header[0], 16); - uint32_t dims_len = 0; - is.read(reinterpret_cast(&dims_len), 4); - std::vector dims_struct(dims_len); - is.read(&dims_struct[0], dims_len); is.read(data, len); return (size_t)is.gcount() == len; } @@ -277,9 +271,34 @@ void InputModel::InputModelImpl::loadConsts(const std::basic_string& folder_w continue; FRONT_END_GENERAL_CHECK(var_desc.type().type() == ::paddle::framework::proto::VarType::LOD_TENSOR); - const auto& tensor = var_desc.type().lod_tensor().tensor(); - Shape shape(tensor.dims().cbegin(), tensor.dims().cend()); - const auto& type = TYPE_MAP[tensor.data_type()]; + /* + reference: + https://github.com/PaddlePaddle/Paddle2ONNX/blob/c14446437041a0aa3572994d085b7a35c5b0985c/paddle2onnx/parser/parser.cc#L261 + When deserialize the proto, the header of each weight + [ 4 byte ] -- version(not need) + [ 8 byte ] -- lod_level(not need) + [ 4 byte ] -- version(not 
need) + [ 4 byte ] -- TensorDesc size + [ x byte ... ] -- TensorDesc + [ y byte ... ] -- weight + */ + { + const size_t header_size = 16; + std::vector header(header_size); + weight_stream->read(&header[0], header_size); + } + + int32_t size; + weight_stream->read(reinterpret_cast(&size), sizeof(size)); + + std::unique_ptr buf(new char[size]); + weight_stream->read(reinterpret_cast(buf.get()), size); + + std::unique_ptr<::paddle::framework::proto::VarType_TensorDesc> tensor_desc( + new ::paddle::framework::proto::VarType_TensorDesc()); + tensor_desc->ParseFromArray(buf.get(), size); + Shape shape(tensor_desc->dims().cbegin(), tensor_desc->dims().cend()); + const auto& type = TYPE_MAP[tensor_desc->data_type()]; const auto& data_length = shape_size(shape) * type.size(); std::vector tensor_data(data_length); diff --git a/src/frontends/paddle/src/internal/pass/transform_fakequantize.cpp b/src/frontends/paddle/src/internal/pass/transform_fakequantize.cpp new file mode 100644 index 00000000000000..3aa363a06e43f4 --- /dev/null +++ b/src/frontends/paddle/src/internal/pass/transform_fakequantize.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "internal/pass/transform_fakequantize.hpp" + +#include +#include +#include +#include +#include +#include + +#include "default_opset.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "transformations/utils/utils.hpp" + +using namespace ov::frontend::paddle::op::default_opset; +using namespace ov; +using namespace ov::pass; +using namespace ov::frontend::paddle::op; + +/* + zero_point + / + input convert scale + \ / / + subtract Multiply + quantize_linear ==>> \ / + Divide + \ + Round + \ + Clamp + \ + _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ => FakeQuantize + \ zero_point + \ / + Convert Convert scale + dequantize_linear ==>> \ / / + Subtract Multiply + \ / + Multiply +*/ 
+ov::frontend::paddle::pass::TransformFakeQuantize::TransformFakeQuantize() { + const auto input_label = ngraph::pattern::any_input(); + const auto q_zp_label = ngraph::pattern::any_input(); + // quantize phase + const auto q_zp_cvt_label = ngraph::pattern::wrap_type({q_zp_label}); + const auto q_sub_label = ngraph::pattern::wrap_type({input_label, q_zp_cvt_label}); + const auto q_real_scale_label = ngraph::pattern::wrap_type(); + const auto div_label = ngraph::pattern::wrap_type({q_sub_label, q_real_scale_label}); + const auto round_label = ngraph::pattern::wrap_type({div_label}); + const auto q_clamp_label = ngraph::pattern::wrap_type({round_label}); + // dequantize phase + const auto dq_cvt_label = ngraph::pattern::wrap_type({q_clamp_label}); + const auto dq_zp_label = ngraph::pattern::any_input(); + const auto dq_zp_cvt_label = ngraph::pattern::wrap_type({dq_zp_label}); + const auto dq_sub_label = ngraph::pattern::wrap_type({dq_cvt_label, dq_zp_cvt_label}); + const auto dq_real_scale_label = ngraph::pattern::wrap_type(); + const auto output_label = ngraph::pattern::wrap_type({dq_sub_label, dq_real_scale_label}); + + matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) -> bool { + const auto& opsMap = m.get_pattern_value_map(); + if (transformation_callback(m.get_match_root())) { + return false; + } + // get the input + const auto& sub_node = opsMap.at(q_sub_label).get_node_shared_ptr(); + if (!sub_node->get_input_node_shared_ptr(0)) { + return false; + } + const auto& input_item = sub_node->get_input_source_output(0); + + // prepare for replace + const auto& output_node = opsMap.at(output_label).get_node_shared_ptr(); + + // check round mode + // Fallback to the PDPD FE if the round_mode is HALF_AWAY_FROM_ZERO. 
+ const auto& round_node_cast = std::dynamic_pointer_cast(opsMap.at(round_label).get_node_shared_ptr()); + if (!round_node_cast || round_node_cast->get_mode() != Round::RoundMode::HALF_TO_EVEN) { + return false; + } + + // check quantize_linear zero_point + auto zp_node_cast = std::dynamic_pointer_cast(opsMap.at(dq_zp_label).get_node_shared_ptr()); + float zp; + if (!zp_node_cast || !ov::op::util::get_single_value(zp_node_cast, zp)) { + return false; + } + + // prepare levels + const auto& clamp_node_cast = std::dynamic_pointer_cast(opsMap.at(q_clamp_label).get_node_shared_ptr()); + if (!clamp_node_cast) { + return false; + } + const auto high_range = static_cast(clamp_node_cast->get_max()); + const auto low_range = static_cast(clamp_node_cast->get_min()); + const auto levels = high_range - low_range + 1; + + // get the scale + const auto& scale_node_cast = std::dynamic_pointer_cast( + opsMap.at(q_real_scale_label).get_node_shared_ptr()->get_input_node_shared_ptr(0)); + float scale; + if (!scale_node_cast || !ov::op::util::get_single_value(scale_node_cast, scale)) { + return false; + } + // The PaddleSlim scale value is not equal to scale definition in OpenVINO. + // scale_ov = scale_pdpd / half_range. + const auto real_scale = scale / high_range; + + // calculate the input_low/input_high/output_low/output_high + // In order to reduce the imported nodes, try to achieve the value from the Constant. 
+ // The formula: + // i8: which is used in PDPD + // low = (-128 - zero_point) * scale + // high = (127 - zero_point) * scale + // u8: which is not used in PDPD + // low = (0 - zero_point) * scale + // high = (255 - zero_point) * scale + const auto limit_low = std::make_shared(element::f32, Shape{1}, (low_range - zp) * real_scale); + const auto limit_high = std::make_shared(element::f32, Shape{1}, (high_range - zp) * real_scale); + + auto fake_node = + std::make_shared(input_item, limit_low, limit_high, limit_low, limit_high, levels); + fake_node->set_friendly_name(output_node->get_friendly_name()); + replace_node(output_node, fake_node); + return true; + }; + auto m = std::make_shared(output_label, "TransformFakeQuantize"); + this->register_matcher(m, callback); +} diff --git a/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp b/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp new file mode 100644 index 00000000000000..6d45edd8ea818a --- /dev/null +++ b/src/frontends/paddle/src/internal/pass/transform_fakequantize.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace frontend { +namespace paddle { +namespace pass { + +class TransformFakeQuantize : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ov::frontend::paddle::pass::TransformFakeQuantize"); + TransformFakeQuantize(); + +private: +}; + +} // namespace pass +} // namespace paddle +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/paddle/src/op/dequantize_linear.cpp b/src/frontends/paddle/src/op/dequantize_linear.cpp new file mode 100644 index 00000000000000..271b938c17ab43 --- /dev/null +++ b/src/frontends/paddle/src/op/dequantize_linear.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#include "default_opset.hpp" +#include "openvino/frontend/paddle/node_context.hpp" + +namespace ov { +namespace frontend { +namespace paddle { +namespace op { +NamedOutputs dequantize_linear(const NodeContext& node) { + // extract the INPUTS + const auto x = node.get_input("X"); + const auto scale = node.get_input("Scale"); + const auto zero_point = node.get_input("ZeroPoint"); + + // assert shape of scale and zero_point + const auto& scale_shape = scale.get_partial_shape(); + PADDLE_OP_CHECK(node, scale.get_partial_shape().rank().is_static(), "dequantize_linear scale rank must be static."); + const auto& scale_shape_length = scale.get_partial_shape().rank().get_length(); + + if (scale_shape_length == 1) { + PADDLE_OP_CHECK(node, + scale.get_partial_shape() == zero_point.get_partial_shape(), + "dequantize_linear shape of scale and zero_point doesn't match."); + } else if (scale_shape_length == 2) { + PADDLE_OP_CHECK(node, + scale.get_partial_shape()[1] == zero_point.get_partial_shape()[0], + "dequantize_linear shape of scale and zero_point doesn't match."); + } else { + PADDLE_OP_CHECK(node, false, "dims of scale should not be greater than 2."); + } + + const auto bit_length = node.get_attribute("bit_length"); + const auto range = (1 << (bit_length - 1)) - 1; + const auto range_node = std::make_shared(element::f32, Shape{1}, (1.0 / range)); + const auto real_scale = std::make_shared(scale, range_node); + + auto q_node = std::make_shared(x, element::f32); + // extract the ATTRIBUTES and explaination for quant_axis: + // / [-1] --- per-tensor, scale is always 1-D + // quant_axis - [0 or 1] --- per-channel, scale may be 1-D or 2-D, needing to reshape for input shape. 
+ // \ [others] --- unsupported + auto quant_axis = node.get_attribute("quant_axis"); + std::vector quant_axis_range{-1, 0, 1}; + PADDLE_OP_CHECK(node, + std::any_of(quant_axis_range.begin(), + quant_axis_range.end(), + [&quant_axis](int32_t value) { + return quant_axis == value; + }), + "dequantize_linear quant_axis is NOT in the range of [-1, 0, 1]."); + if (quant_axis == -1) { + const auto zp_node = std::make_shared(zero_point, element::f32); + const auto out_node = + std::make_shared(std::make_shared(q_node, zp_node), + real_scale); + return node.default_single_output_mapping({out_node}, {"Y"}); + } else { + // But for per-channel scenario, the shape of scale is NOT stable. + // Sometimes scale is 1-D and sometimes scale is 2-D. But the last dim(e.g. s[len-1]) really makes sense. + // Let's prepare a pattern to reshape operation according to the scale shape. + std::vector reshape_pattern(x.get_partial_shape().rank().get_length(), 1); + reshape_pattern.at(quant_axis) = scale_shape[scale_shape_length - 1].get_length(); + const auto reshape_node = + std::make_shared(element::i32, Shape{reshape_pattern.size()}, reshape_pattern); + const auto reshape_scale = std::make_shared(real_scale, reshape_node, true); + const auto zp_node = std::make_shared( + std::make_shared(zero_point, reshape_node, true), + element::f32); + const auto out_node = + std::make_shared(std::make_shared(q_node, zp_node), + reshape_scale); + return node.default_single_output_mapping({out_node}, {"Y"}); + } +} + +} // namespace op +} // namespace paddle +} // namespace frontend +} // namespace ov diff --git a/src/frontends/paddle/src/op/quantize_linear.cpp b/src/frontends/paddle/src/op/quantize_linear.cpp new file mode 100644 index 00000000000000..794325dd35f308 --- /dev/null +++ b/src/frontends/paddle/src/op/quantize_linear.cpp @@ -0,0 +1,76 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "default_opset.hpp" +#include 
"openvino/core/validation_util.hpp" +#include "openvino/frontend/paddle/node_context.hpp" +#include "openvino/opsets/opset6.hpp" + +namespace ov { +namespace frontend { +namespace paddle { +namespace op { +/* + quantize_linear: + INT ---------------------------> FLOAT + [scale, zero_point] + + zero_point is always equal to zero. + + / [-1] --- per-tensor + quant_axis - [0 or 1] --- per-channel, expand 1-D tensor to match the input + \ [others] --- unsupported + + / [0] --- rounding to nearest ties to even + round_type - [1] --- rounding to nearest ties away from zero + \ [others] --- unsupported! + refer to https://en.wikipedia.org/wiki/IEEE_754 for more info about round_type + +*/ +NamedOutputs quantize_linear(const NodeContext& node) { + // extract the INPUTS + const auto x = node.get_input("X"); + const auto scale = node.get_input("Scale"); + const auto zero_point = node.get_input("ZeroPoint"); + const auto quant_axis = node.get_attribute("quant_axis"); + const std::vector quant_axis_range{-1}; + PADDLE_OP_CHECK(node, + std::any_of(quant_axis_range.begin(), + quant_axis_range.end(), + [&quant_axis](int32_t value) { + return quant_axis == value; + }), + "quantize_linear quant_axis is NOT in the range of [-1]."); + + // extract the ATTRIBUTES + const auto bit_length = node.get_attribute("bit_length"); + const auto range = (1 << (bit_length - 1)) - 1; + const auto high_range = (1 << (bit_length - 1)) - 1; + const auto low_range = -(1 << (bit_length - 1)); + const auto round_mode = [&]() { + if (node.has_attribute("round_type")) { + if (node.get_attribute("round_type")) { + return default_opset::Round::RoundMode::HALF_AWAY_FROM_ZERO; + } else { + return default_opset::Round::RoundMode::HALF_TO_EVEN; + } + } else { + return default_opset::Round::RoundMode::HALF_TO_EVEN; + } + }(); + + const auto zp_node = std::make_shared(zero_point, element::f32); + const auto q_sub_node = std::make_shared(x, zp_node); + const auto range_node = std::make_shared(element::f32, 
Shape{1}, (1.0 / range)); + const auto real_scale = std::make_shared(scale, range_node); + const auto q_div_node = std::make_shared(q_sub_node, real_scale); + const auto q_round_node = std::make_shared(q_div_node, round_mode); + const auto q_node = std::make_shared(q_round_node, low_range, high_range); + return node.default_single_output_mapping({q_node}, {"Y"}); +} + +} // namespace op +} // namespace paddle +} // namespace frontend +} // namespace ov diff --git a/src/frontends/paddle/src/op_table.cpp b/src/frontends/paddle/src/op_table.cpp index cd58f3fed220a8..0a21af065b98ad 100644 --- a/src/frontends/paddle/src/op_table.cpp +++ b/src/frontends/paddle/src/op_table.cpp @@ -24,6 +24,7 @@ OP_CONVERTER(conv2d); OP_CONVERTER(conv2d_transpose); OP_CONVERTER(cumsum); OP_CONVERTER(deformable_conv); +OP_CONVERTER(dequantize_linear); OP_CONVERTER(dropout); OP_CONVERTER(elementwise_add); OP_CONVERTER(elementwise_div); @@ -73,6 +74,7 @@ OP_CONVERTER(pad3d); OP_CONVERTER(pow); OP_CONVERTER(pool2d); OP_CONVERTER(prior_box); +OP_CONVERTER(quantize_linear); OP_CONVERTER(range); OP_CONVERTER(reduce_max); OP_CONVERTER(reduce_mean); @@ -135,6 +137,7 @@ std::map get_supported_ops() { {"deformable_conv_v1", op::deformable_conv}, {"depthwise_conv2d", op::conv2d}, {"depthwise_conv2d_transpose", op::conv2d_transpose}, + {"dequantize_linear", op::dequantize_linear}, {"dropout", op::dropout}, {"elementwise_add", op::elementwise_add}, {"elementwise_div", op::elementwise_div}, @@ -187,6 +190,7 @@ std::map get_supported_ops() { {"pow", op::pow}, {"pool2d", op::pool2d}, {"prior_box", op::prior_box}, + {"quantize_linear", op::quantize_linear}, {"range", op::range}, {"reduce_max", op::reduce_max}, {"reduce_mean", op::reduce_mean}, diff --git a/src/frontends/paddle/tests/basic_api.cpp b/src/frontends/paddle/tests/basic_api.cpp index c7dff9b9884697..7b8a956a7ec7f3 100644 --- a/src/frontends/paddle/tests/basic_api.cpp +++ b/src/frontends/paddle/tests/basic_api.cpp @@ -12,8 +12,7 @@ using 
namespace ov::frontend; using PaddleBasicTest = FrontEndBasicTest; static const std::vector models{ - std::string("conv2d"), - std::string("conv2d_s/conv2d.pdmodel"), + std::string("conv2d/conv2d.pdmodel"), std::string("conv2d_relu/conv2d_relu.pdmodel"), std::string("2in_2out/2in_2out.pdmodel"), std::string("multi_tensor_split/multi_tensor_split.pdmodel"), diff --git a/src/frontends/paddle/tests/convert_model.cpp b/src/frontends/paddle/tests/convert_model.cpp index 9f4dd6dc870f32..947d08ca304fb4 100644 --- a/src/frontends/paddle/tests/convert_model.cpp +++ b/src/frontends/paddle/tests/convert_model.cpp @@ -12,8 +12,7 @@ using namespace ov::frontend; using PaddleConvertModelTest = FrontEndConvertModelTest; static const std::vector models{ - std::string("conv2d"), - std::string("conv2d_s/conv2d.pdmodel"), + std::string("conv2d/conv2d.pdmodel"), std::string("conv2d_relu/conv2d_relu.pdmodel"), std::string("2in_2out/2in_2out.pdmodel"), std::string("multi_tensor_split/multi_tensor_split.pdmodel"), diff --git a/src/frontends/paddle/tests/load_from.cpp b/src/frontends/paddle/tests/load_from.cpp index 7d61a42f31e4d8..e1ded9e90eb8f0 100644 --- a/src/frontends/paddle/tests/load_from.cpp +++ b/src/frontends/paddle/tests/load_from.cpp @@ -15,7 +15,7 @@ static LoadFromFEParam getTestData() { LoadFromFEParam res; res.m_frontEndName = PADDLE_FE; res.m_modelsPath = std::string(TEST_PADDLE_MODELS_DIRNAME); - res.m_file = "conv2d"; + res.m_file = "conv2d/conv2d.pdmodel"; res.m_files = {"2in_2out/2in_2out.pdmodel", "2in_2out/2in_2out.pdiparams"}; res.m_stream = "relu/relu.pdmodel"; res.m_streams = {"2in_2out/2in_2out.pdmodel", "2in_2out/2in_2out.pdiparams"}; diff --git a/src/frontends/paddle/tests/op_fuzzy.cpp b/src/frontends/paddle/tests/op_fuzzy.cpp index ef742b2064b2f9..598b49dc41a548 100644 --- a/src/frontends/paddle/tests/op_fuzzy.cpp +++ b/src/frontends/paddle/tests/op_fuzzy.cpp @@ -41,25 +41,25 @@ static const std::vector models{ std::string("avgPool_test7"), 
std::string("avgPool_test8"), std::string("avgPool_test9"), - std::string("batch_norm_nchw"), - std::string("batch_norm_nhwc"), - std::string("bicubic_downsample_false_0"), - std::string("bicubic_downsample_false_1"), - std::string("bicubic_downsample_true_0"), - std::string("bicubic_upsample_false_0"), - std::string("bicubic_upsample_false_1"), - std::string("bicubic_upsample_scales"), - std::string("bicubic_upsample_scales2"), - std::string("bicubic_upsample_true_0"), - std::string("bilinear_downsample_false_0"), - std::string("bilinear_downsample_false_1"), - std::string("bilinear_downsample_true_0"), - std::string("bilinear_upsample_false_0"), - std::string("bilinear_upsample_false_1"), - std::string("bilinear_upsample_scales"), - std::string("bilinear_upsample_scales2"), - std::string("bilinear_upsample_true_0"), - std::string("bmm"), + std::string("batch_norm_nchw/batch_norm_nchw.pdmodel"), + std::string("batch_norm_nhwc/batch_norm_nhwc.pdmodel"), + std::string("bicubic_downsample_false_0/bicubic_downsample_false_0.pdmodel"), + std::string("bicubic_downsample_false_1/bicubic_downsample_false_1.pdmodel"), + std::string("bicubic_downsample_true_0/bicubic_downsample_true_0.pdmodel"), + std::string("bicubic_upsample_false_0/bicubic_upsample_false_0.pdmodel"), + std::string("bicubic_upsample_false_1/bicubic_upsample_false_1.pdmodel"), + std::string("bicubic_upsample_scales/bicubic_upsample_scales.pdmodel"), + std::string("bicubic_upsample_scales2/bicubic_upsample_scales2.pdmodel"), + std::string("bicubic_upsample_true_0/bicubic_upsample_true_0.pdmodel"), + std::string("bilinear_downsample_false_0/bilinear_downsample_false_0.pdmodel"), + std::string("bilinear_downsample_false_1/bilinear_downsample_false_1.pdmodel"), + std::string("bilinear_downsample_true_0/bilinear_downsample_true_0.pdmodel"), + std::string("bilinear_upsample_false_0/bilinear_upsample_false_0.pdmodel"), + std::string("bilinear_upsample_false_1/bilinear_upsample_false_1.pdmodel"), + 
std::string("bilinear_upsample_scales/bilinear_upsample_scales.pdmodel"), + std::string("bilinear_upsample_scales2/bilinear_upsample_scales2.pdmodel"), + std::string("bilinear_upsample_true_0/bilinear_upsample_true_0.pdmodel"), + std::string("bmm/bmm.pdmodel"), std::string("box_coder_1"), std::string("box_coder_2"), std::string("box_coder_3"), @@ -101,18 +101,19 @@ static const std::vector models{ // std::string("conditional_block_slice0_2tensorarrays_extra/conditional_block_slice0_2tensorarrays_extra.pdmodel"), // std::string( // "conditional_block_slice0_2tensorarrays_extra_dyn/conditional_block_slice0_2tensorarrays_extra_dyn.pdmodel"), - std::string("conv2d_dilation_assymetric_pads_strides"), - std::string("conv2d_SAME_padding"), - std::string("conv2d_strides_assymetric_padding"), - std::string("conv2d_strides_no_padding"), - std::string("conv2d_strides_padding"), - std::string("conv2d_transpose_dilation_assymetric_pads_strides"), + std::string("conv2d_dilation_assymetric_pads_strides/conv2d_dilation_assymetric_pads_strides.pdmodel"), + std::string("conv2d_SAME_padding/conv2d_SAME_padding.pdmodel"), + std::string("conv2d_strides_assymetric_padding/conv2d_strides_assymetric_padding.pdmodel"), + std::string("conv2d_strides_no_padding/conv2d_strides_no_padding.pdmodel"), + std::string("conv2d_strides_padding/conv2d_strides_padding.pdmodel"), + std::string( + "conv2d_transpose_dilation_assymetric_pads_strides/conv2d_transpose_dilation_assymetric_pads_strides.pdmodel"), // conv2d_transpose_SAME_padding(Paddle outputs wrong results), - std::string("conv2d_transpose_strides_assymetric_padding"), - std::string("conv2d_transpose_strides_no_padding"), - std::string("conv2d_transpose_strides_padding"), - std::string("conv2d_transpose_VALID_padding"), - std::string("conv2d_VALID_padding"), + std::string("conv2d_transpose_strides_assymetric_padding/conv2d_transpose_strides_assymetric_padding.pdmodel"), + 
std::string("conv2d_transpose_strides_no_padding/conv2d_transpose_strides_no_padding.pdmodel"), + std::string("conv2d_transpose_strides_padding/conv2d_transpose_strides_padding.pdmodel"), + std::string("conv2d_transpose_VALID_padding/conv2d_transpose_VALID_padding.pdmodel"), + std::string("conv2d_VALID_padding/conv2d_VALID_padding.pdmodel"), std::string("cumsum"), std::string("cumsum_i32"), std::string("cumsum_i64"), @@ -134,8 +135,8 @@ static const std::vector models{ std::string("deformable_conv_with_stride"), std::string("deformable_conv_with_stride_list"), std::string("deformable_conv_with_stride_tuple"), - std::string("depthwise_conv2d_convolution"), - std::string("depthwise_conv2d_transpose_convolution"), + std::string("depthwise_conv2d_convolution/depthwise_conv2d_convolution.pdmodel"), + std::string("depthwise_conv2d_transpose_convolution/depthwise_conv2d_transpose_convolution.pdmodel"), std::string("dropout"), std::string("dropout_upscale_in_train"), std::string("elementwise_add1"), @@ -176,13 +177,13 @@ static const std::vector models{ std::string("elementwise_floordiv_int64_1"), std::string("elementwise_floordiv_int64_2"), std::string("elementwise_floordiv_int64_3"), - std::string("embedding_0"), - std::string("embedding_sparse"), - std::string("embedding_none_weight"), - std::string("embedding_paddings"), - std::string("embedding_paddings_neg1"), - std::string("embedding_tensorIds"), - std::string("embedding_tensorIds_paddings"), + std::string("embedding_0/embedding_0.pdmodel"), + std::string("embedding_sparse/embedding_sparse.pdmodel"), + std::string("embedding_none_weight/embedding_none_weight.pdmodel"), + std::string("embedding_paddings/embedding_paddings.pdmodel"), + std::string("embedding_paddings_neg1/embedding_paddings_neg1.pdmodel"), + std::string("embedding_tensorIds/embedding_tensorIds.pdmodel"), + std::string("embedding_tensorIds_paddings/embedding_tensorIds_paddings.pdmodel"), std::string("equal"), std::string("expand_v2"), 
std::string("expand_v2_tensor"), @@ -236,27 +237,27 @@ static const std::vector models{ std::string("greater_than_float32"), std::string("greater_than_int32"), std::string("greater_than_int64"), - std::string("group_norm_1"), - std::string("group_norm_2"), - std::string("group_norm_3"), + std::string("group_norm_1/group_norm_1.pdmodel"), + std::string("group_norm_2/group_norm_2.pdmodel"), + std::string("group_norm_3/group_norm_3.pdmodel"), std::string("hard_sigmoid"), std::string("hard_swish"), - std::string("layer_norm"), - std::string("layer_norm_noall"), - std::string("layer_norm_noscale"), - std::string("layer_norm_noshift"), + std::string("layer_norm/layer_norm.pdmodel"), + std::string("layer_norm_noall/layer_norm_noall.pdmodel"), + std::string("layer_norm_noscale/layer_norm_noscale.pdmodel"), + std::string("layer_norm_noshift/layer_norm_noshift.pdmodel"), std::string("leaky_relu"), std::string("less_than_float32"), std::string("less_than_int32"), std::string("less_than_int64"), - std::string("linear_downsample_false_0"), - std::string("linear_downsample_false_1"), - std::string("linear_downsample_true_0"), - std::string("linear_upsample_false_0"), - std::string("linear_upsample_false_1"), - std::string("linear_upsample_scales"), - std::string("linear_upsample_scales2"), - std::string("linear_upsample_true_0"), + std::string("linear_downsample_false_0/linear_downsample_false_0.pdmodel"), + std::string("linear_downsample_false_1/linear_downsample_false_1.pdmodel"), + std::string("linear_downsample_true_0/linear_downsample_true_0.pdmodel"), + std::string("linear_upsample_false_0/linear_upsample_false_0.pdmodel"), + std::string("linear_upsample_false_1/linear_upsample_false_1.pdmodel"), + std::string("linear_upsample_scales/linear_upsample_scales.pdmodel"), + std::string("linear_upsample_scales2/linear_upsample_scales2.pdmodel"), + std::string("linear_upsample_true_0/linear_upsample_true_0.pdmodel"), std::string("log"), std::string("logical_and"), 
std::string("logical_not"), @@ -275,16 +276,16 @@ static const std::vector models{ std::string("loop_t/loop_t.pdmodel"), std::string("loop_tensor_array/loop_tensor_array.pdmodel"), std::string("loop_x/loop_x.pdmodel"), - std::string("matmul_xt"), - std::string("matmul_xt_yt"), - std::string("matmul_yt"), - std::string("matmul_v2_1dx1d"), - std::string("matmul_v2_1dx2d"), - std::string("matmul_v2_2dx1d"), - std::string("matmul_v2_ndxmd"), - std::string("matmul_v2_xt"), - std::string("matmul_v2_xt_yt"), - std::string("matmul_v2_yt"), + std::string("matmul_xt/matmul_xt.pdmodel"), + std::string("matmul_xt_yt/matmul_xt_yt.pdmodel"), + std::string("matmul_yt/matmul_yt.pdmodel"), + std::string("matmul_v2_1dx1d/matmul_v2_1dx1d.pdmodel"), + std::string("matmul_v2_1dx2d/matmul_v2_1dx2d.pdmodel"), + std::string("matmul_v2_2dx1d/matmul_v2_2dx1d.pdmodel"), + std::string("matmul_v2_ndxmd/matmul_v2_ndxmd.pdmodel"), + std::string("matmul_v2_xt/matmul_v2_xt.pdmodel"), + std::string("matmul_v2_xt_yt/matmul_v2_xt_yt.pdmodel"), + std::string("matmul_v2_yt/matmul_v2_yt.pdmodel"), std::string("matrix_nms_by_background"), std::string("matrix_nms_by_keep_top_k"), std::string("matrix_nms_by_nms_top_k"), @@ -354,10 +355,10 @@ static const std::vector models{ std::string("multiclass_nms_two_batches_two_classes_by_class_id"), // std::string("multiclass_nms_normalized_random"), // std::string("multiclass_nms_not_normalized_random"), - std::string("nearest_downsample_false_0"), - std::string("nearest_downsample_false_1"), - std::string("nearest_upsample_false_0"), - std::string("nearest_upsample_false_1"), + std::string("nearest_downsample_false_0/nearest_downsample_false_0.pdmodel"), + std::string("nearest_downsample_false_1/nearest_downsample_false_1.pdmodel"), + std::string("nearest_upsample_false_0/nearest_upsample_false_0.pdmodel"), + std::string("nearest_upsample_false_1/nearest_upsample_false_1.pdmodel"), std::string("not_equal_float32"), std::string("not_equal_int32"), 
std::string("not_equal_int64"), @@ -429,12 +430,12 @@ static const std::vector models{ std::string("reverse_dynamic_2"), std::string("reverse_dynamic_3"), std::string("reverse_dynamic_4"), - std::string("rnn_lstm_layer_1_bidirectional"), - std::string("rnn_lstm_layer_1_forward"), - std::string("rnn_lstm_layer_2_bidirectional"), - std::string("rnn_lstm_layer_2_forward"), - std::string("rnn_lstm_layer_1_forward_seq_len_4"), - std::string("rnn_lstm_layer_2_bidirectional_seq_len_4"), + std::string("rnn_lstm_layer_1_bidirectional/rnn_lstm_layer_1_bidirectional.pdmodel"), + std::string("rnn_lstm_layer_1_forward/rnn_lstm_layer_1_forward.pdmodel"), + std::string("rnn_lstm_layer_2_bidirectional/rnn_lstm_layer_2_bidirectional.pdmodel"), + std::string("rnn_lstm_layer_2_forward/rnn_lstm_layer_2_forward.pdmodel"), + std::string("rnn_lstm_layer_1_forward_seq_len_4/rnn_lstm_layer_1_forward_seq_len_4.pdmodel"), + std::string("rnn_lstm_layer_2_bidirectional_seq_len_4/rnn_lstm_layer_2_bidirectional_seq_len_4.pdmodel"), std::string("roi_align_test"), std::string("roi_align_test2"), std::string("scale_bias_after_float32"), @@ -506,14 +507,14 @@ static const std::vector models{ std::string("top_k_v2_test_4"), std::string("top_k_v2_test_5"), std::string("top_k_v2_test_6"), - std::string("trilinear_downsample_false_0"), - std::string("trilinear_downsample_false_1"), - std::string("trilinear_downsample_true_0"), - std::string("trilinear_upsample_false_0"), - std::string("trilinear_upsample_false_1"), - std::string("trilinear_upsample_scales"), - std::string("trilinear_upsample_scales2"), - std::string("trilinear_upsample_true_0"), + std::string("trilinear_downsample_false_0/trilinear_downsample_false_0.pdmodel"), + std::string("trilinear_downsample_false_1/trilinear_downsample_false_1.pdmodel"), + std::string("trilinear_downsample_true_0/trilinear_downsample_true_0.pdmodel"), + std::string("trilinear_upsample_false_0/trilinear_upsample_false_0.pdmodel"), + 
std::string("trilinear_upsample_false_1/trilinear_upsample_false_1.pdmodel"), + std::string("trilinear_upsample_scales/trilinear_upsample_scales.pdmodel"), + std::string("trilinear_upsample_scales2/trilinear_upsample_scales2.pdmodel"), + std::string("trilinear_upsample_true_0/trilinear_upsample_true_0.pdmodel"), std::string("unsqueeze"), std::string("where_1"), std::string("where_2"), diff --git a/src/frontends/paddle/tests/partial_shape.cpp b/src/frontends/paddle/tests/partial_shape.cpp index 5070a8c7f6b709..eef001bc185e6a 100644 --- a/src/frontends/paddle/tests/partial_shape.cpp +++ b/src/frontends/paddle/tests/partial_shape.cpp @@ -31,7 +31,7 @@ static PartShape getTestShape_2in_2out_dynbatch() { static PartShape getTestShape_conv2d() { PartShape res; - res.m_modelName = "conv2d_s/conv2d.pdmodel"; + res.m_modelName = "conv2d/conv2d.pdmodel"; res.m_tensorName = "x"; res.m_oldPartialShape = PartialShape{1, 3, 4, 4}; res.m_newPartialShape = PartialShape{1, 3, 8, 8}; @@ -40,7 +40,7 @@ static PartShape getTestShape_conv2d() { static PartShape getTestShape_conv2d_setDynamicBatch() { PartShape res; - res.m_modelName = "conv2d_s/conv2d.pdmodel"; + res.m_modelName = "conv2d/conv2d.pdmodel"; res.m_tensorName = "x"; res.m_oldPartialShape = PartialShape{1, 3, 4, 4}; res.m_newPartialShape = PartialShape{Dimension::dynamic(), 3, 8, 8}; diff --git a/src/frontends/paddle/tests/requirements.txt b/src/frontends/paddle/tests/requirements.txt index ea1238a351ef08..be9663e2fafee2 100644 --- a/src/frontends/paddle/tests/requirements.txt +++ b/src/frontends/paddle/tests/requirements.txt @@ -1,5 +1,5 @@ # PaddlePaddle - generate test models -paddlepaddle==2.4.1 +paddlepaddle==2.4.2 gast==0.3.3 numpy>=1.16.6,<1.25.0 six~=1.16.0 diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_batch_norm.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_batch_norm.py index 5a590913444b10..cdc22aec132ead 100644 --- 
a/src/frontends/paddle/tests/test_models/gen_scripts/generate_batch_norm.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_batch_norm.py @@ -34,9 +34,9 @@ def batch_norm1(name : str, x, scale, bias, mean, var, data_layout): outs = exe.run( feed={'x': x}, - fetch_list=[out]) + fetch_list=[out]) - saveModel(name, exe, feedkeys=['x'], fetchlist=[out], inputs=[x], outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[node_x], fetchlist=[out], inputs=[x], outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) return outs[0] @@ -67,7 +67,7 @@ def batch_norm2(name : str, x, scale, bias, mean, var, data_layout): feed={'x': x}, fetch_list=[out]) - saveModel(name, exe, feedkeys=['x'], fetchlist=[out], inputs=[x], outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[node_x], fetchlist=[out], inputs=[x], outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) return outs[0] @@ -89,4 +89,4 @@ def main(): batch_norm2("batch_norm_nhwc", data, scale, bias, mean, var, "NHWC") if __name__ == "__main__": - main() + main() diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_bmm.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_bmm.py index 9f29c35dddcc2b..348ae5ad8a42ed 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_bmm.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_bmm.py @@ -23,8 +23,8 @@ def paddle_bmm(x1, x2): outs = exe.run( feed={'x1': x1, 'x2': x2}, fetch_list=[result]) - saveModel("bmm", exe, feedkeys=['x1', 'x2'], fetchlist=[result], - inputs=[x1, x2], outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel("bmm", exe, feedkeys=[node_x1, node_x2], fetchlist=[result], + inputs=[x1, x2], outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) return outs[0] diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d.py 
index 6131dd47bdf547..9b8131474614ca 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d.py @@ -1,25 +1,38 @@ # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import paddle -from paddle import fluid import numpy as np import os import sys +from save_model import saveModel +def conv2d(name: str, x, dtype): + import paddle + paddle.enable_static() -paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): + # inp_blob = np.random.randn(1, 3, 4, 4).astype(np.float32) -inp_blob = np.random.randn(1, 3, 4, 4).astype(np.float32) + node_x = paddle.static.data(name='x', shape=[1, 3, 4, 4], dtype='float32') + conv2d_layer = paddle.nn.Conv2D(in_channels=3, out_channels=5, kernel_size=(1, 1), stride=(1, 1), padding=(1, 1), + dilation=(1, 1), groups=1, bias_attr=False) + out = conv2d_layer(node_x) -x = fluid.data(name='x', shape=[1, 3, 4, 4], dtype='float32') -test_layer = fluid.layers.conv2d(input=x, num_filters=5, filter_size=(1, 1), stride=(1, 1), padding=(1, 1), - dilation=(1, 1), groups=1, bias_attr=False) + cpu = paddle.static.cpu_places(1) + exe = paddle.static.Executor(cpu[0]) + exe.run(paddle.static.default_startup_program()) -exe = fluid.Executor(fluid.CPUPlace()) -exe.run(fluid.default_startup_program()) -inp_dict = {'x': inp_blob} -var = [test_layer] -res_paddle = exe.run(fluid.default_main_program(), fetch_list=var, feed=inp_dict) + inp_dict = {'x': x} + var = [out] + outs = exe.run(feed=inp_dict, fetch_list=var) -fluid.io.save_inference_model(os.path.join(sys.argv[1], "conv2d"), list(inp_dict.keys()), var, exe) + saveModel(name, exe, feedkeys=[node_x], fetchlist=[out], inputs=[x], outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) + return outs[0] + +def main(): + dtype = "float32" + data = np.random.randn(1, 3, 4, 4).astype(dtype) + conv2d("conv2d", data, dtype) + 
+if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_combinations.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_combinations.py index 10d6f583a114d7..cb637d6b755981 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_combinations.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_combinations.py @@ -18,8 +18,8 @@ def run_and_save_model(input_x, name, feed, fetch_list, main_prog, start_prog): program=main_prog) with paddle.static.program_guard(main_prog, start_prog): - saveModel(name, exe, feedkeys=['x'], fetchlist=fetch_list, inputs=[input_x], - outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[feed], fetchlist=fetch_list, inputs=[input_x], + outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) def paddle_conv2d(input_x, name, input_shape, kernel, dilation, padding, stride, groups=1, use_cudnn=True): diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_transpose.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_transpose.py index 22f88e6f83d948..fbb4e72346cf0e 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_transpose.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_conv2d_transpose.py @@ -17,8 +17,8 @@ def run_and_save_model(input_x, name, feed, fetch_list, main_prog, start_prog): fetch_list=fetch_list, program=main_prog) with paddle.static.program_guard(main_prog, start_prog): - saveModel(name, exe, feedkeys=['x'], fetchlist=fetch_list, inputs=[input_x], - outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[feed], fetchlist=fetch_list, inputs=[input_x], + outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) def paddle_conv2d_transpose(input_x, name, input_shape, kernel, dilation, padding, stride, groups=1, use_cudnn=True): 
diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_embedding.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_embedding.py index cbd0e09b667302..45e621abea5b1c 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_embedding.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_embedding.py @@ -88,7 +88,7 @@ def embedding(name : str, ids, vocab_size, embedding_dim, padding_idx=None, spar feed=input_dict, fetch_list=output_vars_list ) - saveModel(name, exe, feedkeys=list(input_dict.keys()), fetchlist=output_vars_list, inputs=list(input_dict.values()), outputs=infer_results, target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[node_ids], fetchlist=output_vars_list, inputs=list(input_dict.values()), outputs=infer_results, target_dir=sys.argv[1], use_static_api=True) # outputs = dict() diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_interpolate.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_interpolate.py index bb884471357352..d0d34c687b4c15 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_interpolate.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_interpolate.py @@ -19,8 +19,8 @@ def run_and_save_model(input_x, name, feed, fetch_list, main_prog, start_prog): program=main_prog) with paddle.static.program_guard(main_prog, start_prog): - saveModel(name, exe, feedkeys=['x'], fetchlist=fetch_list, inputs=[input_x], - outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[feed], fetchlist=fetch_list, inputs=[input_x], + outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) return outs @@ -435,7 +435,7 @@ def linear_upsample_scales(): resize_downsample_bicubic() resize_upsample_bicubic() bicubic_upsample_tensor_size() - bicubic_upsample_scales() + bicubic_upsample_scales() # linear resize_downsample_linear() resize_upsample_linear() diff --git 
a/src/frontends/paddle/tests/test_models/gen_scripts/generate_mul.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_mul.py index 7b764031720278..dbc8c818d2f8c4 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_mul.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_mul.py @@ -23,7 +23,7 @@ def paddle_matmul(name, x1, x2, x_transpose=False, y_transpose=False): outs = exe.run( feed={'x1': x1, 'x2': x2}, fetch_list=[result]) - saveModel(name, exe, feedkeys=['x1', 'x2'], fetchlist=[result], inputs=[x1, x2], outputs=[outs[0]], target_dir=sys.argv[1]) + saveModel(name, exe, feedkeys=[node_x1, node_x2], fetchlist=[result], inputs=[x1, x2], outputs=[outs[0]], target_dir=sys.argv[1], use_static_api=True) return outs[0] diff --git a/src/frontends/paddle/tests/test_models/gen_scripts/generate_rnn_lstm.py b/src/frontends/paddle/tests/test_models/gen_scripts/generate_rnn_lstm.py index a12e5b2d68d174..4a6216a9581987 100644 --- a/src/frontends/paddle/tests/test_models/gen_scripts/generate_rnn_lstm.py +++ b/src/frontends/paddle/tests/test_models/gen_scripts/generate_rnn_lstm.py @@ -40,16 +40,16 @@ def paddle_rnn_lstm(input_size, hidden_size, layers, direction, seq_len): np.float32), 'sl': np.array(seq_len).astype(np.int32)}, fetch_list=[y, h, c], program=main_program) - saveModel("rnn_lstm_layer_" + str(layers) + '_' + str(direction) + '_seq_len_' + str(len(seq_len)), exe, feedkeys=['x', 'sl'], - fetchlist=[y, h, c], inputs=[np.ones([4, 3, input_size]).astype(np.float32), np.array(seq_len).astype(np.int32)], outputs=[outs[0], outs[1], outs[2]], target_dir=sys.argv[1]) + saveModel("rnn_lstm_layer_" + str(layers) + '_' + str(direction) + '_seq_len_' + str(len(seq_len)), exe, feedkeys=[data, seq_lengths], + fetchlist=[y, h, c], inputs=[np.ones([4, 3, input_size]).astype(np.float32), np.array(seq_len).astype(np.int32)], outputs=[outs[0], outs[1], outs[2]], target_dir=sys.argv[1], use_static_api=True) else: outs = exe.run( 
feed={'x': np.ones([4, 3, input_size]).astype( np.float32)}, fetch_list=[y, h, c], program=main_program) - saveModel("rnn_lstm_layer_" + str(layers) + '_' + str(direction), exe, feedkeys=['x'], - fetchlist=[y, h, c], inputs=[np.ones([4, 3, input_size]).astype(np.float32)], outputs=[outs[0], outs[1], outs[2]], target_dir=sys.argv[1]) + saveModel("rnn_lstm_layer_" + str(layers) + '_' + str(direction), exe, feedkeys=[data], + fetchlist=[y, h, c], inputs=[np.ones([4, 3, input_size]).astype(np.float32)], outputs=[outs[0], outs[1], outs[2]], target_dir=sys.argv[1], use_static_api=True) return outs[0] diff --git a/src/frontends/pytorch/src/frontend.cpp b/src/frontends/pytorch/src/frontend.cpp index a53a55525437e8..0e49180c8577c5 100644 --- a/src/frontends/pytorch/src/frontend.cpp +++ b/src/frontends/pytorch/src/frontend.cpp @@ -18,7 +18,9 @@ #include "transforms/append_list_unpack_replacer.hpp" #include "transforms/aten_cat_replacer.hpp" #include "transforms/aten_getitem_replacer.hpp" +#include "transforms/aten_index_replacer.hpp" #include "transforms/aten_stack_list_construct_replacer.hpp" +#include "transforms/einsum_list_construct.hpp" #include "transforms/listconstruct_replacer.hpp" #include "transforms/min_max_prim_list_construct_replacer.hpp" #include "transforms/prim_list_construct_pad.hpp" @@ -96,10 +98,12 @@ void FrontEnd::normalize(const std::shared_ptr& model) const { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); - manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.run_passes(model); diff --git a/src/frontends/pytorch/src/node_context.cpp b/src/frontends/pytorch/src/node_context.cpp index d8bb94305d86a5..a3e8c81633a800 100644 --- a/src/frontends/pytorch/src/node_context.cpp +++ b/src/frontends/pytorch/src/node_context.cpp @@ -142,6 +142,11 @@ ngraph::Shape 
NodeContext::const_input(size_t index) const { return get_constant_at_input(*this, index)->cast_vector(); } +template <> +int32_t NodeContext::const_input(size_t index) const { + return get_constant_at_input(*this, index)->cast_vector()[0]; +} + template <> int64_t NodeContext::const_input(size_t index) const { return get_constant_at_input(*this, index)->cast_vector()[0]; diff --git a/src/frontends/pytorch/src/op/roi_align.cpp b/src/frontends/pytorch/src/op/roi_align.cpp new file mode 100644 index 00000000000000..d3a389c59654b9 --- /dev/null +++ b/src/frontends/pytorch/src/op/roi_align.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/roi_align.hpp" + +#include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +OutputVector translate_roi_align(NodeContext& context) { + num_inputs_check(context, 7, 7); + auto const_1 = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {1})); + auto const_neg_1 = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {-1})); + auto const_0 = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {0})); + auto const_rois_indices = context.mark_node(v0::Constant::create(element::i32, Shape{4}, {1, 2, 3, 4})); + + auto input = context.get_input(0); + auto boxes_input = context.get_input(1); + + auto input_real_type = context.mark_node(std::make_shared(input, element::f32)); + auto boxes = context.mark_node(std::make_shared(boxes_input, input_real_type)); + + auto spatial_scale = context.const_input(2); + int output_size_h = context.const_input(3); + int output_size_w = context.const_input(4); + int sampling_ratio = 
context.const_input(5); + + auto aligned = context.const_input(6); + + auto rois = context.mark_node(std::make_shared(boxes, const_rois_indices, const_1)); + + auto batch_indices_gather = context.mark_node(std::make_shared(boxes, const_0, const_1)); + auto batch_indices_reshape = + context.mark_node(std::make_shared(batch_indices_gather, const_neg_1, false)); + auto batch_indices = context.mark_node(std::make_shared(batch_indices_reshape, element::i32)); + + v9::ROIAlign::AlignedMode aligned_mode = + aligned ? v9::ROIAlign::AlignedMode::HALF_PIXEL_FOR_NN : v9::ROIAlign::AlignedMode::ASYMMETRIC; + + auto roi_align = context.mark_node(std::make_shared(input_real_type, + rois, + batch_indices, + output_size_h, + output_size_w, + sampling_ratio, + spatial_scale, + v9::ROIAlign::PoolingMode::AVG, + aligned_mode)); + + return {roi_align}; +}; + +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/op/upsample.cpp b/src/frontends/pytorch/src/op/upsample.cpp index 6c5b33d882c1a9..111a07a28c70e9 100644 --- a/src/frontends/pytorch/src/op/upsample.cpp +++ b/src/frontends/pytorch/src/op/upsample.cpp @@ -16,10 +16,12 @@ namespace op { using namespace ov::op; namespace { -OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpolate::InterpolateMode interpolate_mode) { - num_inputs_check(context, 3, 4); +OutputVector base_translate_upsample(const NodeContext& context, + v4::Interpolate::InterpolateMode interpolate_mode, + size_t dims) { + num_inputs_check(context, 1, 4); auto data = context.get_input(0); - std::vector pad{0}; + std::vector pad(dims, 0); auto size_mode = v4::Interpolate::ShapeCalcMode::SIZES; bool align_corners = false; int scale_id = 2; @@ -29,11 +31,21 @@ OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpola align_corners = context.const_input(2); } } - auto target_axes = std::make_shared(element::i32, Shape{2}, std::vector({2, 3})); + 
std::vector spatial_axes; + if (dims == 1) { + spatial_axes = {2}; + } else if (dims == 2) { + spatial_axes = {2, 3}; + } else if (dims == 3) { + spatial_axes = {2, 3, 4}; + } else { + FRONT_END_OP_CONVERSION_CHECK(false, "Unsupported number of dimensions in upsample"); + } + auto target_axes = std::make_shared(element::i32, Shape{spatial_axes.size()}, spatial_axes); auto scales = - context.mark_node(std::make_shared(element::f32, Shape{2}, std::vector({1, 1}))); + context.mark_node(std::make_shared(element::f32, Shape{dims}, std::vector(dims, 1))); auto output_sizes = - context.mark_node(std::make_shared(element::i32, Shape{2}, std::vector({1, 1}))); + context.mark_node(std::make_shared(element::i32, Shape{dims}, std::vector(dims, 1))); if (context.input_is_none(1)) { FRONT_END_OP_CONVERSION_CHECK(!context.input_is_none(scale_id), "Scale or Output size should be provided"); auto spatial_scales = context.get_input(scale_id); @@ -48,6 +60,7 @@ OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpola attrs.coordinate_transformation_mode = v4::Interpolate::CoordinateTransformMode::ASYMMETRIC; attrs.nearest_mode = v4::Interpolate::NearestMode::FLOOR; if (attrs.mode != v4::Interpolate::InterpolateMode::NEAREST) { + attrs.coordinate_transformation_mode = v4::Interpolate::CoordinateTransformMode::PYTORCH_HALF_PIXEL; if (align_corners) { attrs.coordinate_transformation_mode = v4::Interpolate::CoordinateTransformMode::ALIGN_CORNERS; } @@ -56,16 +69,33 @@ OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpola }; } // namespace +OutputVector translate_upsample_linear1d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX, 1); +}; + OutputVector translate_upsample_bilinear2d(NodeContext& context) { - return base_translate_upsample2d(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX); + return base_translate_upsample(context, 
v4::Interpolate::InterpolateMode::LINEAR_ONNX, 2); +}; + +OutputVector translate_upsample_trilinear3d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX, 3); +}; + +OutputVector translate_upsample_nearest1d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::NEAREST, 1); }; OutputVector translate_upsample_nearest2d(NodeContext& context) { - return base_translate_upsample2d(context, v4::Interpolate::InterpolateMode::NEAREST); + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::NEAREST, 2); +}; + +OutputVector translate_upsample_nearest3d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::NEAREST, 3); }; +// bicubic is only supported for 2d in pytorch OutputVector translate_upsample_bicubic2d(NodeContext& context) { - return base_translate_upsample2d(context, v4::Interpolate::InterpolateMode::CUBIC); + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::CUBIC, 2); }; } // namespace op diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 098afbfc9a6a8c..bd2e9bf0564e7b 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -89,6 +89,7 @@ OP_CONVERTER(translate_repeat); OP_CONVERTER(translate_repeat_interleave); OP_CONVERTER(translate_reshape); OP_CONVERTER(translate_reshape_as); +OP_CONVERTER(translate_roi_align); OP_CONVERTER(translate_roll); OP_CONVERTER(translate_rsqrt); OP_CONVERTER(translate_rsub); @@ -110,7 +111,11 @@ OP_CONVERTER(translate_triu); OP_CONVERTER(translate_unfold); OP_CONVERTER(translate_upsample_bicubic2d); OP_CONVERTER(translate_upsample_bilinear2d); +OP_CONVERTER(translate_upsample_linear1d); +OP_CONVERTER(translate_upsample_nearest1d); OP_CONVERTER(translate_upsample_nearest2d); +OP_CONVERTER(translate_upsample_nearest3d); 
+OP_CONVERTER(translate_upsample_trilinear3d); OP_CONVERTER(translate_var); OP_CONVERTER(translate_var_mean); OP_CONVERTER(translate_where); @@ -303,7 +308,11 @@ const std::map get_supported_ops() { {"aten::unsqueeze_", op::inplace_op>}, {"aten::upsample_bicubic2d", op::translate_upsample_bicubic2d}, {"aten::upsample_bilinear2d", op::translate_upsample_bilinear2d}, + {"aten::upsample_linear1d", op::translate_upsample_linear1d}, + {"aten::upsample_nearest1d", op::translate_upsample_nearest1d}, {"aten::upsample_nearest2d", op::translate_upsample_nearest2d}, + {"aten::upsample_nearest3d", op::translate_upsample_nearest3d}, + {"aten::upsample_trilinear3d", op::translate_upsample_trilinear3d}, {"aten::var", op::translate_var}, {"aten::var_mean", op::translate_var_mean}, {"aten::view", op::translate_reshape}, @@ -319,6 +328,7 @@ const std::map get_supported_ops() { {"prim::NumToTensor", op::skip_node}, // In openvino we already store number as tensor with shape [] {"prim::requires_grad", op::return_false_scalar}, {"torchvision::nms", op::translate_nms}, + {"torchvision::roi_align", op::translate_roi_align}, }; }; diff --git a/src/frontends/pytorch/src/transforms/aten_index_replacer.cpp b/src/frontends/pytorch/src/transforms/aten_index_replacer.cpp new file mode 100644 index 00000000000000..7affc4511d028a --- /dev/null +++ b/src/frontends/pytorch/src/transforms/aten_index_replacer.cpp @@ -0,0 +1,271 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "aten_index_replacer.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/frontend/pytorch/visibility.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/gather_elements.hpp" +#include "openvino/op/gather_nd.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/non_zero.hpp" +#include 
"openvino/op/reduce_prod.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/split.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/util/framework_node.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +using namespace ov::op; +namespace { + +std::shared_ptr flatten(const Output& value, size_t axis) { + // First dimension of output tensor is the product of [d_0, ... d_{axis-1}] dimensions of + // input tensor. The last dimension is the product of the rest of input tensor dimensions: + // [d_{axis}, ..., d_n] + Output output_shape; + if (axis == 0) { + output_shape = v0::Constant::create(element::i64, Shape{2}, {1, -1}); + } else if (axis == 1) { + output_shape = v0::Constant::create(element::i64, Shape{2}, {0, -1}); + } else { + const auto value_shape = std::make_shared(value); + const auto value_rank = std::make_shared(value_shape); + const auto axis_node = v0::Constant::create(element::i64, Shape{}, {axis}); + auto start = v0::Constant::create(element::i64, Shape{}, {0}); + auto step = v0::Constant::create(element::i64, Shape{}, {1}); + const auto first_part_dims = std::make_shared(value_shape, start, axis_node, step); + auto zero = v0::Constant::create(element::i64, {}, {0}); + auto first_part_dims_length = std::make_shared(first_part_dims, zero, true); + + auto remaining_part_length = v0::Constant::create(element::i64, {1}, {-1}); + + output_shape = std::make_shared(OutputVector{first_part_dims_length, remaining_part_length}, 0); + } + return std::make_shared(value, output_shape, true); +} +}; // namespace + +AtenIndexToSelect::AtenIndexToSelect() { + auto index_op = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher& m) { + 
auto index_op = cast_fw_node(m.get_match_root(), "aten::index"); + if (!index_op) { + return false; + } + auto input_node = index_op->input_value(0).get_node_shared_ptr(); + auto indicies = index_op->input_value(1).get_node_shared_ptr(); + auto list_indicies = cast_fw_node(indicies, "prim::ListConstruct"); + if (list_indicies) { + // Multiple tensors as indices. Each tensor could either be + // 1. prim::Constant() + // representing ":" in python indexing. E.g. tensor[:, :] + // 2. prim::Constant[value=...] or tensor output + // representing advanced indexing. E.g. tensor[[0, 1], [2, 0]]. + // For more info on advanced indexing, + // check https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing + + // Consider a general case of + // t: [x_1, y_1, y_2, ..., x_m, ..., y_n] + // where t is a tensor of rank m+n, {x_i} are axes where tensor index is provided, and {y_i} are axes for + // ":". Same results can be achieved through transposing t into + // t: [x_1, x_2, ..., x_m, y_1, y_2, ..., y_n] + // and use gather + // t: [x_1 * x_2 * ... * x_m, y_1 * y_2 * ... * y_n] + // tensor index = \sum_{i=1}^m (ind_i * \prod_{j=i+1}^m (x_j)) + // After gather, reshape and transpose back. + auto ids = list_indicies->input_values(); + std::vector advanced_ids; + std::vector is_masked_bool; + OutputVector masked_indicies; + // for case when index is bool e.g. 
x[x>0], replace index with non_zero + for (size_t i = 0; i < ids.size(); i++) { + auto const_input = cast_fw_node(ids[i].get_node_shared_ptr(), "prim::Constant"); + + // skip dimensions where index is None + if (const_input) { + const auto& attrs = const_input->get_attrs(); + if (attrs.find("none_value") != attrs.end()) { + masked_indicies.push_back(ids[i]); + is_masked_bool.push_back(false); + continue; + } + } + auto id_dtype = ids[i].get_node_shared_ptr()->get_element_type(); + if (id_dtype == element::boolean || id_dtype == element::u8) { + auto idx = std::make_shared(ids[i], element::u8); + auto nonzero = std::make_shared(idx); + auto input_order = v0::Constant::create(element::i64, Shape{2}, {1, 0}); + auto masked_id = std::make_shared(nonzero, input_order); + masked_indicies.push_back(masked_id); + is_masked_bool.push_back(true); + } else { + masked_indicies.push_back(ids[i]); + is_masked_bool.push_back(false); + } + advanced_ids.push_back(i); + } + + // all indicies prim::Constant(None), return input as is + if (advanced_ids.size() == 0) { + copy_runtime_info({index_op, input_node}, input_node); + replace_node(index_op, input_node); + return true; + } + // perform gather for single element case + if (advanced_ids.size() == 1) { + auto index = masked_indicies[advanced_ids[0]]; + index = std::make_shared(index, element::i64); + if (is_masked_bool[advanced_ids[0]]) { + auto gather = std::make_shared(input_node, index); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + auto dim = v0::Constant::create(element::i64, Shape{}, {advanced_ids[0]}); + auto gather = std::make_shared(input_node, index, dim); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + auto adv_idx_count = advanced_ids.size(); + auto rank = input_node->get_input_partial_shape(0).rank(); + if (rank.is_dynamic()) { + FRONT_END_CHECK_IMPLEMENTED(false, "indexing for 
tensor with dynamic rank is not implemented "); + } + auto input_shape = std::make_shared(input_node); + auto zero = v0::Constant::create(element::i64, Shape{}, {0}); + auto input_dims = std::make_shared(input_shape, zero, rank.get_length()); + std::vector non_used_dims; + for (auto i = 0; i < rank.get_length(); i++) { + if (std::find(advanced_ids.begin(), advanced_ids.end(), i) == advanced_ids.end()) { + non_used_dims.push_back(i); + } + } + std::vector permutation_dims; + permutation_dims.insert(permutation_dims.end(), advanced_ids.begin(), advanced_ids.end()); + permutation_dims.insert(permutation_dims.end(), non_used_dims.begin(), non_used_dims.end()); + auto transpose_dims = v0::Constant::create(element::i64, Shape{permutation_dims.size()}, permutation_dims); + auto transposed_input = std::make_shared(input_node, transpose_dims); + auto flatten_input = flatten(transposed_input, adv_idx_count); + auto cum_adv_index = masked_indicies[advanced_ids[adv_idx_count - 1]]; + auto multiplier = input_dims->output(advanced_ids[adv_idx_count - 1]); + for (int i = static_cast(adv_idx_count) - 2; i >= 0; i--) {  // include i == 0 so the first advanced index contributes to the flat offset + auto adv_index = std::make_shared(masked_indicies[advanced_ids[i]], multiplier);  // masked_indicies is keyed by input axis; advanced_ids[i] is that axis + cum_adv_index = std::make_shared(cum_adv_index, adv_index); + auto input_id = advanced_ids[i]; + multiplier = std::make_shared(multiplier, input_dims->output(input_id)); + } + std::shared_ptr gather = std::make_shared(flatten_input, cum_adv_index, zero); + OutputVector concat_dims; + // check if all advanced indices are consecutive. 
+ std::vector consequence_dims; + auto cum_adv_index_shape_tensor = std::make_shared(cum_adv_index); + for (size_t i = advanced_ids[0]; i <= advanced_ids[advanced_ids.size() - 1]; i++) { + consequence_dims.push_back(i); + } + // unfold regular index axes + if (advanced_ids == consequence_dims) { + OutputVector folded_adv_idx_shape_vector; + auto minus_one = v0::Constant::create(element::i64, Shape{1}, {-1}); + folded_adv_idx_shape_vector.push_back(minus_one); + for (auto i : non_used_dims) { + folded_adv_idx_shape_vector.push_back(input_dims->output(i)); + } + auto folded_adv_idx_shape = std::make_shared(folded_adv_idx_shape_vector, 0); + gather = std::make_shared(gather, folded_adv_idx_shape, false); + std::vector adv_idx_permute; + for (size_t i = 1; i < advanced_ids[0] + 1; i++) { + adv_idx_permute.push_back(i); + } + adv_idx_permute.push_back(0); + for (size_t i = advanced_ids[0] + 1; i < (rank.get_length() - adv_idx_count + 1); i++) { + adv_idx_permute.push_back(i); + } + // Transpose folded advanced indexed axis to its original location. 
+ auto permute_indicies = + v0::Constant::create(element::i64, Shape{adv_idx_permute.size()}, adv_idx_permute); + gather = std::make_shared(gather, permute_indicies); + // unfold advanced index axes + for (size_t i = 0; i < advanced_ids[0]; i++) {  // only axes strictly before the first advanced axis keep their own size here + concat_dims.push_back(input_dims->output(i)); + } + concat_dims.push_back(cum_adv_index_shape_tensor); + for (auto i : non_used_dims) { + if (i < advanced_ids[0]) {  // leading axes were already emitted above; compare against the first advanced axis + continue; + } + concat_dims.push_back(input_dims->output(i)); + } + + } else { + concat_dims.push_back(cum_adv_index_shape_tensor); + for (auto i : non_used_dims) { + concat_dims.push_back(input_dims->output(i)); + } + } + auto final_shape = std::make_shared(concat_dims, 0); + gather = std::make_shared(gather, final_shape, false); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + + } else { + auto const_input = cast_fw_node(indicies, "prim::Constant"); + + if (const_input) { + // index is None, stay input as is + const auto& attrs = const_input->get_attrs(); + if (attrs.find("none_value") != attrs.end()) { + copy_runtime_info({index_op, input_node, indicies}, input_node); + replace_node(index_op, input_node); + return true; + } + } + auto index_dtype = indicies->get_output_element_type(0); + if (index_dtype == element::boolean || index_dtype == element::u8) { + auto nonzero = std::make_shared(indicies); + auto input_order = v0::Constant::create(element::i64, Shape{2}, {1, 0}); + auto masked_id = std::make_shared(nonzero, input_order); + auto gather = std::make_shared(input_node, masked_id); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + if (index_dtype != element::i32 && index_dtype != element::i64) { + indicies = std::make_shared(indicies, element::i64); + } + auto dim = v0::Constant::create(element::i64, Shape{}, {0}); + auto gather = std::make_shared(input_node, indicies, dim); + copy_runtime_info({index_op, input_node, 
indicies}, gather); + replace_node(index_op, gather); + return true; + } + return false; + }; + + auto m = std::make_shared(index_op, "ov::frontend::pytorch::pass::AtenIndexToSelect"); + this->register_matcher(m, callback); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp new file mode 100644 index 00000000000000..84f6133253aea6 --- /dev/null +++ b/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/frontend/pytorch/visibility.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +// This transformation replaces pattern prim::ListConstruct->aten::index +class PYTORCH_API AtenIndexToSelect : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenIndexToSelect"); + AtenIndexToSelect(); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/einsum_list_construct.cpp b/src/frontends/pytorch/src/transforms/einsum_list_construct.cpp new file mode 100644 index 00000000000000..96881ebcbb25e0 --- /dev/null +++ b/src/frontends/pytorch/src/transforms/einsum_list_construct.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "einsum_list_construct.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/einsum.hpp" +#include "openvino/op/util/framework_node.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "utils.hpp" + +using namespace ov::pass::pattern; + +namespace ov { 
+namespace frontend { +namespace pytorch { +namespace pass { + +using namespace ov::pass; +using namespace ov::op; + +AtenEinsumListConstructReplacer::AtenEinsumListConstructReplacer() { + auto einsum_op = pattern::wrap_type(); + ov::matcher_pass_callback callback = [](pattern::Matcher& m) { + auto einsum_op = cast_fw_node(m.get_match_root(), "aten::einsum"); + if (!einsum_op) { + return false; + } + auto equation_input = einsum_op->input_value(0).get_node_shared_ptr(); + auto tensor_list = einsum_op->input_value(1).get_node_shared_ptr(); + std::string equation; + // equation should be string constant + if (const auto& fw_node_mode = cast_fw_node(equation_input, "prim::Constant")) { + const auto& attrs = fw_node_mode->get_attrs(); + if (attrs.find("string_value") != attrs.end()) { + equation = attrs.at("string_value"); + } + } else { + return false; + } + // Check if ListConstruct is an input + if (auto list_construct_node = cast_fw_node(tensor_list, "prim::ListConstruct")) { + const auto& list_inputs = list_construct_node->input_values(); + OutputVector node_vector; + // Iterate over values in ListConstruct + for (const auto& list_input : list_inputs) { + node_vector.push_back(list_input); + } + + auto einsum = std::make_shared(node_vector, equation); + copy_runtime_info({einsum_op, equation_input, tensor_list}, einsum); + replace_node(einsum_op, einsum); + return true; + } + return false; + }; + + auto m = + std::make_shared(einsum_op, "ov::frontend::pytorch::pass::AtenEinsumListConstructReplacer"); + this->register_matcher(m, callback); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp b/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp new file mode 100644 index 00000000000000..af2ac9b5301129 --- /dev/null +++ b/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp @@ -0,0 +1,24 @@ +// 
Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +class AtenEinsumListConstructReplacer : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenEinsumListConstructReplacer"); + AtenEinsumListConstructReplacer(); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/tensorflow/src/frontend.cpp b/src/frontends/tensorflow/src/frontend.cpp index 0a8fd6abaa20e9..1a7a6c2ae887ac 100644 --- a/src/frontends/tensorflow/src/frontend.cpp +++ b/src/frontends/tensorflow/src/frontend.cpp @@ -58,11 +58,12 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { if (variants.size() != 1) return false; - // Validating first path, it must contain a model if (variants[0].is()) { - std::string suffix = ".pb"; std::string model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix.c_str())) { + if (ov::util::ends_with(model_path, ".pb") && GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format + // for automatic deduction of the frontend to convert the model + // we have more strict rule that is to have `.pb` extension in the path return true; } } @@ -70,12 +71,16 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { else if (variants[0].is()) { std::wstring suffix = L".pb"; std::wstring model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix)) { + if (ov::util::ends_with(model_path, suffix) && GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format with a path in Unicode + // for automatic deduction of the frontend to convert the model + // we have more strict rule that is to have `.pb` extension in the path return true; } } 
#endif else if (variants[0].is()) { + // this is used for OpenVINO with TensorFlow Integration return true; } return false; @@ -83,33 +88,36 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { ov::frontend::InputModel::Ptr FrontEnd::load_impl(const std::vector& variants) const { // TODO: Support other TensorFlow formats: SavedModel, .meta, checkpoint, pbtxt - if (variants.size() == 1) { - // a case when binary protobuf format is provided - if (variants[0].is()) { - std::string suffix = ".pb"; - std::string model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix.c_str())) { - return std::make_shared( - std::make_shared<::ov::frontend::tensorflow::GraphIteratorProto>(model_path), - m_telemetry); - } + FRONT_END_GENERAL_CHECK(variants.size() == 1, + "[TensorFlow Frontend] Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."); + + if (variants[0].is()) { + auto model_path = variants[0].as(); + if (GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format + return std::make_shared(std::make_shared(model_path), m_telemetry); } + } #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) - else if (variants[0].is()) { - std::wstring suffix = L".pb"; - std::wstring model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix)) { - return std::make_shared( - std::make_shared<::ov::frontend::tensorflow::GraphIteratorProto>(model_path), - m_telemetry); - } + else if (variants[0].is()) { + std::wstring model_path = variants[0].as(); + if (GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format with a path in Unicode + return std::make_shared(std::make_shared(model_path), m_telemetry); } + } #endif - else if (variants[0].is()) { - auto graph_iterator = variants[0].as(); - return std::make_shared(graph_iterator, m_telemetry); - } + else if (variants[0].is()) { + // this is used for OpenVINO with TensorFlow 
Integration + auto graph_iterator = variants[0].as(); + return std::make_shared(graph_iterator, m_telemetry); } + + FRONT_END_GENERAL_CHECK(false, + "[TensorFlow Frontend] Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."); + return nullptr; } diff --git a/src/frontends/tensorflow/src/graph_iterator_proto.hpp b/src/frontends/tensorflow/src/graph_iterator_proto.hpp index c2b08ebe9de923..1fa836e3b036e1 100644 --- a/src/frontends/tensorflow/src/graph_iterator_proto.hpp +++ b/src/frontends/tensorflow/src/graph_iterator_proto.hpp @@ -88,29 +88,40 @@ class GraphIteratorProto : public GraphIterator { } } - /// Set iterator to the start position + /// \brief Check if the input file is supported + template + static bool is_supported(const std::basic_string& path) { + std::ifstream pb_stream(path, std::ios::in | std::ifstream::binary); + auto graph_def = std::make_shared<::tensorflow::GraphDef>(); + return pb_stream && pb_stream.is_open() && graph_def->ParsePartialFromIstream(&pb_stream); + } + + /// \brief Set iterator to the start position void reset() override { node_index = 0; } + /// \brief Return a number of nodes in the graph size_t size() const override { return m_decoders.size(); } - /// Moves to the next node in the graph + /// \brief Move to the next node in the graph void next() override { node_index++; } + /// \brief Check if the graph is fully traversed bool is_end() const override { return node_index >= m_decoders.size(); } - /// Return NodeContext for the current node that iterator points to + /// \brief Return NodeContext for the current node that iterator points to std::shared_ptr get_decoder() const override { return m_decoders[node_index]; } + /// \brief Get GraphIterator for library function by name std::shared_ptr get_body_graph_iterator(const std::string& func_name) const override { if (m_library_map.count(func_name)) { auto func_ind = m_library_map.at(func_name); @@ -127,10 +138,12 @@ class
GraphIteratorProto : public GraphIterator { return nullptr; } + /// \brief Get input names in the original order. Used for the library functions std::vector get_input_names() const override { return m_input_names; } + /// \brief Get output names in the original order. Used for the library functions std::vector get_output_names() const override { return m_output_names; } diff --git a/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp b/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp index aed41eea03e94c..fa5c819b88b386 100644 --- a/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp +++ b/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp @@ -18,41 +18,44 @@ OutputVector translate_fake_quant_op(const NodeContext& node) { auto min = node.get_input(1); auto max = node.get_input(2); - auto narrow_range = node.get_attribute("narrow_range"); - auto num_bits = node.get_attribute("num_bits"); + // retrieve attributes + auto narrow_range = node.get_attribute("narrow_range", false); + auto num_bits = node.get_attribute("num_bits", 8); - size_t levels = static_cast(std::pow(2, num_bits) - int(narrow_range)); - auto min_less_max = make_shared(min, max); - auto minimum = make_shared(min_less_max, max, min); + size_t levels = static_cast(pow(2, num_bits)); + levels = narrow_range ? 
levels - 1 : levels; - auto zero = make_shared(min.get_element_type(), Shape{}, std::vector({0})); + // compute real min and max values + Output minimum = make_shared(min, max); + Output maximum = make_shared(min, max); + // adjust min and max so that min <= 0 + auto zero = make_shared(min.get_element_type(), Shape{}, 0); auto min_greater_zero = make_shared(minimum, zero); - auto max_minus_min = make_shared(maximum, minimum); + Output max_minus_min = make_shared(maximum, minimum); minimum = make_shared(min_greater_zero, max_minus_min, maximum); + // adjust min and max so that 0 <= max auto max_less_zero = make_shared(maximum, zero); auto min_minus_max = make_shared(minimum, maximum); minimum = make_shared(max_less_zero, zero, maximum); - auto float_range = make_shared(maximum, minimum); - auto quant_min_value = int(narrow_range); - auto quant_max_value = std::pow(2, num_bits) - 1; - auto value = static_cast(quant_max_value - quant_min_value); - auto int_range = make_shared(element::f32, Shape{}, std::vector({value})); - auto scale = make_shared(float_range, int_range); + // adjust min and max so that scale = (max - min) / (2^num_bits - 1), + // min_adj = scale * round(min / scale) and max_adj = max + min_adj - min + max_minus_min = make_shared(maximum, minimum); + auto const_levels = make_shared(element::f32, Shape{}, static_cast(levels - 1)); + auto scale = make_shared(max_minus_min, const_levels); auto descaled_min = make_shared(minimum, scale); auto rounded_descaled_min = make_shared(descaled_min, Round::RoundMode::HALF_TO_EVEN); auto min_adj = make_shared(scale, rounded_descaled_min); auto adjustment = make_shared(min_adj, minimum); auto max_adj = make_shared(maximum, adjustment); - auto res = make_shared(inputs, min_adj, max_adj, min_adj, max_adj, levels); - set_node_name(node.get_name(), res); - return {res}; + auto fake_quantize = make_shared(inputs, min_adj, max_adj, min_adj, max_adj, levels); + set_node_name(node.get_name(), fake_quantize); + return 
{fake_quantize}; } } // namespace op } // namespace tensorflow diff --git a/src/frontends/tensorflow_common/src/op/identity.cpp b/src/frontends/tensorflow_common/src/op/identity.cpp index 4dbc9e285a892b..7bd6d7735e37fb 100644 --- a/src/frontends/tensorflow_common/src/op/identity.cpp +++ b/src/frontends/tensorflow_common/src/op/identity.cpp @@ -14,13 +14,12 @@ namespace tensorflow { namespace op { OutputVector translate_identity_op(const NodeContext& node) { + vector supported_ops = {"Identity", "PreventGradient", "Snapshot", "StopGradient"}; + default_op_checks(node, 1, supported_ops); auto input = node.get_input(0); - // since the input node can have several outputs, and identity have only one input, - // we cannot use set_node_name(..) helper, we have to set names for output connected - // to this identity only. - // Node_1 -> Node_2 - // -(identity name) -> Identity + // set only tensor names + // no need to change node name since Identity node is skipped set_out_name(node.get_name(), input); set_out_name(node.get_name() + ":" + "0", input); return {input}; diff --git a/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp b/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp index ad94a8f03cda13..b849c06f2594c6 100644 --- a/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp +++ b/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp @@ -78,13 +78,21 @@ void get_activation(ov::OutputVector& output, output = ov::frontend::tensorflow::op::translate_relu_6_op(context); } else if (activation == "TANH") { output = ov::frontend::tensorflow::op::translate_unary_op(context); + } else if (activation == "RELU_N1_TO_1") { + auto clamp = std::make_shared(output[0], -1.0f, 1.0f); + clamp->set_friendly_name(context.get_name()); + output = clamp->outputs(); + } else if (activation == "SIGN_BIT") { + auto zero = std::make_shared(opset10::Constant::create(element::i32, {}, {0}), output[0]); + auto less = std::make_shared(output[0], zero); + 
less->set_friendly_name(context.get_name()); + output = less->outputs(); } else { - // TODO: Fused activation to support: - // RELU_N1_TO_1 = 2, - // SIGN_BIT = 5, - if (activation != "NONE") { - FRONT_END_THROW("Unknown Activation fused to " + node.get_decoder()->get_op_type() + ": " + activation); - } + FRONT_END_GENERAL_CHECK(activation == "NONE", + "Unknown Activation fused to ", + node.get_decoder()->get_op_type(), + ": ", + activation); } del_output_names(output); } diff --git a/src/inference/CMakeLists.txt b/src/inference/CMakeLists.txt index 03ded6608790f8..f3f436e57d50e1 100644 --- a/src/inference/CMakeLists.txt +++ b/src/inference/CMakeLists.txt @@ -14,6 +14,7 @@ file (GLOB LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/dev/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/dev/preprocessing/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/dev/threading/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/threading/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp_interfaces/interface/*.cpp diff --git a/src/inference/dev_api/ie_system_conf.h b/src/inference/dev_api/ie_system_conf.h index 17f1781c13ffab..408c626accf436 100644 --- a/src/inference/dev_api/ie_system_conf.h +++ b/src/inference/dev_api/ie_system_conf.h @@ -12,7 +12,7 @@ #include #include -#include "ie_api.h" +#include "openvino/runtime/system_conf.hpp" namespace InferenceEngine { @@ -23,7 +23,9 @@ namespace InferenceEngine { * @param[in] includeOMPNumThreads Indicates if the omp number threads is included * @return `True` if any OpenMP environment variable is defined, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) checkOpenMpEnvVars(bool includeOMPNumThreads = true); +inline bool checkOpenMpEnvVars(bool includeOMPNumThreads = true) { + return ov::check_open_mp_env_vars(includeOMPNumThreads); +} /** * @brief Returns available CPU NUMA nodes (on Linux, and Windows [only with TBB], single node is assumed on all @@ -31,7 +33,9 @@ 
INFERENCE_ENGINE_API_CPP(bool) checkOpenMpEnvVars(bool includeOMPNumThreads = tr * @ingroup ie_dev_api_system_conf * @return NUMA nodes */ -INFERENCE_ENGINE_API_CPP(std::vector) getAvailableNUMANodes(); +inline std::vector getAvailableNUMANodes() { + return ov::get_available_numa_nodes(); +} /** * @brief Returns available CPU cores types (on Linux, and Windows) and ONLY with TBB, single core type is assumed @@ -39,7 +43,9 @@ INFERENCE_ENGINE_API_CPP(std::vector) getAvailableNUMANodes(); * @ingroup ie_dev_api_system_conf * @return Vector of core types */ -INFERENCE_ENGINE_API_CPP(std::vector) getAvailableCoresTypes(); +inline std::vector getAvailableCoresTypes() { + return ov::get_available_cores_types(); +} /** * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance @@ -50,7 +56,9 @@ INFERENCE_ENGINE_API_CPP(std::vector) getAvailableCoresTypes(); * @param[in] bigCoresOnly Additionally limits the number of reported cores to the 'Big' cores only. * @return Number of physical CPU cores. */ -INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(bool bigCoresOnly = false); +inline int getNumberOfCPUCores(bool bigCoresOnly = false) { + return ov::get_number_of_cpu_cores(bigCoresOnly); +} /** * @brief Returns number of CPU logical cores on Linux/Windows (on other OSes it simply relies on the original @@ -60,80 +68,81 @@ INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(bool bigCoresOnly = false); * @param[in] bigCoresOnly Additionally limits the number of reported cores to the 'Big' cores only. * @return Number of logical CPU cores. 
*/ -INFERENCE_ENGINE_API_CPP(int) getNumberOfLogicalCPUCores(bool bigCoresOnly = false); +inline int getNumberOfLogicalCPUCores(bool bigCoresOnly = false) { + return ov::get_number_of_logical_cpu_cores(bigCoresOnly); +} /** * @brief Checks whether CPU supports SSE 4.2 capability * @ingroup ie_dev_api_system_conf * @return `True` is SSE 4.2 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_sse42(); +using ov::with_cpu_x86_sse42; /** * @brief Checks whether CPU supports AVX capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx(); +using ov::with_cpu_x86_avx; /** * @brief Checks whether CPU supports AVX2 capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX2 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx2(); +using ov::with_cpu_x86_avx2; /** * @brief Checks whether CPU supports AVX 512 capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX512F (foundation) instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512f(); +using ov::with_cpu_x86_avx512f; /** * @brief Checks whether CPU supports AVX 512 capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX512F, AVX512BW, AVX512DQ instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core(); +using ov::with_cpu_x86_avx512_core; /** * @brief Checks whether CPU supports AVX 512 VNNI capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX512F, AVX512BW, AVX512DQ, AVX512_VNNI instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_vnni(); +using ov::with_cpu_x86_avx512_core_vnni; /** * @brief Checks whether CPU supports BFloat16 capability * @ingroup ie_dev_api_system_conf * @return `True` is tAVX512_BF16 instructions are 
available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_bfloat16(); +using ov::with_cpu_x86_bfloat16; /** * @brief Checks whether CPU supports AMX int8 capability * @ingroup ie_dev_api_system_conf * @return `True` is tAMX_INT8 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx_int8(); +using ov::with_cpu_x86_avx512_core_amx_int8; /** * @brief Checks whether CPU supports AMX bf16 capability * @ingroup ie_dev_api_system_conf * @return `True` is tAMX_BF16 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx_bf16(); +using ov::with_cpu_x86_avx512_core_amx_bf16; /** * @brief Checks whether CPU supports AMX capability * @ingroup ie_dev_api_system_conf * @return `True` is tAMX_INT8 or tAMX_BF16 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx(); +using ov::with_cpu_x86_avx512_core_amx; /** - * @enum column_of_processor_type_table * @brief This enum contains defination of each columns in processor type table which bases on cpu core types. Will * extend to support other CPU core type like ARM. 
* @@ -150,16 +159,9 @@ INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx(); * ALL_PROC | MAIN_CORE_PROC | EFFICIENT_CORE_PROC | HYPER_THREADING_PROC * 32 8 16 8 // Total number of one socket */ -typedef enum { - ALL_PROC = 0, //!< All processors, regardless of backend cpu - MAIN_CORE_PROC = 1, //!< Processor based on physical core of Intel Performance-cores - EFFICIENT_CORE_PROC = 2, //!< Processor based on Intel Efficient-cores - HYPER_THREADING_PROC = 3, //!< Processor based on logical core of Intel Performance-cores - PROC_TYPE_TABLE_SIZE = 4 //!< Size of processor type table -} column_of_processor_type_table; +using ov::ColumnOfProcessorTypeTable; /** - * @enum column_of_cpu_mapping_table * @brief This enum contains defination of each columns in CPU mapping table which use processor id as index. * * GROUP_ID is generated according to the following rules. @@ -181,14 +183,6 @@ typedef enum { * 6 0 4 2 2 0 * 7 0 5 2 2 0 */ -typedef enum { - CPU_MAP_PROCESSOR_ID = 0, //!< column for processor id of the processor - CPU_MAP_SOCKET_ID = 1, //!< column for socket id of the processor - CPU_MAP_CORE_ID = 2, //!< column for hardware core id of the processor - CPU_MAP_CORE_TYPE = 3, //!< column for CPU core type corresponding to the processor - CPU_MAP_GROUP_ID = 4, //!< column for group id to the processor. Processors in one group have dependency. 
- CPU_MAP_USED_FLAG = 5, //!< column for resource management of the processor - CPU_MAP_TABLE_SIZE = 6 //!< Size of CPU mapping table -} column_of_cpu_mapping_table; +using ov::ColumnOfCPUMappingTable; } // namespace InferenceEngine diff --git a/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp b/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp index 687b05030cd566..628c2c651f85a9 100644 --- a/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp +++ b/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp @@ -17,7 +17,7 @@ #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/profiling_info.hpp" #include "openvino/runtime/tensor.hpp" -#include "threading/ie_itask_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" namespace ov { @@ -37,8 +37,8 @@ namespace ov { class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { public: IAsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor); + const std::shared_ptr& task_executor, + const std::shared_ptr& callback_executor); ~IAsyncInferRequest(); /** @@ -153,7 +153,7 @@ class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { const std::vector>& get_outputs() const override; protected: - using Stage = std::pair; + using Stage = std::pair, ov::threading::Task>; /** * @brief Pipeline is vector of stages */ @@ -212,11 +212,11 @@ class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { void run_first_stage(const Pipeline::iterator itBeginStage, const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor = {}); + const std::shared_ptr callbackExecutor = {}); - InferenceEngine::Task make_next_stage_task(const Pipeline::iterator itStage, - const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor); + 
ov::threading::Task make_next_stage_task(const Pipeline::iterator itStage, + const Pipeline::iterator itEndStage, + const std::shared_ptr callbackExecutor); template void infer_impl(const F& f) { @@ -264,10 +264,10 @@ class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { std::shared_ptr m_sync_request; - InferenceEngine::ITaskExecutor::Ptr m_request_executor; //!< Used to run inference CPU tasks. - InferenceEngine::ITaskExecutor::Ptr + std::shared_ptr m_request_executor; //!< Used to run inference CPU tasks. + std::shared_ptr m_callback_executor; //!< Used to run post inference callback in asynchronous pipline - InferenceEngine::ITaskExecutor::Ptr + std::shared_ptr m_sync_callback_executor; //!< Used to run post inference callback in synchronous pipline mutable std::mutex m_mutex; std::function m_callback; diff --git a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp index 44c701c1d42be8..c95feba6cc1042 100644 --- a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp +++ b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp @@ -17,8 +17,8 @@ #include "openvino/runtime/common.hpp" #include "openvino/runtime/isync_infer_request.hpp" #include "openvino/runtime/remote_context.hpp" -#include "threading/ie_cpu_streams_executor.hpp" -#include "threading/ie_itask_executor.hpp" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" namespace InferenceEngine { class ICompiledModelWrapper; @@ -47,14 +47,13 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< * * @param callback_executor Callback executor (CPUStreamsExecutor by default) */ - ICompiledModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor = - std::make_shared(InferenceEngine::IStreamsExecutor::Config{ - "Default"}), - const 
InferenceEngine::ITaskExecutor::Ptr& callback_executor = - std::make_shared(InferenceEngine::IStreamsExecutor::Config{ - "Callback"})); + ICompiledModel( + const std::shared_ptr& model, + const std::shared_ptr& plugin, + const std::shared_ptr& task_executor = + std::make_shared(ov::threading::IStreamsExecutor::Config{"Default"}), + const std::shared_ptr& callback_executor = + std::make_shared(ov::threading::IStreamsExecutor::Config{"Callback"})); /** * @brief Gets all outputs from compiled model @@ -119,8 +118,8 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< std::vector> m_inputs; std::vector> m_outputs; - InferenceEngine::ITaskExecutor::Ptr m_task_executor = nullptr; //!< Holds a task executor - InferenceEngine::ITaskExecutor::Ptr m_callback_executor = nullptr; //!< Holds a callback executor + std::shared_ptr m_task_executor = nullptr; //!< Holds a task executor + std::shared_ptr m_callback_executor = nullptr; //!< Holds a callback executor friend ov::CoreImpl; friend ov::IExecutableNetworkWrapper; @@ -146,7 +145,7 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< /** * @brief Default implementation of create async inter request method * - * @tparam AsyncInferRequestType Async infer request type. InferenceEngine::AsyncInferRequestThreadSafeDefault by + * @tparam AsyncInferRequestType Async infer request type. 
ov::IAsyncInferRequest by * default * * @return Asynchronous infer request @@ -163,8 +162,8 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< * @return OpenVINO Plugin interface */ const std::shared_ptr& get_plugin() const; - const InferenceEngine::ITaskExecutor::Ptr get_task_executor() const; - const InferenceEngine::ITaskExecutor::Ptr get_callback_executor() const; + const std::shared_ptr get_task_executor() const; + const std::shared_ptr get_callback_executor() const; }; } // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/icore.hpp b/src/inference/dev_api/openvino/runtime/icore.hpp index b8ea63086169ea..5e7e9401312cf2 100644 --- a/src/inference/dev_api/openvino/runtime/icore.hpp +++ b/src/inference/dev_api/openvino/runtime/icore.hpp @@ -21,7 +21,7 @@ namespace ov { /** * @interface ICore * @brief Minimal ICore interface to allow plugin to get information from Core Inference Engine class. - * @ingroup ie_dev_api_plugin_api + * @ingroup ov_dev_api_plugin_api */ class ICore { public: diff --git a/src/inference/dev_api/openvino/runtime/iplugin.hpp b/src/inference/dev_api/openvino/runtime/iplugin.hpp index 47f576b46bfd52..653d44c0fc2198 100644 --- a/src/inference/dev_api/openvino/runtime/iplugin.hpp +++ b/src/inference/dev_api/openvino/runtime/iplugin.hpp @@ -19,7 +19,7 @@ #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/icore.hpp" #include "openvino/runtime/remote_context.hpp" -#include "threading/ie_executor_manager.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" namespace InferenceEngine { @@ -188,7 +188,7 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this& get_executor_manager() const; + const std::shared_ptr& get_executor_manager() const; ~IPlugin() = default; @@ -198,18 +198,18 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this m_core; //!< A pointer to ICore interface - std::shared_ptr m_executor_manager; //!< A tasks 
execution manager - ov::Version m_version; //!< Member contains plugin version - bool m_is_new_api; //!< A flag which shows used API + std::string m_plugin_name; //!< A device name that plugins enables + std::weak_ptr m_core; //!< A pointer to ICore interface + std::shared_ptr m_executor_manager; //!< A tasks execution manager + ov::Version m_version; //!< Member contains plugin version + bool m_is_new_api; //!< A flag which shows used API }; } // namespace ov /** * @def OV_CREATE_PLUGIN * @brief Defines a name of a function creating plugin instance - * @ingroup ie_dev_api_plugin_api + * @ingroup ov_dev_api_plugin_api */ #ifndef OV_CREATE_PLUGIN # define OV_CREATE_PLUGIN CreatePluginEngine diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp b/src/inference/dev_api/openvino/runtime/system_conf.hpp new file mode 100644 index 00000000000000..216d059ed357d2 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp @@ -0,0 +1,193 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief Abstraction over platform specific implementations + * @file openvino/runtime/system_conf.hpp + */ + +#pragma once + +#include + +#include "openvino/runtime/common.hpp" + +namespace ov { + +/** + * @brief Checks whether OpenMP environment variables are defined + * @ingroup ov_dev_api_system_conf + * + * @param[in] include_omp_num_threads Indicates if the omp number threads is included + * @return `True` if any OpenMP environment variable is defined, `false` otherwise + */ +OPENVINO_RUNTIME_API bool check_open_mp_env_vars(bool include_omp_num_threads = true); + +/** + * @brief Returns available CPU NUMA nodes (on Linux, and Windows [only with TBB], single node is assumed on all + * other OSes) + * @ingroup ov_dev_api_system_conf + * @return NUMA nodes + */ +OPENVINO_RUNTIME_API std::vector get_available_numa_nodes(); + +/** + * @brief Returns available CPU cores types (on Linux, and Windows) and ONLY 
with TBB, single core type is assumed + * otherwise + * @ingroup ov_dev_api_system_conf + * @return Vector of core types + */ +OPENVINO_RUNTIME_API std::vector get_available_cores_types(); + +/** + * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance + * friendly for servers) (on other OSes it simply relies on the original parallel API of choice, which usually uses the + * logical cores). call function with 'false' to get #phys cores of all types call function with 'true' to get #phys + * 'Big' cores number of 'Little' = 'all' - 'Big' + * @ingroup ov_dev_api_system_conf + * @param[in] big_cores_only Additionally limits the number of reported cores to the 'Big' cores only. + * @return Number of physical CPU cores. + */ +OPENVINO_RUNTIME_API int get_number_of_cpu_cores(bool big_cores_only = false); + +/** + * @brief Returns number of CPU logical cores on Linux/Windows (on other OSes it simply relies on the original + * parallel API of choice, which uses the 'all' logical cores). call function with 'false' to get #logical cores of + * all types call function with 'true' to get #logical 'Big' cores number of 'Little' = 'all' - 'Big' + * @ingroup ov_dev_api_system_conf + * @param[in] big_cores_only Additionally limits the number of reported cores to the 'Big' cores only. + * @return Number of logical CPU cores. 
+ */ +OPENVINO_RUNTIME_API int get_number_of_logical_cpu_cores(bool big_cores_only = false); + +/** + * @brief Checks whether CPU supports SSE 4.2 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if SSE 4.2 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_sse42(); + +/** + * @brief Checks whether CPU supports AVX capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx(); + +/** + * @brief Checks whether CPU supports AVX2 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX2 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx2(); + +/** + * @brief Checks whether CPU supports AVX 512 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX512F (foundation) instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512f(); + +/** + * @brief Checks whether CPU supports AVX 512 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX512F, AVX512BW, AVX512DQ instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core(); + +/** + * @brief Checks whether CPU supports AVX 512 VNNI capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX512F, AVX512BW, AVX512DQ, AVX512_VNNI instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_vnni(); + +/** + * @brief Checks whether CPU supports BFloat16 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAVX512_BF16 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_bfloat16(); + +/** + * @brief Checks whether CPU supports AMX int8 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAMX_INT8 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API
bool with_cpu_x86_avx512_core_amx_int8(); + +/** + * @brief Checks whether CPU supports AMX bf16 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAMX_BF16 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_amx_bf16(); + +/** + * @brief Checks whether CPU supports AMX capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAMX_INT8 or tAMX_BF16 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_amx(); + +/** + * @enum ColumnOfProcessorTypeTable + * @brief This enum contains definition of each column in processor type table which is based on cpu core types. Will + * extend to support other CPU core types like ARM. + * + * The following are two examples of processor type table. + * 1. Processor table of two socket CPUs XEON server + * + * ALL_PROC | MAIN_CORE_PROC | EFFICIENT_CORE_PROC | HYPER_THREADING_PROC + * 96 48 0 48 // Total number of two sockets + * 48 24 0 24 // Number of socket one + * 48 24 0 24 // Number of socket two + * + * 2. Processor table of one socket CPU desktop + * + * ALL_PROC | MAIN_CORE_PROC | EFFICIENT_CORE_PROC | HYPER_THREADING_PROC + * 32 8 16 8 // Total number of one socket + */ +enum ColumnOfProcessorTypeTable { + ALL_PROC = 0, //!< All processors, regardless of backend cpu + MAIN_CORE_PROC = 1, //!< Processor based on physical core of Intel Performance-cores + EFFICIENT_CORE_PROC = 2, //!< Processor based on Intel Efficient-cores + HYPER_THREADING_PROC = 3, //!< Processor based on logical core of Intel Performance-cores + PROC_TYPE_TABLE_SIZE = 4 //!< Size of processor type table +}; + +/** + * @enum ColumnOfCPUMappingTable + * @brief This enum contains definition of each column in CPU mapping table which uses processor id as index. + * + * GROUP_ID is generated according to the following rules. + * 1.
If one MAIN_CORE_PROC and one HYPER_THREADING_PROC are based on the same Performance-cores, they are in one group. + * 2. If some EFFICIENT_CORE_PROC share one L2 cache, they are in one group. + * 3. There are no duplicate group IDs in the system + * + * The following is an example of CPU mapping table. + * 1. Four processors of two Pcore + * 2. Four processors of four Ecores shared L2 cache + * + * PROCESSOR_ID | SOCKET_ID | CORE_ID | CORE_TYPE | GROUP_ID | Used + * 0 0 0 3 0 0 + * 1 0 0 1 0 0 + * 2 0 1 3 1 0 + * 3 0 1 1 1 0 + * 4 0 2 2 2 0 + * 5 0 3 2 2 0 + * 6 0 4 2 2 0 + * 7 0 5 2 2 0 + */ +enum ColumnOfCPUMappingTable { + CPU_MAP_PROCESSOR_ID = 0, //!< column for processor id of the processor + CPU_MAP_SOCKET_ID = 1, //!< column for socket id of the processor + CPU_MAP_CORE_ID = 2, //!< column for hardware core id of the processor + CPU_MAP_CORE_TYPE = 3, //!< column for CPU core type corresponding to the processor + CPU_MAP_GROUP_ID = 4, //!< column for group id to the processor. Processors in one group have dependency. + CPU_MAP_USED_FLAG = 5, //!< column for resource management of the processor + CPU_MAP_TABLE_SIZE = 6 //!< Size of CPU mapping table +}; + +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp new file mode 100644 index 00000000000000..a59986665a6524 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/cpu_streams_executor.hpp + * @brief A header file for OpenVINO CPU-Streams-based Executor implementation.
+ */ + +#pragma once + +#include +#include + +#include "openvino/runtime/common.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" + +namespace ov { +namespace threading { + +/** + * @class CPUStreamsExecutor + * @ingroup ov_dev_api_threading + * @brief CPU Streams executor implementation. The executor splits the CPU into groups of threads, + * that can be pinned to cores or NUMA nodes. + * It uses custom threads to pull tasks from single queue. + */ +class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor { +public: + /** + * @brief Constructor + * @param config Stream executor parameters + */ + explicit CPUStreamsExecutor(const Config& config); + + /** + * @brief A class destructor + */ + ~CPUStreamsExecutor() override; + + void run(Task task) override; + + void execute(Task task) override; + + int get_stream_id() override; + + int get_numa_node_id() override; + +private: + struct Impl; + std::unique_ptr _impl; +}; + +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/executor_manager.hpp b/src/inference/dev_api/openvino/runtime/threading/executor_manager.hpp new file mode 100644 index 00000000000000..6e7735a6906056 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/executor_manager.hpp @@ -0,0 +1,80 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief OpenVINO Runtime Executor Manager + * @file openvino/runtime/threading/executor_manager.hpp + */ + +#pragma once + +#include "openvino/runtime/common.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" +#include "threading/ie_istreams_executor.hpp" + +namespace ov { + +namespace threading { + +/** + * @interface ExecutorManager + * @brief Interface for tasks execution manager. + * This is global point for getting task executor objects by string id. 
+ * It's necessary in multiple asynchronous requests for having unique executors to avoid oversubscription. + * E.g. There 2 task executors for CPU device: one - in FPGA, another - in OneDNN. Parallel execution both of them leads + * to not optimal CPU usage. More efficient to run the corresponding tasks one by one via single executor. + * @ingroup ov_dev_api_threading + */ +class OPENVINO_RUNTIME_API ExecutorManager { +public: + /** + * @brief Returns executor by unique identificator + * @param id An unique identificator of device (Usually string representation of TargetDevice) + * @return A shared pointer to existing or newly ITaskExecutor + */ + virtual std::shared_ptr get_executor(const std::string& id) = 0; + + /** + * @brief Returns idle cpu streams executor + * + * @param config Streams executor config + * + * @return pointer to streams executor config + */ + virtual std::shared_ptr get_idle_cpu_streams_executor( + const ov::threading::IStreamsExecutor::Config& config) = 0; + + /** + * @brief Allows to configure executor manager + * + * @param properties map with configuration + */ + virtual void set_property(const ov::AnyMap& properties) = 0; + /** + * @brief Returns configuration + * + * @param name property name + * + * @return Property value + */ + virtual ov::Any get_property(const std::string& name) const = 0; + + /** + * @cond + */ + virtual size_t get_executors_number() const = 0; + + virtual size_t get_idle_cpu_streams_executors_number() const = 0; + + virtual void clear(const std::string& id = {}) = 0; + /** + * @endcond + */ + virtual ~ExecutorManager() = default; +}; + +OPENVINO_API std::shared_ptr executor_manager(); +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp new file mode 100644 index 00000000000000..aead0f07cc1418 --- /dev/null +++ 
b/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp @@ -0,0 +1,168 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/istreams_executor.hpp + * @brief A header file for OpenVINO Streams-based Executor Interface + */ + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/common.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" + +namespace ov { +namespace threading { + +/** + * @interface IStreamsExecutor + * @ingroup ov_dev_api_threading + * @brief Interface for Streams Task Executor. This executor groups worker threads into so-called `streams`. + * @par CPU + * The executor executes all parallel tasks using threads from one stream. + * With proper pinning settings it should reduce cache misses for memory bound workloads. + * @par NUMA + * On NUMA hosts get_numa_node_id() method can be used to define the NUMA node of current stream + */ +class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { +public: + /** + * @brief Defines inference thread binding type + */ + enum ThreadBindingType : std::uint8_t { + NONE, //!< Don't bind the inference threads + CORES, //!< Bind inference threads to the CPU cores (round-robin) + // the following modes are implemented only for the TBB code-path: + NUMA, //!< Bind to the NUMA nodes (default mode for the non-hybrid CPUs on the Win/MacOS, where the 'CORES' is + //!< not implemented) + HYBRID_AWARE //!< Let the runtime bind the inference threads depending on the cores type (default mode for the + //!< hybrid CPUs) + }; + + /** + * @brief Defines IStreamsExecutor configuration + */ + struct OPENVINO_RUNTIME_API Config { + /** + * @brief Sets configuration + * @param properties map of properties + */ + void set_property(const ov::AnyMap& properties); + + /** + * @brief Sets configuration + * @param key property name + * @param value property value + */ + void set_property(const std::string&
key, const ov::Any& value); + + /** + * @brief Return configuration value + * @param key configuration key + * @return configuration value wrapped into ov::Any + */ + ov::Any get_property(const std::string& key) const; + + /** + * @brief Create appropriate multithreaded configuration + * filling unconfigured values from initial configuration using hardware properties + * @param initial Initial configuration + * @param fp_intesive additional hint for the (Hybrid) core-types selection logic + * whether the executor should be configured for floating point intensive work (as opposed to int8 + * intensive) + * @return configured values + */ + static Config make_default_multi_threaded(const Config& initial, const bool fp_intesive = true); + static int get_default_num_streams( + const bool enable_hyper_thread = true); // no network specifics considered (only CPU's caps); + static int get_hybrid_num_streams(std::map& config, const int stream_mode); + static void update_hybrid_custom_threads(Config& config); + + std::string _name; //!< Used by `ITT` to name executor threads + int _streams = 1; //!< Number of streams. + int _threadsPerStream = 0; //!< Number of threads per stream that executes `ie_parallel` calls + ThreadBindingType _threadBindingType = ThreadBindingType::NONE; //!< Thread binding to hardware resource type. + //!< No binding by default + int _threadBindingStep = 1; //!< In case of @ref CORES binding offset type + //!< thread bound to cores with defined step + int _threadBindingOffset = 0; //!< In case of @ref CORES binding offset type thread bound to cores + //!< starting from offset + int _threads = 0; //!< Number of threads distributed between streams. + //!< Reserved. Should not be used.
+ int _big_core_streams = 0; //!< Number of streams in Performance-core(big core) + int _small_core_streams = 0; //!< Number of streams in Efficient-core(small core) + int _threads_per_stream_big = 0; //!< Threads per stream in big cores + int _threads_per_stream_small = 0; //!< Threads per stream in small cores + int _small_core_offset = 0; //!< Calculate small core start offset when binding cpu cores + bool _enable_hyper_thread = true; //!< enable hyper thread + enum StreamMode { DEFAULT, AGGRESSIVE, LESSAGGRESSIVE }; + enum PreferredCoreType { + ANY, + LITTLE, + BIG, + ROUND_ROBIN // used w/multiple streams to populate the Big cores first, then the Little, then wrap around + // (for large #streams) + } _threadPreferredCoreType = + PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB to affinitize + + /** + * @brief A constructor with arguments + * + * @param[in] name The executor name + * @param[in] streams @copybrief Config::_streams + * @param[in] threadsPerStream @copybrief Config::_threadsPerStream + * @param[in] threadBindingType @copybrief Config::_threadBindingType + * @param[in] threadBindingStep @copybrief Config::_threadBindingStep + * @param[in] threadBindingOffset @copybrief Config::_threadBindingOffset + * @param[in] threads @copybrief Config::_threads + * @param[in] threadPreferredCoreType @copybrief Config::_threadPreferredCoreType + */ + Config(std::string name = "StreamsExecutor", + int streams = 1, + int threadsPerStream = 0, + ThreadBindingType threadBindingType = ThreadBindingType::NONE, + int threadBindingStep = 1, + int threadBindingOffset = 0, + int threads = 0, + PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) + : _name{name}, + _streams{streams}, + _threadsPerStream{threadsPerStream}, + _threadBindingType{threadBindingType}, + _threadBindingStep{threadBindingStep}, + _threadBindingOffset{threadBindingOffset}, + _threads{threads}, + _threadPreferredCoreType(threadPreferredCoreType) {} + }; + + /** + *
@brief A virtual destructor + */ + ~IStreamsExecutor() override; + + /** + * @brief Return the index of current stream + * @return An index of current stream. Or throw exceptions if called not from stream thread + */ + virtual int get_stream_id() = 0; + + /** + * @brief Return the id of current NUMA Node + * @return `ID` of current NUMA Node, or throws exceptions if called not from stream thread + */ + virtual int get_numa_node_id() = 0; + + /** + * @brief Execute the task in the current thread using streams executor configuration and constraints + * @param task A task to start + */ + virtual void execute(Task task) = 0; +}; + +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/itask_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/itask_executor.hpp new file mode 100644 index 00000000000000..3cb42e3200bb0c --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/itask_executor.hpp @@ -0,0 +1,76 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/task_executor.hpp + * @brief A header file for OpenVINO Task Executor Interface + */ + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/common.hpp" + +namespace ov { +namespace threading { + +/** + * @brief OpenVINO Task Executor can use any copyable callable without parameters and output as a task. + * It would be wrapped into std::function object + * @ingroup ov_dev_api_threading + */ +using Task = std::function; + +/** +* @interface ITaskExecutor +* @ingroup ov_dev_api_threading +* @brief Interface for Task Executor. +* OpenVINO uses `ov::ITaskExecutor` interface to run all asynchronous internal tasks. +* Different implementations of task executors can be used for different purposes: +* - To improve cache locality of memory bound CPU tasks some executors can limit task's affinity and maximum +concurrency. 
+* - The executor with one worker thread can be used to serialize access to acceleration device. +* - Immediate task executor can be used to satisfy `ov::threading::ITaskExecutor` interface restrictions but +run tasks in current thread. +* @note Implementation should guarantee thread safety of all methods +* @section Synchronization +* It is `ov::threading::ITaskExecutor` user responsibility to wait for task execution completion. +* The `c++11` standard way to wait task completion is to use `std::packaged_task` or `std::promise` with +`std::future`. +* Here is an example of how to use `std::promise` to wait task completion and process task's exceptions: + * @snippet example_itask_executor.cpp itask_executor:define_pipeline + */ +class OPENVINO_RUNTIME_API ITaskExecutor { +public: + /** + * @brief Destroys the object. + */ + virtual ~ITaskExecutor() = default; + + /** + * @brief Execute ov::threading::Task inside task executor context + * @param task A task to start + */ + virtual void run(Task task) = 0; + + /** + * @brief Executes all of the tasks and waits for their completion. + * Default run_and_wait() method implementation uses run() pure virtual method + * and higher level synchronization primitives from STL. + * The task is wrapped into std::packaged_task which returns std::future. + * std::packaged_task will call the task and signal to std::future that the task is finished + * or the exception is thrown from task + * Then std::future is used to wait for task execution completion and + * task exception extraction + * @note run_and_wait() does not copy or capture tasks!
+ * @param tasks A vector of tasks to execute + */ + virtual void run_and_wait(const std::vector& tasks); +}; + +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp b/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp index f4b31d95fc8cb0..12c2232a572e5d 100644 --- a/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp +++ b/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp @@ -33,7 +33,7 @@ class INFERENCE_ENGINE_API_CLASS(CPUStreamsExecutor) : public IStreamsExecutor { * @brief Constructor * @param config Stream executor parameters */ - explicit CPUStreamsExecutor(const Config& config = {}); + explicit CPUStreamsExecutor(const InferenceEngine::IStreamsExecutor::Config& config = {}); /** * @brief A class destructor diff --git a/src/inference/dev_api/threading/ie_executor_manager.hpp b/src/inference/dev_api/threading/ie_executor_manager.hpp index 2504884d071d95..ef789c82c48234 100644 --- a/src/inference/dev_api/threading/ie_executor_manager.hpp +++ b/src/inference/dev_api/threading/ie_executor_manager.hpp @@ -18,8 +18,18 @@ #include "threading/ie_istreams_executor.hpp" #include "threading/ie_itask_executor.hpp" +namespace ov { +namespace threading { + +class ExecutorManager; + +} +} // namespace ov + namespace InferenceEngine { +class IPluginWrapper; + /** * @interface ExecutorManager * @brief Interface for tasks execution manager. 
@@ -76,8 +86,15 @@ class INFERENCE_ENGINE_API_CLASS(ExecutorManager) { */ virtual void setTbbFlag(bool flag) = 0; virtual bool getTbbFlag() = 0; + +private: + virtual std::shared_ptr get_ov_manager() const = 0; + friend class IPluginWrapper; }; INFERENCE_ENGINE_API_CPP(ExecutorManager::Ptr) executorManager(); +std::shared_ptr create_old_manager( + const std::shared_ptr& manager); + } // namespace InferenceEngine diff --git a/src/inference/dev_api/threading/ie_istreams_executor.hpp b/src/inference/dev_api/threading/ie_istreams_executor.hpp index efecaf606faa32..bb2bbeca0b70d2 100644 --- a/src/inference/dev_api/threading/ie_istreams_executor.hpp +++ b/src/inference/dev_api/threading/ie_istreams_executor.hpp @@ -14,6 +14,7 @@ #include #include "ie_parameter.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" #include "threading/ie_itask_executor.hpp" namespace InferenceEngine { @@ -28,30 +29,17 @@ namespace InferenceEngine { * @par NUMA * On NUMA hosts GetNumaNodeId() method can be used to define the NUMA node of current stream */ -class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { +class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, public ov::threading::IStreamsExecutor { public: /** * A shared pointer to IStreamsExecutor interface */ using Ptr = std::shared_ptr; - /** - * @brief Defines inference thread binding type - */ - enum ThreadBindingType : std::uint8_t { - NONE, //!< Don't bind the inference threads - CORES, //!< Bind inference threads to the CPU cores (round-robin) - // the following modes are implemented only for the TBB code-path: - NUMA, //!< Bind to the NUMA nodes (default mode for the non-hybrid CPUs on the Win/MacOS, where the 'CORES' is - //!< not implemeneted) - HYBRID_AWARE //!< Let the runtime bind the inference threads depending on the cores type (default mode for the - //!< hybrid CPUs) - }; - /** * @brief Defines IStreamsExecutor configuration */ - struct 
INFERENCE_ENGINE_API_CLASS(Config) { + struct INFERENCE_ENGINE_API_CLASS(Config) : public ov::threading::IStreamsExecutor::Config { /** * @brief Supported Configuration keys * @return vector of supported configuration keys @@ -87,33 +75,6 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { static int GetHybridNumStreams(std::map& config, const int stream_mode); static void UpdateHybridCustomThreads(Config& config); - std::string _name; //!< Used by `ITT` to name executor threads - int _streams = 1; //!< Number of streams. - int _threadsPerStream = 0; //!< Number of threads per stream that executes `ie_parallel` calls - ThreadBindingType _threadBindingType = ThreadBindingType::NONE; //!< Thread binding to hardware resource type. - //!< No binding by default - int _threadBindingStep = 1; //!< In case of @ref CORES binding offset type - //!< thread binded to cores with defined step - int _threadBindingOffset = 0; //!< In case of @ref CORES binding offset type thread binded to cores - //!< starting from offset - int _threads = 0; //!< Number of threads distributed between streams. - //!< Reserved. Should not be used. 
- int _big_core_streams = 0; //!< Number of streams in Performance-core(big core) - int _small_core_streams = 0; //!< Number of streams in Efficient-core(small core) - int _threads_per_stream_big = 0; //!< Threads per stream in big cores - int _threads_per_stream_small = 0; //!< Threads per stream in small cores - int _small_core_offset = 0; //!< Calculate small core start offset when binding cpu cores - bool _enable_hyper_thread = true; //!< enable hyper thread - enum StreamMode { DEFAULT, AGGRESSIVE, LESSAGGRESSIVE }; - enum PreferredCoreType { - ANY, - LITTLE, - BIG, - ROUND_ROBIN // used w/multiple streams to populate the Big cores first, then the Little, then wrap around - // (for large #streams) - } _threadPreferredCoreType = - PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB to affinitize - /** * @brief A constructor with arguments * @@ -134,14 +95,17 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { int threadBindingOffset = 0, int threads = 0, PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) - : _name{name}, - _streams{streams}, - _threadsPerStream{threadsPerStream}, - _threadBindingType{threadBindingType}, - _threadBindingStep{threadBindingStep}, - _threadBindingOffset{threadBindingOffset}, - _threads{threads}, - _threadPreferredCoreType(threadPreferredCoreType) {} + : ov::threading::IStreamsExecutor::Config(name, + streams, + threadsPerStream, + threadBindingType, + threadBindingStep, + threadBindingOffset, + threads, + threadPreferredCoreType) {} + + Config(const ov::threading::IStreamsExecutor::Config& config) + : ov::threading::IStreamsExecutor::Config(config) {} }; /** @@ -166,6 +130,18 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { * @param task A task to start */ virtual void Execute(Task task) = 0; + + int get_stream_id() override { + return GetStreamId(); + } + + int get_numa_node_id() override { + return GetNumaNodeId(); + } + + void 
execute(Task task) override { + Execute(task); + } }; } // namespace InferenceEngine diff --git a/src/inference/dev_api/threading/ie_itask_executor.hpp b/src/inference/dev_api/threading/ie_itask_executor.hpp index 90557d08f9f92c..1fc2923fca92e7 100644 --- a/src/inference/dev_api/threading/ie_itask_executor.hpp +++ b/src/inference/dev_api/threading/ie_itask_executor.hpp @@ -14,6 +14,7 @@ #include #include "ie_api.h" +#include "openvino/runtime/threading/itask_executor.hpp" namespace InferenceEngine { @@ -22,7 +23,7 @@ namespace InferenceEngine { * It would be wrapped into std::function object * @ingroup ie_dev_api_threading */ -using Task = std::function; +using Task = ov::threading::Task; /** * @interface ITaskExecutor @@ -36,14 +37,13 @@ concurrency. * - Immediate task executor can be used to satisfy `InferenceEngine::ITaskExecutor` interface restrictions but run tasks in current thread. * @note Implementation should guaranty thread safety of all methods -* @section Synchronization * It is `InferenceEngine::ITaskExecutor` user responsibility to wait for task execution completion. * The `c++11` standard way to wait task completion is to use `std::packaged_task` or `std::promise` with `std::future`. * Here is an example of how to use `std::promise` to wait task completion and process task's exceptions: * @snippet example_itask_executor.cpp itask_executor:define_pipeline */ -class INFERENCE_ENGINE_API_CLASS(ITaskExecutor) { +class INFERENCE_ENGINE_API_CLASS(ITaskExecutor) : virtual public ov::threading::ITaskExecutor { public: /** * A shared pointer to ITaskExecutor interface @@ -55,12 +55,6 @@ class INFERENCE_ENGINE_API_CLASS(ITaskExecutor) { */ virtual ~ITaskExecutor() = default; - /** - * @brief Execute InferenceEngine::Task inside task executor context - * @param task A task to start - */ - virtual void run(Task task) = 0; - /** * @brief Execute all of the tasks and waits for its completion. 
* Default runAndWait() method implementation uses run() pure virtual method diff --git a/src/inference/include/openvino/runtime/core.hpp b/src/inference/include/openvino/runtime/core.hpp index a349378e8e32b1..55250ec3c5484f 100644 --- a/src/inference/include/openvino/runtime/core.hpp +++ b/src/inference/include/openvino/runtime/core.hpp @@ -733,6 +733,6 @@ class OPENVINO_RUNTIME_API Core { * You might want to use this function if you are developing a dynamically-loaded library which should clean up all * resources after itself when the library is unloaded. */ -void OPENVINO_RUNTIME_API shutdown(); +OPENVINO_RUNTIME_API void shutdown(); } // namespace ov diff --git a/src/inference/include/openvino/runtime/remote_tensor.hpp b/src/inference/include/openvino/runtime/remote_tensor.hpp index 321a2bcab51fcd..938398a07beecb 100644 --- a/src/inference/include/openvino/runtime/remote_tensor.hpp +++ b/src/inference/include/openvino/runtime/remote_tensor.hpp @@ -44,6 +44,8 @@ class OPENVINO_RUNTIME_API RemoteTensor : public Tensor { template T* data() = delete; + void copy_to(ov::Tensor& dst) const = delete; + /** * @brief Returns a map of device-specific parameters required for low-level * operations with underlying object. 
diff --git a/src/inference/src/cnn_network_ngraph_impl.cpp b/src/inference/src/cnn_network_ngraph_impl.cpp index d6a4ae9bf544cd..bc917d8ea1f598 100644 --- a/src/inference/src/cnn_network_ngraph_impl.cpp +++ b/src/inference/src/cnn_network_ngraph_impl.cpp @@ -152,7 +152,7 @@ CNNNetworkNGraphImpl::CNNNetworkNGraphImpl(const std::shared_ptr& nGra REGISTER_PASS(m, FixRtInfo) REGISTER_PASS(m, EliminateScatterUpdate) REGISTER_PASS(m, RemoveConcatZeroDimInput) - REGISTER_PASS(m, RemoveMultiSubGraphOpDanglingParams) + REGISTER_PASS(m, RemoveMultiSubGraphOpDanglingParamsResults) REGISTER_PASS(m, FoldSubgraphEmptyInputs) m.run_passes(_ngraph_function); } diff --git a/src/inference/src/dev/converter_utils.cpp b/src/inference/src/dev/converter_utils.cpp index 88bded8388127a..8e56463094a921 100644 --- a/src/inference/src/dev/converter_utils.cpp +++ b/src/inference/src/dev/converter_utils.cpp @@ -34,8 +34,10 @@ #include "openvino/runtime/profiling_info.hpp" #include "openvino/runtime/remote_context.hpp" #include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" #include "openvino/runtime/variable_state.hpp" #include "so_ptr.hpp" +#include "threading/ie_executor_manager.hpp" #include "transformations/utils/utils.hpp" namespace { @@ -221,7 +223,7 @@ class IInferencePluginWrapper : public InferenceEngine::IInferencePlugin { version.description = ver.description; SetVersion(version); _isNewAPI = plugin->is_new_api(); - _executorManager = plugin->get_executor_manager(); + _executorManager = InferenceEngine::create_old_manager(plugin->get_executor_manager()); } std::string GetName() const noexcept override { return m_plugin->get_device_name(); diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index f31b3df76ffda5..7c87a7c3d9e5aa 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -28,6 +28,7 @@ #include "openvino/pass/manager.hpp" #include 
"openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/remote_context.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" #include "openvino/util/common_util.hpp" #include "openvino/util/shared_object.hpp" #include "preprocessing/preprocessing.hpp" @@ -57,7 +58,7 @@ void stripDeviceName(std::string& device, const std::string& substr) { ov::CoreImpl::CoreImpl(bool _newAPI) : m_new_api(_newAPI) { add_mutex(""); // Register global mutex - executorManagerPtr = InferenceEngine::executorManager(); + m_executor_manager = ov::threading::executor_manager(); for (const auto& it : ov::get_available_opsets()) { opsetNames.insert(it.first); } @@ -632,7 +633,7 @@ void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& pr ov::Any ov::CoreImpl::get_property_for_core(const std::string& name) const { if (name == ov::force_tbb_terminate.name()) { - const auto flag = InferenceEngine::executorManager()->getTbbFlag(); + const auto flag = ov::threading::executor_manager()->get_property(name).as(); return decltype(ov::force_tbb_terminate)::value_type(flag); } else if (name == ov::cache_dir.name()) { return ov::Any(coreConfig.get_cache_dir()); @@ -993,7 +994,7 @@ void ov::CoreImpl::CoreConfig::set_and_update(ov::AnyMap& config) { it = config.find(ov::force_tbb_terminate.name()); if (it != config.end()) { auto flag = it->second.as() == CONFIG_VALUE(YES) ? 
true : false; - InferenceEngine::executorManager()->setTbbFlag(flag); + ov::threading::executor_manager()->set_property({{it->first, flag}}); config.erase(it); } diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 0d74145f2aea6e..7e223202f038fd 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -21,7 +21,7 @@ #include "openvino/core/version.hpp" #include "openvino/runtime/common.hpp" #include "openvino/runtime/icompiled_model.hpp" -#include "threading/ie_executor_manager.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" #ifdef OPENVINO_STATIC_LIBRARY # include "ie_plugins.hpp" @@ -162,7 +162,7 @@ class CoreImpl : public InferenceEngine::ICore, public std::enable_shared_from_t } }; - InferenceEngine::ExecutorManager::Ptr executorManagerPtr; + std::shared_ptr m_executor_manager; mutable std::unordered_set opsetNames; // TODO: make extensions to be optional with conditional compilation mutable std::vector extensions; diff --git a/src/inference/src/dev/iasync_infer_request.cpp b/src/inference/src/dev/iasync_infer_request.cpp index 385baba838c9ed..45633fa76166e9 100644 --- a/src/inference/src/dev/iasync_infer_request.cpp +++ b/src/inference/src/dev/iasync_infer_request.cpp @@ -14,13 +14,13 @@ namespace { -struct ImmediateStreamsExecutor : public InferenceEngine::ITaskExecutor { - explicit ImmediateStreamsExecutor(const InferenceEngine::IStreamsExecutor::Ptr& streamsExecutor) +struct ImmediateStreamsExecutor : public ov::threading::ITaskExecutor { + explicit ImmediateStreamsExecutor(const std::shared_ptr& streamsExecutor) : _streamsExecutor{streamsExecutor} {} void run(InferenceEngine::Task task) override { - _streamsExecutor->Execute(std::move(task)); + _streamsExecutor->execute(std::move(task)); } - InferenceEngine::IStreamsExecutor::Ptr _streamsExecutor; + std::shared_ptr _streamsExecutor; }; } // namespace @@ -30,8 +30,8 @@ ov::IAsyncInferRequest::~IAsyncInferRequest() 
{ } ov::IAsyncInferRequest::IAsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor) + const std::shared_ptr& task_executor, + const std::shared_ptr& callback_executor) : m_sync_request(request), m_request_executor(task_executor), m_callback_executor(callback_executor) { @@ -117,7 +117,7 @@ void ov::IAsyncInferRequest::start_async_thread_unsafe() { void ov::IAsyncInferRequest::run_first_stage(const Pipeline::iterator itBeginStage, const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor) { + const std::shared_ptr callbackExecutor) { auto& firstStageExecutor = std::get(*itBeginStage); OPENVINO_ASSERT(nullptr != firstStageExecutor); firstStageExecutor->run(make_next_stage_task(itBeginStage, itEndStage, std::move(callbackExecutor))); @@ -126,9 +126,9 @@ void ov::IAsyncInferRequest::run_first_stage(const Pipeline::iterator itBeginSta InferenceEngine::Task ov::IAsyncInferRequest::make_next_stage_task( const Pipeline::iterator itStage, const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor) { + const std::shared_ptr callbackExecutor) { return std::bind( - [this, itStage, itEndStage](InferenceEngine::ITaskExecutor::Ptr& callbackExecutor) mutable { + [this, itStage, itEndStage](std::shared_ptr& callbackExecutor) mutable { std::exception_ptr currentException = nullptr; auto& thisStage = *itStage; auto itNextStage = itStage + 1; diff --git a/src/inference/src/dev/icompiled_model.cpp b/src/inference/src/dev/icompiled_model.cpp index c3e0796ab754bd..82b94d511d2a83 100644 --- a/src/inference/src/dev/icompiled_model.cpp +++ b/src/inference/src/dev/icompiled_model.cpp @@ -11,8 +11,8 @@ ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const 
InferenceEngine::ITaskExecutor::Ptr& callback_executor) + const std::shared_ptr& task_executor, + const std::shared_ptr& callback_executor) : m_plugin(plugin), m_task_executor(task_executor), m_callback_executor(callback_executor) { @@ -86,10 +86,10 @@ std::shared_ptr ov::ICompiledModel::create_infer_request const std::shared_ptr& ov::ICompiledModel::get_plugin() const { return m_plugin; } -const InferenceEngine::ITaskExecutor::Ptr ov::ICompiledModel::get_task_executor() const { +const std::shared_ptr ov::ICompiledModel::get_task_executor() const { return m_task_executor; } -const InferenceEngine::ITaskExecutor::Ptr ov::ICompiledModel::get_callback_executor() const { +const std::shared_ptr ov::ICompiledModel::get_callback_executor() const { return m_callback_executor; } diff --git a/src/inference/src/dev/icompiled_model_wrapper.cpp b/src/inference/src/dev/icompiled_model_wrapper.cpp index b0144b2a5fc416..189ab993217f9a 100644 --- a/src/inference/src/dev/icompiled_model_wrapper.cpp +++ b/src/inference/src/dev/icompiled_model_wrapper.cpp @@ -4,9 +4,8 @@ #include "icompiled_model_wrapper.hpp" -#include - #include "dev/converter_utils.hpp" +#include "ie_plugin_config.hpp" InferenceEngine::ICompiledModelWrapper::ICompiledModelWrapper( const std::shared_ptr& model) diff --git a/src/inference/src/dev/iplugin.cpp b/src/inference/src/dev/iplugin.cpp index 73476d21386942..5bed9efb18f92f 100644 --- a/src/inference/src/dev/iplugin.cpp +++ b/src/inference/src/dev/iplugin.cpp @@ -4,7 +4,7 @@ #include "openvino/runtime/iplugin.hpp" -ov::IPlugin::IPlugin() : m_executor_manager(InferenceEngine::executorManager()), m_is_new_api(true) {} +ov::IPlugin::IPlugin() : m_executor_manager(ov::threading::executor_manager()), m_is_new_api(true) {} void ov::IPlugin::set_version(const ov::Version& version) { m_version = version; @@ -42,7 +42,7 @@ bool ov::IPlugin::is_new_api() const { return m_is_new_api; } -const std::shared_ptr& ov::IPlugin::get_executor_manager() const { +const 
std::shared_ptr& ov::IPlugin::get_executor_manager() const {
     return m_executor_manager;
 }
 
diff --git a/src/inference/src/dev/iplugin_wrapper.cpp b/src/inference/src/dev/iplugin_wrapper.cpp
index 36207adf48539f..972d4d62bb4a46 100644
--- a/src/inference/src/dev/iplugin_wrapper.cpp
+++ b/src/inference/src/dev/iplugin_wrapper.cpp
@@ -9,6 +9,7 @@
 #include "any_copy.hpp"
 #include "dev/converter_utils.hpp"
 #include "ie_icore.hpp"
+#include "threading/ie_executor_manager.hpp"
 
 namespace InferenceEngine {
 
@@ -20,7 +21,7 @@ IPluginWrapper::IPluginWrapper(const std::shared_ptrGetName();
     m_is_new_api = m_old_plugin->IsNewAPI();
     m_core = m_old_plugin->GetCore();
-    m_executor_manager = m_old_plugin->executorManager();
+    m_executor_manager = m_old_plugin->executorManager()->get_ov_manager();
 }
 
 const std::shared_ptr& IPluginWrapper::update_exec_network(
diff --git a/src/inference/src/dev/isync_infer_request.cpp b/src/inference/src/dev/isync_infer_request.cpp
index c8aa79a84b971b..26ba98f1180c00 100644
--- a/src/inference/src/dev/isync_infer_request.cpp
+++ b/src/inference/src/dev/isync_infer_request.cpp
@@ -7,6 +7,7 @@
 #include
 
 #include "cpp_interfaces/plugin_itt.hpp"
+#include "ie_blob.h"
 #include "openvino/core/except.hpp"
 #include "openvino/core/layout.hpp"
 #include "openvino/core/parallel.hpp"
diff --git a/src/inference/src/dev/threading/cpu_streams_executor.cpp b/src/inference/src/dev/threading/cpu_streams_executor.cpp
new file mode 100644
index 00000000000000..ceb72eec87db6f
--- /dev/null
+++ b/src/inference/src/dev/threading/cpu_streams_executor.cpp
@@ -0,0 +1,413 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/runtime/threading/cpu_streams_executor.hpp"
+
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "openvino/itt.hpp"
+#include "openvino/runtime/system_conf.hpp"
+#include "openvino/runtime/threading/executor_manager.hpp"
+#include "threading/ie_parallel_custom_arena.hpp"
+#include "threading/ie_thread_affinity.hpp"
+#include "threading/ie_thread_local.hpp"
+
+namespace ov {
+namespace threading {
+// Private implementation: owns the worker threads, the shared task queue and the per-thread Stream objects.
+struct CPUStreamsExecutor::Impl {
+    // Execution context obtained through _streams.local(): stream id, NUMA node and (TBB builds) arena/observer.
+    struct Stream {
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+        // Pins a thread when it enters the arena and re-applies the saved process mask when it leaves.
+        struct Observer : public custom::task_scheduler_observer {
+            InferenceEngine::CpuSet _mask;
+            int _ncpus = 0;
+            int _threadBindingStep = 0;
+            int _offset = 0;
+            int _cpuIdxOffset = 0;
+            Observer(custom::task_arena& arena,
+                     InferenceEngine::CpuSet mask,
+                     int ncpus,
+                     const int streamId,
+                     const int threadsPerStream,
+                     const int threadBindingStep,
+                     const int threadBindingOffset,
+                     const int cpuIdxOffset = 0)
+                : custom::task_scheduler_observer(arena),
+                  _mask{std::move(mask)},
+                  _ncpus(ncpus),
+                  _threadBindingStep(threadBindingStep),
+                  _offset{streamId * threadsPerStream + threadBindingOffset},
+                  _cpuIdxOffset(cpuIdxOffset) {}
+            void on_scheduler_entry(bool) override {
+                InferenceEngine::PinThreadToVacantCore(_offset + tbb::this_task_arena::current_thread_index(),
+                                                       _threadBindingStep,
+                                                       _ncpus,
+                                                       _mask,
+                                                       _cpuIdxOffset);
+            }
+            void on_scheduler_exit(bool) override {
+                PinCurrentThreadByMask(_ncpus, _mask);
+            }
+            ~Observer() override = default;
+        };
+#endif
+        explicit Stream(Impl* impl) : _impl(impl) {
+            {
+                std::lock_guard<std::mutex> lock{_impl->_streamIdMutex};
+                // Prefer an id released by a destroyed Stream; allocate a fresh one only when none is queued.
+                if (_impl->_streamIdQueue.empty()) {
+                    _streamId = _impl->_streamId++;
+                } else {
+                    _streamId = _impl->_streamIdQueue.front();
+                    _impl->_streamIdQueue.pop();
+                }
+            }
+            _numaNodeId = _impl->_config._streams
+                              ? _impl->_usedNumaNodes.at((_streamId % _impl->_config._streams) /
+                                                         ((_impl->_config._streams + _impl->_usedNumaNodes.size() - 1) /
+                                                          _impl->_usedNumaNodes.size()))
+                              : _impl->_usedNumaNodes.at(_streamId % _impl->_usedNumaNodes.size());
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+            const auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic
+                                                                             : _impl->_config._threadsPerStream;
+            if (ThreadBindingType::HYBRID_AWARE == _impl->_config._threadBindingType) {
+                if (Config::PreferredCoreType::ROUND_ROBIN != _impl->_config._threadPreferredCoreType) {
+                    if (Config::PreferredCoreType::ANY == _impl->_config._threadPreferredCoreType) {
+                        _taskArena.reset(new custom::task_arena{concurrency});
+                    } else {
+                        const auto selected_core_type =
+                            Config::PreferredCoreType::BIG == _impl->_config._threadPreferredCoreType
+                                ? custom::info::core_types().back()    // running on Big cores only
+                                : custom::info::core_types().front();  // running on Little cores only
+                        _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{}
+                                                                    .set_core_type(selected_core_type)
+                                                                    .set_max_concurrency(concurrency)});
+                    }
+                } else {
+                    // assigning the stream to the core type in the round-robin fashion
+                    // wrapping around total_streams (i.e. how many streams all different core types can handle
+                    // together). Binding priority: Big core, Logical big core, Small core
+                    const auto total_streams = _impl->total_streams_on_core_types.back().second;
+                    const auto big_core_streams = _impl->total_streams_on_core_types.front().second;
+                    const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1;
+                    const auto phy_core_streams =
+                        _impl->_config._big_core_streams == 0
+                            ? 0
+                            : _impl->num_big_core_phys / _impl->_config._threads_per_stream_big;
+                    const auto streamId_wrapped = _streamId % total_streams;
+                    const auto& selected_core_type =
+                        std::find_if(
+                            _impl->total_streams_on_core_types.cbegin(),
+                            _impl->total_streams_on_core_types.cend(),
+                            [streamId_wrapped](const decltype(_impl->total_streams_on_core_types)::value_type& p) {
+                                return p.second > streamId_wrapped;
+                            })
+                            ->first;
+                    const auto small_core = hybrid_core && selected_core_type == 0;
+                    const auto logic_core = !small_core && streamId_wrapped >= phy_core_streams;
+                    const auto small_core_skip = small_core && _impl->_config._threads_per_stream_small == 3 &&
+                                                 _impl->_config._small_core_streams > 1;
+                    const auto max_concurrency =
+                        small_core ? _impl->_config._threads_per_stream_small : _impl->_config._threads_per_stream_big;
+                    // Special handling of _threads_per_stream_small == 3
+                    const auto small_core_id = small_core_skip ? 0 : streamId_wrapped - big_core_streams;
+                    const auto stream_id =
+                        hybrid_core
+                            ? (small_core ? small_core_id
+                                          : (logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped))
+                            : streamId_wrapped;
+                    const auto thread_binding_step = hybrid_core ? (small_core ? _impl->_config._threadBindingStep : 2)
+                                                                 : _impl->_config._threadBindingStep;
+                    // Special handling of _threads_per_stream_small == 3, need to skip 4 (Four cores share one L2 cache
+                    // on the small core), stream_id = 0, cpu_idx_offset cumulative plus 4
+                    const auto small_core_offset =
+                        small_core_skip ? _impl->_config._small_core_offset + (streamId_wrapped - big_core_streams) * 4
+                                        : _impl->_config._small_core_offset;
+                    const auto cpu_idx_offset =
+                        hybrid_core
+                            // Prevent conflicts with system scheduling, so default cpu id on big core starts from 1
+                            ? (small_core ? small_core_offset : (logic_core ? 0 : 1))
+                            : 0;
+
+                    _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{}
+                                                                .set_core_type(selected_core_type)
+                                                                .set_max_concurrency(max_concurrency)});
+                    InferenceEngine::CpuSet processMask;
+                    int ncpus = 0;
+                    std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask();
+                    if (nullptr != processMask) {
+                        _observer.reset(new Observer{*_taskArena,
+                                                     std::move(processMask),
+                                                     ncpus,
+                                                     stream_id,
+                                                     max_concurrency,
+                                                     thread_binding_step,
+                                                     _impl->_config._threadBindingOffset,
+                                                     cpu_idx_offset});
+                        _observer->observe(true);
+                    }
+                }
+            } else if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) {
+                _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{_numaNodeId, concurrency}});
+            } else if ((0 != _impl->_config._threadsPerStream) ||
+                       (ThreadBindingType::CORES == _impl->_config._threadBindingType)) {
+                _taskArena.reset(new custom::task_arena{concurrency});
+                if (ThreadBindingType::CORES == _impl->_config._threadBindingType) {
+                    InferenceEngine::CpuSet processMask;
+                    int ncpus = 0;
+                    std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask();
+                    if (nullptr != processMask) {
+                        _observer.reset(new Observer{*_taskArena,
+                                                     std::move(processMask),
+                                                     ncpus,
+                                                     _streamId,
+                                                     _impl->_config._threadsPerStream,
+                                                     _impl->_config._threadBindingStep,
+                                                     _impl->_config._threadBindingOffset});
+                        _observer->observe(true);
+                    }
+                }
+            }
+#elif OV_THREAD == OV_THREAD_OMP
+            omp_set_num_threads(_impl->_config._threadsPerStream);
+            if (!checkOpenMpEnvVars(false) && (ThreadBindingType::NONE != _impl->_config._threadBindingType)) {
+                InferenceEngine::CpuSet processMask;
+                int ncpus = 0;
+                std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask();
+                if (nullptr != processMask) {
+                    parallel_nt(_impl->_config._threadsPerStream, [&](int threadIndex, int threadsPerStream) {
+                        int thrIdx = _streamId * _impl->_config._threadsPerStream + threadIndex +
+                                     _impl->_config._threadBindingOffset;
+                        InferenceEngine::PinThreadToVacantCore(thrIdx,
+                                                               _impl->_config._threadBindingStep,
+                                                               ncpus,
+                                                               processMask);
+                    });
+                }
+            }
+#elif OV_THREAD == OV_THREAD_SEQ
+            if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) {
+                InferenceEngine::PinCurrentThreadToSocket(_numaNodeId);
+            } else if (ThreadBindingType::CORES == _impl->_config._threadBindingType) {
+                InferenceEngine::CpuSet processMask;
+                int ncpus = 0;
+                std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask();
+                if (nullptr != processMask) {
+                    InferenceEngine::PinThreadToVacantCore(_streamId + _impl->_config._threadBindingOffset,
+                                                           _impl->_config._threadBindingStep,
+                                                           ncpus,
+                                                           processMask);
+                }
+            }
+#endif
+        }
+        // Gives the stream id back for reuse and, in TBB builds, stops the pinning observer.
+        ~Stream() {
+            {
+                std::lock_guard<std::mutex> lock{_impl->_streamIdMutex};
+                _impl->_streamIdQueue.push(_streamId);
+            }
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+            if (nullptr != _observer) {
+                _observer->observe(false);
+            }
+#endif
+        }
+
+        Impl* _impl = nullptr;
+        int _streamId = 0;
+        int _numaNodeId = 0;
+        bool _execute = false;
+        std::queue<Task> _taskQueue;
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+        std::unique_ptr<custom::task_arena> _taskArena;
+        std::unique_ptr<Observer> _observer;
+#endif
+    };
+
+    // Selects the NUMA nodes to use, fills the hybrid core-type table (TBB) and starts one worker per stream.
+    explicit Impl(const Config& config)
+        : _config{config},
+          _streams([this] {
+              return std::make_shared<Impl::Stream>(this);
+          }) {
+        _exectorMgr = executor_manager();
+        auto numaNodes = get_available_numa_nodes();
+        if (_config._streams != 0) {
+            std::copy_n(std::begin(numaNodes),
+                        std::min(static_cast<std::size_t>(_config._streams), numaNodes.size()),
+                        std::back_inserter(_usedNumaNodes));
+        } else {
+            _usedNumaNodes = numaNodes;
+        }
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+        if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) {
+            const auto core_types = custom::info::core_types();
+            const auto num_core_phys = get_number_of_cpu_cores();
+            num_big_core_phys = get_number_of_cpu_cores(true);
+            const auto num_small_core_phys = num_core_phys - num_big_core_phys;
+            int sum = 0;
+            // reversed order, so BIG cores are first
+            for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) {
+                const auto& type = *iter;
+                // calculating the #streams per core type
+                const int num_streams_for_core_type =
+                    type == 0 ? std::max(1,
+                                         std::min(config._small_core_streams,
+                                                  config._threads_per_stream_small == 0
+                                                      ? 0
+                                                      : num_small_core_phys / config._threads_per_stream_small))
+                              : std::max(1,
+                                         std::min(config._big_core_streams,
+                                                  config._threads_per_stream_big == 0
+                                                      ? 0
+                                                      : num_big_core_phys / config._threads_per_stream_big * 2));
+                sum += num_streams_for_core_type;
+                // prefix sum, so the core type for a given stream id will be deduced just as a upper_bound
+                // (notice that the map keeps the elements in the descending order, so the big cores are populated
+                // first)
+                total_streams_on_core_types.push_back({type, sum});
+            }
+        }
+#endif
+        for (auto streamId = 0; streamId < _config._streams; ++streamId) {
+            _threads.emplace_back([this, streamId] {
+                openvino::itt::threadName(_config._name + "_" + std::to_string(streamId));
+                for (bool stopped = false; !stopped;) {
+                    Task task;
+                    {
+                        std::unique_lock<std::mutex> lock(_mutex);
+                        _queueCondVar.wait(lock, [&] {
+                            return !_taskQueue.empty() || (stopped = _isStopped);
+                        });
+                        if (!_taskQueue.empty()) {
+                            task = std::move(_taskQueue.front());
+                            _taskQueue.pop();
+                        }
+                    }
+                    if (task) {
+                        Execute(task, *(_streams.local()));
+                    }
+                }
+            });
+        }
+    }
+
+    // Appends the task to the shared queue and wakes one waiting worker.
+    void Enqueue(Task task) {
+        {
+            std::lock_guard<std::mutex> lock(_mutex);
+            _taskQueue.emplace(std::move(task));
+        }
+        _queueCondVar.notify_one();
+    }
+
+    // Runs the task inside the stream's task arena when one was created (TBB builds), otherwise calls it directly.
+    void Execute(const Task& task, Stream& stream) {
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+        auto& arena = stream._taskArena;
+        if (nullptr != arena) {
+            arena->execute(std::move(task));
+        } else {
+            task();
+        }
+#else
+        task();
+#endif
+    }
+
+    // Runs the task on the Stream returned by _streams.local(). A Defer() issued while that stream is already
+    // draining (the _execute flag is set) only queues the task; the outer call executes it in order.
+    void Defer(Task task) {
+        auto& stream = *(_streams.local());
+        stream._taskQueue.push(std::move(task));
+        if (!stream._execute) {
+            stream._execute = true;
+            try {
+                while (!stream._taskQueue.empty()) {
+                    // Dequeue before running so a throwing task is not left at the front and re-run by a later call.
+                    Task current = std::move(stream._taskQueue.front());
+                    stream._taskQueue.pop();
+                    Execute(current, stream);
+                }
+            } catch (...) {
+                // Exceptions from deferred tasks are swallowed here (pre-existing behaviour).
+            }
+            stream._execute = false;
+        }
+    }
+
+    Config _config;
+    std::mutex _streamIdMutex;
+    int _streamId = 0;
+    std::queue<int> _streamIdQueue;
+    std::vector<std::thread> _threads;
+    std::mutex _mutex;
+    std::condition_variable _queueCondVar;
+    std::queue<Task> _taskQueue;
+    bool _isStopped = false;
+    std::vector<int> _usedNumaNodes;
+    InferenceEngine::ThreadLocal<std::shared_ptr<Stream>> _streams;
+#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
+    // stream id mapping to the core type
+    // stored in the reversed order (so the big cores, with the highest core_type_id value, are populated first)
+    // every entry is the core type and #streams that this AND ALL EARLIER entries can handle (prefix sum)
+    // (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams)
+    using StreamIdToCoreTypes = std::vector<std::pair<custom::core_type_id, int>>;
+    StreamIdToCoreTypes total_streams_on_core_types;
+    int num_big_core_phys;
+#endif
+    std::shared_ptr<ExecutorManager> _exectorMgr;
+};
+
+int CPUStreamsExecutor::get_stream_id() {
+    auto stream = _impl->_streams.local();
+    return stream->_streamId;
+}
+
+int CPUStreamsExecutor::get_numa_node_id() {
+    auto stream = _impl->_streams.local();
+    return stream->_numaNodeId;
+}
+
+CPUStreamsExecutor::CPUStreamsExecutor(const ov::threading::IStreamsExecutor::Config& config)
+    : _impl{new Impl{config}} {}
+
+// Marks the executor as stopped, wakes every worker and joins them.
+CPUStreamsExecutor::~CPUStreamsExecutor() {
+    {
+        std::lock_guard<std::mutex> lock(_impl->_mutex);
+        _impl->_isStopped = true;
+    }
+    _impl->_queueCondVar.notify_all();
+    for (auto& thread : _impl->_threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+}
+
+// Runs the task on the calling thread through Impl::Defer().
+void CPUStreamsExecutor::execute(Task task) {
+    _impl->Defer(std::move(task));
+}
+
+// With zero configured streams there are no workers, so the task is deferred on the caller; otherwise it is queued.
+void CPUStreamsExecutor::run(Task task) {
+    if (0 == _impl->_config._streams) {
+        _impl->Defer(std::move(task));
+    } else {
+        _impl->Enqueue(std::move(task));
+    }
+}
+
+}  // namespace threading
+}  // namespace ov
diff --git a/src/inference/src/dev/threading/executor_manager.cpp b/src/inference/src/dev/threading/executor_manager.cpp
new file mode 100644
index 00000000000000..250217b9104267
--- /dev/null
+++ b/src/inference/src/dev/threading/executor_manager.cpp
@@ -0,0 +1,220 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/runtime/threading/executor_manager.hpp"
+
+#include "openvino/core/parallel.hpp"
+#include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/threading/cpu_streams_executor.hpp"
+#include "threading/ie_cpu_streams_executor.hpp"
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#    if (TBB_INTERFACE_VERSION < 12000)
+#        include <tbb/task_scheduler_init.h>
+#    else
+#        include <oneapi/tbb/global_control.h>
+#    endif
+#endif
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+
+namespace ov {
+namespace threading {
+namespace {
+// Caches task executors by id and CPU streams executors by config; also owns the optional TBB scheduler handle.
+class ExecutorManagerImpl : public ExecutorManager {
+public:
+    ~ExecutorManagerImpl();
+    std::shared_ptr<ov::threading::ITaskExecutor> get_executor(const std::string& id) override;
+    std::shared_ptr<ov::threading::IStreamsExecutor> get_idle_cpu_streams_executor(
+        const ov::threading::IStreamsExecutor::Config& config) override;
+    size_t get_executors_number() const override;
+    size_t get_idle_cpu_streams_executors_number() const override;
+    void clear(const std::string& id = {}) override;
+    void set_property(const ov::AnyMap& properties) override;
+    ov::Any get_property(const std::string& name) const override;
+
+private:
+    void reset_tbb();
+
+    std::unordered_map<std::string, std::shared_ptr<ov::threading::ITaskExecutor>> executors;
+    std::vector<std::pair<ov::threading::IStreamsExecutor::Config, std::shared_ptr<ov::threading::IStreamsExecutor>>>
+        cpuStreamsExecutors;
+    mutable std::mutex streamExecutorMutex;
+    mutable std::mutex taskExecutorMutex;
+    bool tbbTerminateFlag = false;
+    mutable std::mutex global_mutex;
+    // Set by get_executor() and get_idle_cpu_streams_executor() under different mutexes, so it must be atomic.
+    std::atomic<bool> tbbThreadsCreated{false};
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+#    if (TBB_INTERFACE_VERSION < 12000)
+    std::shared_ptr<tbb::task_scheduler_init> tbbTaskScheduler = nullptr;
+#    else
+    std::shared_ptr<oneapi::tbb::task_scheduler_handle> tbbTaskScheduler = nullptr;
+#    endif
+#endif
+};
+
+}  // namespace
+
+ExecutorManagerImpl::~ExecutorManagerImpl() {
+    reset_tbb();
+}
+
+// Handles ov::force_tbb_terminate: creates the TBB scheduler handle when enabled and releases it when disabled.
+void ExecutorManagerImpl::set_property(const ov::AnyMap& properties) {
+    std::lock_guard<std::mutex> guard(global_mutex);
+    for (const auto& it : properties) {
+        if (it.first == ov::force_tbb_terminate.name()) {
+            tbbTerminateFlag = it.second.as<bool>();
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+            if (tbbTerminateFlag) {
+                if (!tbbTaskScheduler) {
+#    if (TBB_INTERFACE_VERSION < 12000)
+                    tbbTaskScheduler = std::make_shared<tbb::task_scheduler_init>();
+#    elif (TBB_INTERFACE_VERSION < 12060)
+                    tbbTaskScheduler =
+                        std::make_shared<oneapi::tbb::task_scheduler_handle>(oneapi::tbb::task_scheduler_handle::get());
+#    else
+                    tbbTaskScheduler = std::make_shared<oneapi::tbb::task_scheduler_handle>(tbb::attach{});
+#    endif
+                }
+            } else {
+                tbbTaskScheduler = nullptr;
+            }
+#endif
+        }
+    }
+}
+ov::Any ExecutorManagerImpl::get_property(const std::string& name) const {
+    std::lock_guard<std::mutex> guard(global_mutex);
+    if (name == ov::force_tbb_terminate.name()) {
+        return tbbTerminateFlag;
+    }
+    OPENVINO_UNREACHABLE("Property ", name, " is not supported.");
+}
+
+// When termination was requested: finalizes TBB if executors were ever created, then clears the related state.
+void ExecutorManagerImpl::reset_tbb() {
+    std::lock_guard<std::mutex> guard(global_mutex);
+    if (tbbTerminateFlag) {
+#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
+        if (tbbTaskScheduler && tbbThreadsCreated) {
+#    if (TBB_INTERFACE_VERSION < 12000)
+            tbbTaskScheduler->terminate();
+#    else
+            tbb::finalize(*tbbTaskScheduler, std::nothrow);
+#    endif
+        }
+        tbbThreadsCreated = false;
+        tbbTaskScheduler = nullptr;
+#endif
+        tbbTerminateFlag = false;
+    }
+}
+
+// Returns the executor registered under `id`, creating a CPUStreamsExecutor named `id` on first use.
+std::shared_ptr<ov::threading::ITaskExecutor> ExecutorManagerImpl::get_executor(const std::string& id) {
+    std::lock_guard<std::mutex> guard(taskExecutorMutex);
+    auto foundEntry = executors.find(id);
+    if (foundEntry == executors.end()) {
+        auto newExec = std::make_shared<CPUStreamsExecutor>(ov::threading::IStreamsExecutor::Config{id});
+        tbbThreadsCreated = true;
+        executors[id] = newExec;
+        return newExec;
+    }
+    return foundEntry->second;
+}
+
+// Reuses a cached executor that only the cache references and whose config matches; otherwise creates a new one.
+std::shared_ptr<ov::threading::IStreamsExecutor> ExecutorManagerImpl::get_idle_cpu_streams_executor(
+    const ov::threading::IStreamsExecutor::Config& config) {
+    std::lock_guard<std::mutex> guard(streamExecutorMutex);
+    for (const auto& it : cpuStreamsExecutors) {
+        const auto& executor = it.second;
+        if (executor.use_count() != 1)
+            continue;
+
+        const auto& executorConfig = it.first;
+        if (executorConfig._name == config._name && executorConfig._streams == config._streams &&
+            executorConfig._threadsPerStream == config._threadsPerStream &&
+            executorConfig._threadBindingType == config._threadBindingType &&
+            executorConfig._threadBindingStep == config._threadBindingStep &&
+            executorConfig._threadBindingOffset == config._threadBindingOffset)
+            if (executorConfig._threadBindingType != ov::threading::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE ||
+                executorConfig._threadPreferredCoreType == config._threadPreferredCoreType)
+                return executor;
+    }
+    auto newExec = std::make_shared<CPUStreamsExecutor>(config);
+    tbbThreadsCreated = true;
+    cpuStreamsExecutors.emplace_back(std::make_pair(config, newExec));
+    return newExec;
+}
+
+size_t ExecutorManagerImpl::get_executors_number() const {
+    std::lock_guard<std::mutex> guard(taskExecutorMutex);
+    return executors.size();
+}
+
+// Note: returns the number of all cached CPU streams executors, not only the currently idle ones.
+size_t ExecutorManagerImpl::get_idle_cpu_streams_executors_number() const {
+    std::lock_guard<std::mutex> guard(streamExecutorMutex);
+    return cpuStreamsExecutors.size();
+}
+
+// Drops every cached executor, or only those whose id / config name equals `id` when it is non-empty.
+void ExecutorManagerImpl::clear(const std::string& id) {
+    std::lock_guard<std::mutex> stream_guard(streamExecutorMutex);
+    std::lock_guard<std::mutex> task_guard(taskExecutorMutex);
+    if (id.empty()) {
+        executors.clear();
+        cpuStreamsExecutors.clear();
+    } else {
+        executors.erase(id);
+        cpuStreamsExecutors.erase(
+            std::remove_if(cpuStreamsExecutors.begin(),
+                           cpuStreamsExecutors.end(),
+                           [&](const std::pair<ov::threading::IStreamsExecutor::Config,
+                                               std::shared_ptr<ov::threading::IStreamsExecutor>>& it) {
+                               return it.first._name == id;
+                           }),
+            cpuStreamsExecutors.end());
+    }
+}
+
+namespace {
+
+// Keeps only a weak reference: callers share one manager while it is alive; a new one is made after it expires.
+class ExecutorManagerHolder {
+    std::mutex _mutex;
+    std::weak_ptr<ExecutorManager> _manager;
+
+public:
+    ExecutorManagerHolder(const ExecutorManagerHolder&) = delete;
+    ExecutorManagerHolder& operator=(const ExecutorManagerHolder&) = delete;
+
+    ExecutorManagerHolder() = default;
+
+    std::shared_ptr<ov::threading::ExecutorManager> get() {
+        std::lock_guard<std::mutex> lock(_mutex);
+        auto manager = _manager.lock();
+        if (!manager) {
+            _manager = manager = std::make_shared<ExecutorManagerImpl>();
+        }
+        return manager;
+    }
+};
+
+}  // namespace
+
+std::shared_ptr<ExecutorManager> executor_manager() {
+    static ExecutorManagerHolder executorManagerHolder;
+    return executorManagerHolder.get();
+}
+
+}  // namespace threading
+}  // namespace ov
diff --git a/src/inference/src/dev/threading/istreams_executor.cpp b/src/inference/src/dev/threading/istreams_executor.cpp
new file mode 100644
index 00000000000000..d96163a2739675
--- /dev/null
+++ b/src/inference/src/dev/threading/istreams_executor.cpp
@@ -0,0 +1,499 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/runtime/threading/istreams_executor.hpp"
+
+#include <algorithm>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
+#include "ie_plugin_config.hpp"
+#include "openvino/core/parallel.hpp"
+#include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/system_conf.hpp"
+#include "openvino/util/log.hpp"
+#include "threading/ie_parallel_custom_arena.hpp"
+
+namespace ov {
+namespace threading {
+
+IStreamsExecutor::~IStreamsExecutor() {}
+
+void IStreamsExecutor::Config::set_property(const std::string& key, const ov::Any& value) {
+    set_property({{key, value}});
+}
+
+// Applies each entry to the config; accepts legacy CONFIG_KEY names and ov:: properties, throws on bad input.
+void IStreamsExecutor::Config::set_property(const ov::AnyMap& property) {
+    for (const auto& it : property) {
+        const auto& key = it.first;
+        const auto value = it.second;
+        if (key == CONFIG_KEY(CPU_BIND_THREAD)) {
+            if (value.as<std::string>() == CONFIG_VALUE(YES) || value.as<std::string>() == CONFIG_VALUE(NUMA)) {
+#if (defined(__APPLE__) || defined(_WIN32))
+                _threadBindingType = IStreamsExecutor::ThreadBindingType::NUMA;
+#else
+                _threadBindingType = (value.as<std::string>() == CONFIG_VALUE(YES))
+                                         ? IStreamsExecutor::ThreadBindingType::CORES
+                                         : IStreamsExecutor::ThreadBindingType::NUMA;
+#endif
+            } else if (value.as<std::string>() == CONFIG_VALUE(HYBRID_AWARE)) {
+                _threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE;
+            } else if (value.as<std::string>() == CONFIG_VALUE(NO)) {
+                _threadBindingType = IStreamsExecutor::ThreadBindingType::NONE;
+            } else {
+                IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_BIND_THREAD)
+                           << ". Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes) / "
+                              "HYBRID_AWARE (let the runtime recognize and use the hybrid cores)";
+            }
+        } else if (key == ov::affinity) {
+            ov::Affinity affinity;
+            std::stringstream{value.as<std::string>()} >> affinity;
+            switch (affinity) {
+            case ov::Affinity::NONE:
+                _threadBindingType = ThreadBindingType::NONE;
+                break;
+            case ov::Affinity::CORE: {
+#if (defined(__APPLE__) || defined(_WIN32))
+                _threadBindingType = ThreadBindingType::NUMA;
+#else
+                _threadBindingType = ThreadBindingType::CORES;
+#endif
+            } break;
+            case ov::Affinity::NUMA:
+                _threadBindingType = ThreadBindingType::NUMA;
+                break;
+            case ov::Affinity::HYBRID_AWARE:
+                _threadBindingType = ThreadBindingType::HYBRID_AWARE;
+                break;
+            default:
+                OPENVINO_UNREACHABLE("Unsupported affinity type");
+            }
+        } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) {
+            if (value.as<std::string>() == CONFIG_VALUE(CPU_THROUGHPUT_NUMA)) {
+                _streams = static_cast<int>(get_available_numa_nodes().size());
+            } else if (value.as<std::string>() == CONFIG_VALUE(CPU_THROUGHPUT_AUTO)) {
+                // bare minimum of streams (that evenly divides available number of cores)
+                _streams = get_default_num_streams();
+            } else {
+                int val_i;
+                try {
+                    val_i = value.as<int>();
+                } catch (const std::exception&) {
+                    IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS)
+                               << ". Expected only positive numbers (#streams) or "
+                               << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO";
+                }
+                if (val_i < 0) {
+                    IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS)
+                               << ". Expected only positive numbers (#streams)";
+                }
+                _streams = val_i;
+            }
+        } else if (key == ov::num_streams) {
+            auto streams = value.as<ov::streams::Num>();
+            if (streams == ov::streams::NUMA) {
+                _streams = static_cast<int>(get_available_numa_nodes().size());
+            } else if (streams == ov::streams::AUTO) {
+                // bare minimum of streams (that evenly divides available number of cores)
+                _streams = get_default_num_streams();
+            } else if (streams.num >= 0) {
+                _streams = streams.num;
+            } else {
+                OPENVINO_UNREACHABLE("Wrong value for property key ",
+                                     ov::num_streams.name(),
+                                     ". Expected non negative numbers (#streams) or ",
+                                     "ov::streams::NUMA|ov::streams::AUTO, Got: ",
+                                     streams);
+            }
+        } else if (key == CONFIG_KEY(CPU_THREADS_NUM) || key == ov::inference_num_threads) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM)
+                           << ". Expected only positive numbers (#threads)";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM)
+                           << ". Expected only positive numbers (#threads)";
+            }
+            _threads = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)
+                           << ". Expected only non negative numbers (#threads)";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)
+                           << ". Expected only non negative numbers (#threads)";
+            }
+            _threadsPerStream = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)
+                           << ". Expected only non negative numbers (#streams)";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)
+                           << ". Expected only non negative numbers (#streams)";
+            }
+            _big_core_streams = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)
+                           << ". Expected only non negative numbers (#streams)";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)
+                           << ". Expected only non negative numbers (#streams)";
+            }
+            _small_core_streams = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)
+                           << ". Expected only non negative numbers (#threads)";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)
+                           << ". Expected only non negative numbers (#threads)";
+            }
+            _threads_per_stream_big = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)
+                           << ". Expected only non negative numbers (#threads)";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)
+                           << ". Expected only non negative numbers (#threads)";
+            }
+            _threads_per_stream_small = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) {
+            int val_i;
+            try {
+                val_i = value.as<int>();
+            } catch (const std::exception&) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)
+                           << ". Expected only non negative numbers";
+            }
+            if (val_i < 0) {
+                IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)
+                           << ". Expected only non negative numbers";
+            }
+            _small_core_offset = val_i;
+        } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) {
+            if (value.as<std::string>() == CONFIG_VALUE(YES)) {
+                _enable_hyper_thread = true;
+            } else if (value.as<std::string>() == CONFIG_VALUE(NO)) {
+                _enable_hyper_thread = false;
+            } else {
+                OPENVINO_UNREACHABLE("Unsupported enable hyper thread type");
+            }
+        } else {
+            IE_THROW() << "Wrong value for property key " << key;
+        }
+    }
+}
+
+// Returns the current value for a supported key, or the list of supported keys for ov::supported_properties.
+ov::Any IStreamsExecutor::Config::get_property(const std::string& key) const {
+    if (key == ov::supported_properties) {
+        std::vector<std::string> properties{
+            CONFIG_KEY(CPU_THROUGHPUT_STREAMS),
+            CONFIG_KEY(CPU_BIND_THREAD),
+            CONFIG_KEY(CPU_THREADS_NUM),
+            CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM),
+            CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS),
+            CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS),
+            CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG),
+            CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL),
+            CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET),
+            CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD),
+            ov::num_streams.name(),
+            ov::inference_num_threads.name(),
+            ov::affinity.name(),
+        };
+        return properties;
+    } else if (key == ov::affinity) {
+        switch (_threadBindingType) {
+        case IStreamsExecutor::ThreadBindingType::NONE:
+            return ov::Affinity::NONE;
+        case IStreamsExecutor::ThreadBindingType::CORES:
+            return ov::Affinity::CORE;
+        case IStreamsExecutor::ThreadBindingType::NUMA:
+            return ov::Affinity::NUMA;
+        case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE:
+            return ov::Affinity::HYBRID_AWARE;
+        }
+    } else if (key == CONFIG_KEY(CPU_BIND_THREAD)) {
+        switch (_threadBindingType) {
+        case IStreamsExecutor::ThreadBindingType::NONE:
+            return {CONFIG_VALUE(NO)};
+        case IStreamsExecutor::ThreadBindingType::CORES:
+            return {CONFIG_VALUE(YES)};
+        case IStreamsExecutor::ThreadBindingType::NUMA:
+            return {CONFIG_VALUE(NUMA)};
+        case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE:
+            return {CONFIG_VALUE(HYBRID_AWARE)};
+        }
+    } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) {
+        return {std::to_string(_streams)};
+    } else if (key == ov::num_streams) {
+        return decltype(ov::num_streams)::value_type{_streams};
+    } else if (key == CONFIG_KEY(CPU_THREADS_NUM)) {
+        return {std::to_string(_threads)};
+    } else if (key == ov::inference_num_threads) {
+        return decltype(ov::inference_num_threads)::value_type{_threads};
+    } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) {
+        return {std::to_string(_threadsPerStream)};
+    } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) {
+        return {std::to_string(_big_core_streams)};
+    } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) {
+        return {std::to_string(_small_core_streams)};
+    } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) {
+        return {std::to_string(_threads_per_stream_big)};
+    } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) {
+        return {std::to_string(_threads_per_stream_small)};
+    } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) {
+        return {std::to_string(_small_core_offset)};
+    } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) {
+        return {_enable_hyper_thread ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO)};
+    } else {
+        OPENVINO_UNREACHABLE("Wrong value for property key ", key);
+    }
+    return {};
+}
+
+// Returns max(k, cores / k) for the first k in {4, 5, 3} that evenly divides the core count, otherwise 1.
+int IStreamsExecutor::Config::get_default_num_streams(const bool enable_hyper_thread) {
+    const int sockets = static_cast<int>(get_available_numa_nodes().size());
+    // bare minimum of streams (that evenly divides available number of core)
+    const int num_cores = sockets == 1 ? (enable_hyper_thread ? parallel_get_max_threads() : get_number_of_cpu_cores())
+                                       : get_number_of_cpu_cores();
+    if (0 == num_cores % 4)
+        return std::max(4, num_cores / 4);
+    else if (0 == num_cores % 5)
+        return std::max(5, num_cores / 5);
+    else if (0 == num_cores % 3)
+        return std::max(3, num_cores / 3);
+    else  // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide
+        return 1;
+}
+
+int IStreamsExecutor::Config::get_hybrid_num_streams(std::map& config,
+                                                     const int stream_mode) {
+    const int num_cores = parallel_get_max_threads();
+    const int num_cores_phy = get_number_of_cpu_cores();
+    const int num_big_cores_phy = get_number_of_cpu_cores(true);
+    const int num_small_cores = num_cores_phy - num_big_cores_phy;
+    const int num_big_cores = num_cores > num_cores_phy ?
num_big_cores_phy * 2 : num_big_cores_phy; + int big_core_streams = 0; + int small_core_streams = 0; + int threads_per_stream_big = 0; + int threads_per_stream_small = 0; + + if (stream_mode == DEFAULT) { + // bare minimum of streams (that evenly divides available number of core) + if (0 == num_big_cores_phy % 4) { + threads_per_stream_big = 4; + } else if (0 == num_big_cores_phy % 5) { + threads_per_stream_big = 5; + } else if (0 == num_big_cores_phy % 3) { + threads_per_stream_big = 3; + } else { // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide + threads_per_stream_big = num_big_cores_phy; + } + + big_core_streams = num_big_cores / threads_per_stream_big; + threads_per_stream_small = threads_per_stream_big; + if (num_small_cores == 0) { + threads_per_stream_small = 0; + } else if (num_small_cores < threads_per_stream_small) { + small_core_streams = 1; + threads_per_stream_small = num_small_cores; + threads_per_stream_big = threads_per_stream_small; + // Balance the computation of physical core and logical core, the number of threads on the physical core and + // logical core should be equal + big_core_streams = num_big_cores_phy / threads_per_stream_big * 2; + } else { + small_core_streams = num_small_cores / threads_per_stream_small; + } + } else if (stream_mode == AGGRESSIVE) { + big_core_streams = num_big_cores; + small_core_streams = num_small_cores; + threads_per_stream_big = num_big_cores / big_core_streams; + threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / small_core_streams; + } else if (stream_mode == LESSAGGRESSIVE) { + big_core_streams = num_big_cores / 2; + small_core_streams = num_small_cores / 2; + threads_per_stream_big = num_big_cores / big_core_streams; + threads_per_stream_small = num_small_cores == 0 ? 
0 : num_small_cores / small_core_streams; + } else { + IE_THROW() << "Wrong stream mode to get num of streams: " << stream_mode; + } + config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(big_core_streams); + config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(small_core_streams); + config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(threads_per_stream_big); + config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(threads_per_stream_small); + // This is default setting for specific CPU which Pcore is in front and Ecore is in the back. + config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(num_small_cores == 0 ? 0 : num_big_cores); + return big_core_streams + small_core_streams; +} + +void IStreamsExecutor::Config::update_hybrid_custom_threads(Config& config) { + const auto num_cores = parallel_get_max_threads(); + const auto num_cores_phys = get_number_of_cpu_cores(); + const auto num_big_cores_phys = get_number_of_cpu_cores(true); + const auto num_big_cores = num_cores > num_cores_phys ? num_big_cores_phys * 2 : num_big_cores_phys; + const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys; + const auto threads = config._threads ? config._threads : num_cores; + const auto streams = config._streams > 0 ? 
config._streams : 1; + + config._small_core_offset = num_big_cores; + int threads_per_stream = std::max(1, threads / streams); + + if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) { + config._big_core_streams = streams; + config._threads_per_stream_big = threads_per_stream; + config._small_core_streams = 0; + config._threads_per_stream_small = 0; + } else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) { + config._big_core_streams = 0; + config._threads_per_stream_big = 0; + config._small_core_streams = streams; + config._threads_per_stream_small = threads_per_stream; + } else { + const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream); + const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream); + + threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small); + while (threads_per_stream > 1) { + const int base_big_streams = num_big_cores_phys / threads_per_stream; + const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0; + if (base_big_streams + base_small_streams >= streams) { + config._big_core_streams = base_big_streams; + config._small_core_streams = streams - base_big_streams; + break; + } else if (base_big_streams * 2 + base_small_streams >= streams) { + config._big_core_streams = streams - base_small_streams; + config._small_core_streams = base_small_streams; + break; + } else { + threads_per_stream = threads_per_stream > 1 ? 
threads_per_stream - 1 : 1; + } + } + + if (threads_per_stream == 1) { + const int stream_loops = streams / num_cores; + const int remain_streams = streams - stream_loops * num_cores; + if (num_big_cores_phys >= remain_streams) { + config._big_core_streams = remain_streams + num_big_cores * stream_loops; + config._small_core_streams = num_small_cores_phys * stream_loops; + } else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) { + config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops; + config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops; + } else { + config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops; + config._small_core_streams = num_small_cores_phys * (stream_loops + 1); + } + } + + config._threads_per_stream_big = threads_per_stream; + config._threads_per_stream_small = threads_per_stream; + } +} + +IStreamsExecutor::Config IStreamsExecutor::Config::make_default_multi_threaded(const IStreamsExecutor::Config& initial, + const bool fp_intesive) { + const auto envThreads = parallel_get_env_threads(); + const auto& numaNodes = get_available_numa_nodes(); + const int numaNodesNum = static_cast(numaNodes.size()); + auto streamExecutorConfig = initial; + const bool bLatencyCase = streamExecutorConfig._streams <= numaNodesNum; + + // by default, do not use the hyper-threading (to minimize threads synch overheads) + int num_cores_default = get_number_of_cpu_cores(); +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + // additional latency-case logic for hybrid processors: + if (ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) { + const auto core_types = custom::info::core_types(); + const auto num_little_cores = + custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.front())); + const auto num_big_cores_phys = get_number_of_cpu_cores(true); + const 
int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; + const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; + // by default the latency case uses (faster) Big cores only, depending on the compute ratio + const bool bLatencyCaseBigOnly = + num_big_cores_phys > (num_little_cores / (fp_intesive ? fp32_threshold : int8_threshold)); + // selecting the preferred core type + streamExecutorConfig._threadPreferredCoreType = + bLatencyCase ? (bLatencyCaseBigOnly ? IStreamsExecutor::Config::PreferredCoreType::BIG + : IStreamsExecutor::Config::PreferredCoreType::ANY) + : IStreamsExecutor::Config::PreferredCoreType::ROUND_ROBIN; + // additionally selecting the #cores to use in the "Big-only" case + if (bLatencyCaseBigOnly) { + const int hyper_threading_threshold = + 2; // min #cores, for which the hyper-threading becomes useful for the latency case + const auto num_big_cores = + custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back())); + num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? 
num_big_cores : num_big_cores_phys; + } + // if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here + if (!bLatencyCase && (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads)) { + update_hybrid_custom_threads(streamExecutorConfig); + } + OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "(" + << streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams + + streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams + << ") -- PCore: " << streamExecutorConfig._big_core_streams << "(" + << streamExecutorConfig._threads_per_stream_big + << ") ECore: " << streamExecutorConfig._small_core_streams << "(" + << streamExecutorConfig._threads_per_stream_small << ")"; + } +#endif + const auto hwCores = + !bLatencyCase && numaNodesNum == 1 + // throughput case on a single-NUMA node machine uses all available cores + ? (streamExecutorConfig._enable_hyper_thread ? parallel_get_max_threads() : num_cores_default) + // in the rest of cases: + // multi-node machine + // or + // latency case, single-node yet hybrid case that uses + // all core types + // or + // big-cores only, but the #cores is "enough" (pls see the logic above) + // it is usually beneficial not to use the hyper-threading (which is default) + : num_cores_default; + const auto threads = + streamExecutorConfig._threads ? streamExecutorConfig._threads : (envThreads ? envThreads : hwCores); + streamExecutorConfig._threadsPerStream = + streamExecutorConfig._streams ? std::max(1, threads / streamExecutorConfig._streams) : threads; + streamExecutorConfig._threads = + (!bLatencyCase && ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) + ? 
streamExecutorConfig._big_core_streams * streamExecutorConfig._threads_per_stream_big + + streamExecutorConfig._small_core_streams * streamExecutorConfig._threads_per_stream_small + : streamExecutorConfig._threadsPerStream * streamExecutorConfig._streams; + return streamExecutorConfig; +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/dev/threading/itask_executor.cpp b/src/inference/src/dev/threading/itask_executor.cpp new file mode 100644 index 00000000000000..7701df3d2b4113 --- /dev/null +++ b/src/inference/src/dev/threading/itask_executor.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/threading/itask_executor.hpp" + +#include +#include +#include +#include + +namespace ov { +namespace threading { + +void ITaskExecutor::run_and_wait(const std::vector& tasks) { + std::vector> packagedTasks; + std::vector> futures; + for (std::size_t i = 0; i < tasks.size(); ++i) { + packagedTasks.emplace_back([&tasks, i] { + tasks[i](); + }); + futures.emplace_back(packagedTasks.back().get_future()); + } + for (std::size_t i = 0; i < tasks.size(); ++i) { + run([&packagedTasks, i] { + packagedTasks[i](); + }); + } + // std::future::get will rethrow exception from task. + // We should wait all tasks before any exception is thrown. 
+ // So wait() and get() for each future moved to separate loops + for (auto&& future : futures) { + future.wait(); + } + for (auto&& future : futures) { + future.get(); + } +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp index d822b631e9c92e..ec56b4897d5fcb 100644 --- a/src/inference/src/os/lin/lin_system_conf.cpp +++ b/src/inference/src/os/lin/lin_system_conf.cpp @@ -18,7 +18,7 @@ #include "streams_executor.hpp" #include "threading/ie_parallel_custom_arena.hpp" -namespace InferenceEngine { +namespace ov { struct CPU { int _processors = 0; @@ -243,13 +243,13 @@ void parse_processor_info_linux(const int _processors, }; #if !((IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { +std::vector get_available_numa_nodes() { std::vector nodes((0 == cpu._sockets) ? 1 : cpu._sockets); std::iota(std::begin(nodes), std::end(nodes), 0); return nodes; } #endif -int getNumberOfCPUCores(bool bigCoresOnly) { +int get_number_of_cpu_cores(bool bigCoresOnly) { unsigned numberOfProcessors = cpu._processors; unsigned totalNumberOfCpuCores = cpu._cores; IE_ASSERT(totalNumberOfCpuCores != 0); @@ -280,4 +280,4 @@ int getNumberOfCPUCores(bool bigCoresOnly) { return phys_cores; } -} // namespace InferenceEngine +} // namespace ov diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp index e89666edf7ac54..e4d7df0166730a 100644 --- a/src/inference/src/os/win/win_system_conf.cpp +++ b/src/inference/src/os/win/win_system_conf.cpp @@ -3,7 +3,7 @@ // #ifndef NOMINMAX -# define NOMINMAX +# define NOMINMAX #endif #include @@ -11,11 +11,11 @@ #include #include -#include "ie_system_conf.h" +#include "openvino/runtime/system_conf.hpp" #include "streams_executor.hpp" #include "threading/ie_parallel_custom_arena.hpp" -namespace InferenceEngine { +namespace ov { struct CPU { int _processors = 0; @@ 
-168,7 +168,7 @@ void parse_processor_info_win(const char* base_ptr, } } -int getNumberOfCPUCores(bool bigCoresOnly) { +int get_number_of_cpu_cores(bool bigCoresOnly) { const int fallback_val = parallel_get_max_threads(); DWORD sz = 0; // querying the size of the resulting structure, passing the nullptr for the buffer @@ -178,7 +178,8 @@ int getNumberOfCPUCores(bool bigCoresOnly) { std::unique_ptr ptr(new uint8_t[sz]); if (!GetLogicalProcessorInformationEx(RelationProcessorCore, - reinterpret_cast(ptr.get()), &sz)) + reinterpret_cast(ptr.get()), + &sz)) return fallback_val; int phys_cores = 0; @@ -188,20 +189,21 @@ int getNumberOfCPUCores(bool bigCoresOnly) { phys_cores++; } while (offset < sz); - #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) +#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) auto core_types = custom::info::core_types(); if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ { - phys_cores = custom::info::default_concurrency(custom::task_arena::constraints{} - .set_core_type(core_types.back()) - .set_max_threads_per_core(1)); + phys_cores = custom::info::default_concurrency( + custom::task_arena::constraints{}.set_core_type(core_types.back()).set_max_threads_per_core(1)); } - #endif +#endif return phys_cores; } #if !(IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) // OMP/SEQ threading on the Windows doesn't support NUMA -std::vector getAvailableNUMANodes() { return {-1}; } +std::vector get_available_numa_nodes() { + return {-1}; +} #endif -} // namespace InferenceEngine +} // namespace ov diff --git a/src/inference/src/streams_executor.hpp b/src/inference/src/streams_executor.hpp index 769c4ec73cd034..4bea102dbceb63 100644 --- a/src/inference/src/streams_executor.hpp +++ b/src/inference/src/streams_executor.hpp @@ -11,7 +11,7 @@ #include #include -namespace InferenceEngine { +namespace ov { #ifdef __linux__ /** @@ -55,4 +55,4 @@ void parse_processor_info_win(const char* base_ptr, 
std::vector>& _cpu_mapping_table); #endif -} // namespace InferenceEngine \ No newline at end of file +} // namespace ov diff --git a/src/inference/src/ie_system_conf.cpp b/src/inference/src/system_conf.cpp similarity index 90% rename from src/inference/src/ie_system_conf.cpp rename to src/inference/src/system_conf.cpp index 761fdda4dd54e3..da212d4a62950c 100644 --- a/src/inference/src/ie_system_conf.cpp +++ b/src/inference/src/system_conf.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ie_system_conf.h" +#include "openvino/runtime/system_conf.hpp" #include #include @@ -15,7 +15,7 @@ #define XBYAK_UNDEF_JNL #include -namespace InferenceEngine { +namespace ov { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) @@ -102,7 +102,7 @@ bool with_cpu_x86_avx512_core_amx() { #endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 -bool checkOpenMpEnvVars(bool includeOMPNumThreads) { +bool check_open_mp_env_vars(bool include_omp_num_threads) { for (auto&& var : {"GOMP_CPU_AFFINITY", "GOMP_DEBUG" "GOMP_RTEMS_THREAD_POOLS", @@ -134,7 +134,7 @@ bool checkOpenMpEnvVars(bool includeOMPNumThreads) { "PHI_KMP_PLACE_THREADS" "PHI_OMP_NUM_THREADS"}) { if (getenv(var)) { - if (0 != strcmp(var, "OMP_NUM_THREADS") || includeOMPNumThreads) + if (0 != strcmp(var, "OMP_NUM_THREADS") || include_omp_num_threads) return true; } } @@ -144,19 +144,19 @@ bool checkOpenMpEnvVars(bool includeOMPNumThreads) { #if defined(__APPLE__) || defined(__EMSCRIPTEN__) // for Linux and Windows the getNumberOfCPUCores (that accounts only for physical cores) implementation is OS-specific // (see cpp files in corresponding folders), for __APPLE__ it is default : -int getNumberOfCPUCores(bool) { +int get_number_of_cpu_cores(bool) { return parallel_get_max_threads(); } # if !((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { +std::vector get_available_numa_nodes() { return {-1}; } # endif -int 
getNumberOfLogicalCPUCores(bool) { +int get_number_of_logical_cpu_cores(bool) { return parallel_get_max_threads(); } #else -int getNumberOfLogicalCPUCores(bool bigCoresOnly) { +int get_number_of_logical_cpu_cores(bool bigCoresOnly) { int logical_cores = parallel_get_max_threads(); # if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) auto core_types = custom::info::core_types(); @@ -170,18 +170,18 @@ int getNumberOfLogicalCPUCores(bool bigCoresOnly) { #endif #if ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { +std::vector get_available_numa_nodes() { return custom::info::numa_nodes(); } // this is impl only with the TBB -std::vector getAvailableCoresTypes() { +std::vector get_available_cores_types() { return custom::info::core_types(); } #else // as the core types support exists only with the TBB, the fallback is same for any other threading API -std::vector getAvailableCoresTypes() { +std::vector get_available_cores_types() { return {-1}; } #endif -} // namespace InferenceEngine +} // namespace ov diff --git a/src/inference/src/threading/ie_cpu_streams_executor.cpp b/src/inference/src/threading/ie_cpu_streams_executor.cpp index 2e786599a74bf2..37f690ec473c63 100644 --- a/src/inference/src/threading/ie_cpu_streams_executor.cpp +++ b/src/inference/src/threading/ie_cpu_streams_executor.cpp @@ -194,7 +194,7 @@ struct CPUStreamsExecutor::Impl { } #elif IE_THREAD == IE_THREAD_SEQ if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) { - PinCurrentThreadToSocket(_numaNodeId); + InferenceEngine::PinCurrentThreadToSocket(_numaNodeId); } else if (ThreadBindingType::CORES == _impl->_config._threadBindingType) { CpuSet processMask; int ncpus = 0; @@ -368,7 +368,7 @@ int CPUStreamsExecutor::GetNumaNodeId() { return stream->_numaNodeId; } -CPUStreamsExecutor::CPUStreamsExecutor(const IStreamsExecutor::Config& config) : _impl{new Impl{config}} {} +CPUStreamsExecutor::CPUStreamsExecutor(const 
Config& config) : _impl{new Impl{config}} {} CPUStreamsExecutor::~CPUStreamsExecutor() { { diff --git a/src/inference/src/threading/ie_executor_manager.cpp b/src/inference/src/threading/ie_executor_manager.cpp index 6e52117976d88d..82a1e126ae5dae 100644 --- a/src/inference/src/threading/ie_executor_manager.cpp +++ b/src/inference/src/threading/ie_executor_manager.cpp @@ -5,7 +5,12 @@ #include "threading/ie_executor_manager.hpp" #include "ie_parallel.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" #include "threading/ie_cpu_streams_executor.hpp" +#include "threading/ie_itask_executor.hpp" #if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO # if (TBB_INTERFACE_VERSION < 12000) # include @@ -23,7 +28,7 @@ namespace InferenceEngine { namespace { class ExecutorManagerImpl : public ExecutorManager { public: - ~ExecutorManagerImpl(); + ExecutorManagerImpl(const std::shared_ptr& manager); ITaskExecutor::Ptr getExecutor(const std::string& id) override; IStreamsExecutor::Ptr getIdleCPUStreamsExecutor(const IStreamsExecutor::Config& config) override; size_t getExecutorsNumber() const override; @@ -33,134 +38,87 @@ class ExecutorManagerImpl : public ExecutorManager { bool getTbbFlag() override; private: - void resetTbb(); - std::unordered_map executors; - std::vector> cpuStreamsExecutors; - mutable std::mutex streamExecutorMutex; - mutable std::mutex taskExecutorMutex; - bool tbbTerminateFlag = false; - mutable std::mutex tbbMutex; - bool tbbThreadsCreated = false; -#if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO -# if (TBB_INTERFACE_VERSION < 12000) - std::shared_ptr tbbTaskScheduler = nullptr; -# else - std::shared_ptr tbbTaskScheduler = nullptr; -# endif -#endif + std::shared_ptr m_manager; + std::shared_ptr get_ov_manager() const override { + return m_manager; + } 
+}; + +class TaskExecutorWrapper : public ITaskExecutor { + std::shared_ptr m_executor; + +public: + TaskExecutorWrapper(const std::shared_ptr& executor) : m_executor(executor) {} + void run(Task task) override { + m_executor->run(task); + } + + void runAndWait(const std::vector& tasks) override { + m_executor->run_and_wait(tasks); + } +}; + +class StreamsExecutorWrapper : public IStreamsExecutor { + std::shared_ptr m_executor; + +public: + StreamsExecutorWrapper(const std::shared_ptr& executor) : m_executor(executor) {} + void run(Task task) override { + m_executor->run(task); + } + + void runAndWait(const std::vector& tasks) override { + m_executor->run_and_wait(tasks); + } + int GetStreamId() override { + return m_executor->get_stream_id(); + } + + int GetNumaNodeId() override { + return m_executor->get_numa_node_id(); + } + + void Execute(Task task) override { + m_executor->execute(task); + } }; } // namespace -ExecutorManagerImpl::~ExecutorManagerImpl() { - resetTbb(); -} +ExecutorManagerImpl::ExecutorManagerImpl(const std::shared_ptr& manager) + : m_manager(manager) {} void ExecutorManagerImpl::setTbbFlag(bool flag) { - std::lock_guard guard(tbbMutex); - tbbTerminateFlag = flag; -#if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO - if (tbbTerminateFlag) { - if (!tbbTaskScheduler) { -# if (TBB_INTERFACE_VERSION < 12000) - tbbTaskScheduler = std::make_shared(); -# elif (TBB_INTERFACE_VERSION < 12060) - tbbTaskScheduler = - std::make_shared(oneapi::tbb::task_scheduler_handle::get()); -# else - tbbTaskScheduler = std::make_shared(tbb::attach{}); -# endif - } - } else { - tbbTaskScheduler = nullptr; - } -#endif + m_manager->set_property({{ov::force_tbb_terminate.name(), flag}}); } bool ExecutorManagerImpl::getTbbFlag() { - std::lock_guard guard(tbbMutex); - return tbbTerminateFlag; -} - -void ExecutorManagerImpl::resetTbb() { - std::lock_guard guard(tbbMutex); - if (tbbTerminateFlag) { -#if IE_THREAD == IE_THREAD_TBB || IE_THREAD == 
IE_THREAD_TBB_AUTO - if (tbbTaskScheduler && tbbThreadsCreated) { -# if (TBB_INTERFACE_VERSION < 12000) - tbbTaskScheduler->terminate(); -# else - tbb::finalize(*tbbTaskScheduler, std::nothrow); -# endif - } - tbbThreadsCreated = false; - tbbTaskScheduler = nullptr; -#endif - tbbTerminateFlag = false; - } + return m_manager->get_property(ov::force_tbb_terminate.name()).as(); } ITaskExecutor::Ptr ExecutorManagerImpl::getExecutor(const std::string& id) { - std::lock_guard guard(taskExecutorMutex); - auto foundEntry = executors.find(id); - if (foundEntry == executors.end()) { - auto newExec = std::make_shared(IStreamsExecutor::Config{id}); - tbbThreadsCreated = true; - executors[id] = newExec; - return newExec; - } - return foundEntry->second; + return std::make_shared(m_manager->get_executor(id)); } IStreamsExecutor::Ptr ExecutorManagerImpl::getIdleCPUStreamsExecutor(const IStreamsExecutor::Config& config) { - std::lock_guard guard(streamExecutorMutex); - for (const auto& it : cpuStreamsExecutors) { - const auto& executor = it.second; - if (executor.use_count() != 1) - continue; - - const auto& executorConfig = it.first; - if (executorConfig._name == config._name && executorConfig._streams == config._streams && - executorConfig._threadsPerStream == config._threadsPerStream && - executorConfig._threadBindingType == config._threadBindingType && - executorConfig._threadBindingStep == config._threadBindingStep && - executorConfig._threadBindingOffset == config._threadBindingOffset) - if (executorConfig._threadBindingType != IStreamsExecutor::ThreadBindingType::HYBRID_AWARE || - executorConfig._threadPreferredCoreType == config._threadPreferredCoreType) - return executor; - } - auto newExec = std::make_shared(config); - tbbThreadsCreated = true; - cpuStreamsExecutors.emplace_back(std::make_pair(config, newExec)); - return newExec; + return std::make_shared(m_manager->get_idle_cpu_streams_executor(config)); } size_t ExecutorManagerImpl::getExecutorsNumber() const { - 
std::lock_guard guard(taskExecutorMutex); - return executors.size(); + return m_manager->get_executors_number(); } size_t ExecutorManagerImpl::getIdleCPUStreamsExecutorsNumber() const { - std::lock_guard guard(streamExecutorMutex); - return cpuStreamsExecutors.size(); + return m_manager->get_idle_cpu_streams_executors_number(); } void ExecutorManagerImpl::clear(const std::string& id) { - std::lock_guard stream_guard(streamExecutorMutex); - std::lock_guard task_guard(taskExecutorMutex); - if (id.empty()) { - executors.clear(); - cpuStreamsExecutors.clear(); - } else { - executors.erase(id); - cpuStreamsExecutors.erase( - std::remove_if(cpuStreamsExecutors.begin(), - cpuStreamsExecutors.end(), - [&](const std::pair& it) { - return it.first._name == id; - }), - cpuStreamsExecutors.end()); - } + return m_manager->clear(id); +} + +std::shared_ptr create_old_manager( + const std::shared_ptr& manager) { + return std::make_shared(manager); } namespace { @@ -179,7 +137,7 @@ class ExecutorManagerHolder { std::lock_guard lock(_mutex); auto manager = _manager.lock(); if (!manager) { - _manager = manager = std::make_shared(); + _manager = manager = create_old_manager(ov::threading::executor_manager()); } return manager; } diff --git a/src/inference/src/threading/ie_istreams_executor.cpp b/src/inference/src/threading/ie_istreams_executor.cpp index 87529594c45ad6..e78cc8cb0fae4e 100644 --- a/src/inference/src/threading/ie_istreams_executor.cpp +++ b/src/inference/src/threading/ie_istreams_executor.cpp @@ -23,463 +23,31 @@ namespace InferenceEngine { IStreamsExecutor::~IStreamsExecutor() {} std::vector IStreamsExecutor::Config::SupportedKeys() const { - return { - CONFIG_KEY(CPU_THROUGHPUT_STREAMS), - CONFIG_KEY(CPU_BIND_THREAD), - CONFIG_KEY(CPU_THREADS_NUM), - CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM), - CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS), - CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS), - CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG), - 
CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL), - CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET), - CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD), - ov::num_streams.name(), - ov::inference_num_threads.name(), - ov::affinity.name(), - }; + return get_property(ov::supported_properties.name()).as>(); } int IStreamsExecutor::Config::GetDefaultNumStreams(const bool enable_hyper_thread) { - const int sockets = static_cast(getAvailableNUMANodes().size()); - // bare minimum of streams (that evenly divides available number of core) - const int num_cores = sockets == 1 ? (enable_hyper_thread ? parallel_get_max_threads() : getNumberOfCPUCores()) - : getNumberOfCPUCores(); - if (0 == num_cores % 4) - return std::max(4, num_cores / 4); - else if (0 == num_cores % 5) - return std::max(5, num_cores / 5); - else if (0 == num_cores % 3) - return std::max(3, num_cores / 3); - else // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide - return 1; + return get_default_num_streams(enable_hyper_thread); } int IStreamsExecutor::Config::GetHybridNumStreams(std::map& config, const int stream_mode) { - const int num_cores = parallel_get_max_threads(); - const int num_cores_phy = getNumberOfCPUCores(); - const int num_big_cores_phy = getNumberOfCPUCores(true); - const int num_small_cores = num_cores_phy - num_big_cores_phy; - const int num_big_cores = num_cores > num_cores_phy ? 
num_big_cores_phy * 2 : num_big_cores_phy; - int big_core_streams = 0; - int small_core_streams = 0; - int threads_per_stream_big = 0; - int threads_per_stream_small = 0; - - if (stream_mode == DEFAULT) { - // bare minimum of streams (that evenly divides available number of core) - if (0 == num_big_cores_phy % 4) { - threads_per_stream_big = 4; - } else if (0 == num_big_cores_phy % 5) { - threads_per_stream_big = 5; - } else if (0 == num_big_cores_phy % 3) { - threads_per_stream_big = 3; - } else { // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide - threads_per_stream_big = num_big_cores_phy; - } - - big_core_streams = num_big_cores / threads_per_stream_big; - threads_per_stream_small = threads_per_stream_big; - if (num_small_cores == 0) { - threads_per_stream_small = 0; - } else if (num_small_cores < threads_per_stream_small) { - small_core_streams = 1; - threads_per_stream_small = num_small_cores; - threads_per_stream_big = threads_per_stream_small; - // Balance the computation of physical core and logical core, the number of threads on the physical core and - // logical core should be equal - big_core_streams = num_big_cores_phy / threads_per_stream_big * 2; - } else { - small_core_streams = num_small_cores / threads_per_stream_small; - } - } else if (stream_mode == AGGRESSIVE) { - big_core_streams = num_big_cores; - small_core_streams = num_small_cores; - threads_per_stream_big = num_big_cores / big_core_streams; - threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / small_core_streams; - } else if (stream_mode == LESSAGGRESSIVE) { - big_core_streams = num_big_cores / 2; - small_core_streams = num_small_cores / 2; - threads_per_stream_big = num_big_cores / big_core_streams; - threads_per_stream_small = num_small_cores == 0 ? 
0 : num_small_cores / small_core_streams; - } else { - IE_THROW() << "Wrong stream mode to get num of streams: " << stream_mode; - } - config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(big_core_streams); - config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(small_core_streams); - config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(threads_per_stream_big); - config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(threads_per_stream_small); - // This is default setting for specific CPU which Pcore is in front and Ecore is in the back. - config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(num_small_cores == 0 ? 0 : num_big_cores); - return big_core_streams + small_core_streams; + return get_hybrid_num_streams(config, stream_mode); } void IStreamsExecutor::Config::SetConfig(const std::string& key, const std::string& value) { - if (key == CONFIG_KEY(CPU_BIND_THREAD)) { - if (value == CONFIG_VALUE(YES) || value == CONFIG_VALUE(NUMA)) { -#if (defined(__APPLE__) || defined(_WIN32)) - _threadBindingType = IStreamsExecutor::ThreadBindingType::NUMA; -#else - _threadBindingType = (value == CONFIG_VALUE(YES)) ? IStreamsExecutor::ThreadBindingType::CORES - : IStreamsExecutor::ThreadBindingType::NUMA; -#endif - } else if (value == CONFIG_VALUE(HYBRID_AWARE)) { - _threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE; - } else if (value == CONFIG_VALUE(NO)) { - _threadBindingType = IStreamsExecutor::ThreadBindingType::NONE; - } else { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_BIND_THREAD) - << ". 
Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes) / " - "HYBRID_AWARE (let the runtime recognize and use the hybrid cores)"; - } - } else if (key == ov::affinity) { - ov::Affinity affinity; - std::stringstream{value} >> affinity; - switch (affinity) { - case ov::Affinity::NONE: - _threadBindingType = ThreadBindingType::NONE; - break; - case ov::Affinity::CORE: { -#if (defined(__APPLE__) || defined(_WIN32)) - _threadBindingType = ThreadBindingType::NUMA; -#else - _threadBindingType = ThreadBindingType::CORES; -#endif - } break; - case ov::Affinity::NUMA: - _threadBindingType = ThreadBindingType::NUMA; - break; - case ov::Affinity::HYBRID_AWARE: - _threadBindingType = ThreadBindingType::HYBRID_AWARE; - break; - default: - OPENVINO_UNREACHABLE("Unsupported affinity type"); - } - } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { - if (value == CONFIG_VALUE(CPU_THROUGHPUT_NUMA)) { - _streams = static_cast(getAvailableNUMANodes().size()); - } else if (value == CONFIG_VALUE(CPU_THROUGHPUT_AUTO)) { - // bare minimum of streams (that evenly divides available number of cores) - _streams = GetDefaultNumStreams(); - } else { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS) - << ". Expected only positive numbers (#streams) or " - << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS) - << ". 
Expected only positive numbers (#streams)"; - } - _streams = val_i; - } - } else if (key == ov::num_streams) { - auto streams = ov::util::from_string(value, ov::streams::num); - if (streams == ov::streams::NUMA) { - _streams = static_cast(getAvailableNUMANodes().size()); - } else if (streams == ov::streams::AUTO) { - // bare minimum of streams (that evenly divides available number of cores) - _streams = GetDefaultNumStreams(); - } else if (streams.num >= 0) { - _streams = streams.num; - } else { - OPENVINO_UNREACHABLE("Wrong value for property key ", - ov::num_streams.name(), - ". Expected non negative numbers (#streams) or ", - "ov::streams::NUMA|ov::streams::AUTO, Got: ", - streams); - } - } else if (key == CONFIG_KEY(CPU_THREADS_NUM) || key == ov::inference_num_threads) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM) - << ". Expected only positive numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM) - << ". Expected only positive numbers (#threads)"; - } - _threads = val_i; - } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM) - << ". Expected only non negative numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM) - << ". Expected only non negative numbers (#threads)"; - } - _threadsPerStream = val_i; - } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS) - << ". 
Expected only non negative numbers (#streams)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS) - << ". Expected only non negative numbers (#streams)"; - } - _big_core_streams = val_i; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS) - << ". Expected only non negative numbers (#streams)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS) - << ". Expected only non negative numbers (#streams)"; - } - _small_core_streams = val_i; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG) - << ". Expected only non negative numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG) - << ". Expected only non negative numbers (#threads)"; - } - _threads_per_stream_big = val_i; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL) - << ". Expected only non negative numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL) - << ". 
Expected only non negative numbers (#threads)"; - } - _threads_per_stream_small = val_i; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET) - << ". Expected only non negative numbers"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET) - << ". Expected only non negative numbers"; - } - _small_core_offset = val_i; - } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) { - if (value == CONFIG_VALUE(YES)) { - _enable_hyper_thread = true; - } else if (value == CONFIG_VALUE(NO)) { - _enable_hyper_thread = false; - } else { - OPENVINO_UNREACHABLE("Unsupported enable hyper thread type"); - } - } else { - IE_THROW() << "Wrong value for property key " << key; - } + set_property(key, value); } Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) const { - if (key == ov::affinity) { - switch (_threadBindingType) { - case IStreamsExecutor::ThreadBindingType::NONE: - return ov::Affinity::NONE; - case IStreamsExecutor::ThreadBindingType::CORES: - return ov::Affinity::CORE; - case IStreamsExecutor::ThreadBindingType::NUMA: - return ov::Affinity::NUMA; - case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: - return ov::Affinity::HYBRID_AWARE; - } - } else if (key == CONFIG_KEY(CPU_BIND_THREAD)) { - switch (_threadBindingType) { - case IStreamsExecutor::ThreadBindingType::NONE: - return {CONFIG_VALUE(NO)}; - case IStreamsExecutor::ThreadBindingType::CORES: - return {CONFIG_VALUE(YES)}; - case IStreamsExecutor::ThreadBindingType::NUMA: - return {CONFIG_VALUE(NUMA)}; - case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: - return {CONFIG_VALUE(HYBRID_AWARE)}; - } - } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { - return {std::to_string(_streams)}; - } else if (key == ov::num_streams) { - return 
decltype(ov::num_streams)::value_type{_streams}; - } else if (key == CONFIG_KEY(CPU_THREADS_NUM)) { - return {std::to_string(_threads)}; - } else if (key == ov::inference_num_threads) { - return decltype(ov::inference_num_threads)::value_type{_threads}; - } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { - return {std::to_string(_threadsPerStream)}; - } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) { - return {std::to_string(_big_core_streams)}; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) { - return {std::to_string(_small_core_streams)}; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) { - return {std::to_string(_threads_per_stream_big)}; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) { - return {std::to_string(_threads_per_stream_small)}; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) { - return {std::to_string(_small_core_offset)}; - } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) { - return {_enable_hyper_thread ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO)}; - } else { - IE_THROW() << "Wrong value for property key " << key; - } - return {}; + return get_property(key); } void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) { - const auto num_cores = parallel_get_max_threads(); - const auto num_cores_phys = getNumberOfCPUCores(); - const auto num_big_cores_phys = getNumberOfCPUCores(true); - const auto num_big_cores = num_cores > num_cores_phys ? num_big_cores_phys * 2 : num_big_cores_phys; - const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys; - const auto threads = config._threads ? config._threads : num_cores; - const auto streams = config._streams > 0 ? 
config._streams : 1; - - config._small_core_offset = num_big_cores; - int threads_per_stream = std::max(1, threads / streams); - - if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) { - config._big_core_streams = streams; - config._threads_per_stream_big = threads_per_stream; - config._small_core_streams = 0; - config._threads_per_stream_small = 0; - } else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) { - config._big_core_streams = 0; - config._threads_per_stream_big = 0; - config._small_core_streams = streams; - config._threads_per_stream_small = threads_per_stream; - } else { - const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream); - const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream); - - threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small); - while (threads_per_stream > 1) { - const int base_big_streams = num_big_cores_phys / threads_per_stream; - const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0; - if (base_big_streams + base_small_streams >= streams) { - config._big_core_streams = base_big_streams; - config._small_core_streams = streams - base_big_streams; - break; - } else if (base_big_streams * 2 + base_small_streams >= streams) { - config._big_core_streams = streams - base_small_streams; - config._small_core_streams = base_small_streams; - break; - } else { - threads_per_stream = threads_per_stream > 1 ? 
threads_per_stream - 1 : 1; - } - } - - if (threads_per_stream == 1) { - const int stream_loops = streams / num_cores; - const int remain_streams = streams - stream_loops * num_cores; - if (num_big_cores_phys >= remain_streams) { - config._big_core_streams = remain_streams + num_big_cores * stream_loops; - config._small_core_streams = num_small_cores_phys * stream_loops; - } else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) { - config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops; - config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops; - } else { - config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops; - config._small_core_streams = num_small_cores_phys * (stream_loops + 1); - } - } - - config._threads_per_stream_big = threads_per_stream; - config._threads_per_stream_small = threads_per_stream; - } + return update_hybrid_custom_threads(config); } IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial, const bool fp_intesive) { - const auto envThreads = parallel_get_env_threads(); - const auto& numaNodes = getAvailableNUMANodes(); - const int numaNodesNum = static_cast(numaNodes.size()); - auto streamExecutorConfig = initial; - const bool bLatencyCase = streamExecutorConfig._streams <= numaNodesNum; - - // by default, do not use the hyper-threading (to minimize threads synch overheads) - int num_cores_default = getNumberOfCPUCores(); -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - // additional latency-case logic for hybrid processors: - if (ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) { - const auto core_types = custom::info::core_types(); - const auto num_little_cores = - custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.front())); - const auto num_big_cores_phys = 
getNumberOfCPUCores(true); - const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; - const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; - // by default the latency case uses (faster) Big cores only, depending on the compute ratio - const bool bLatencyCaseBigOnly = - num_big_cores_phys > (num_little_cores / (fp_intesive ? fp32_threshold : int8_threshold)); - // selecting the preferred core type - streamExecutorConfig._threadPreferredCoreType = - bLatencyCase ? (bLatencyCaseBigOnly ? IStreamsExecutor::Config::PreferredCoreType::BIG - : IStreamsExecutor::Config::PreferredCoreType::ANY) - : IStreamsExecutor::Config::PreferredCoreType::ROUND_ROBIN; - // additionally selecting the #cores to use in the "Big-only" case - if (bLatencyCaseBigOnly) { - const int hyper_threading_threshold = - 2; // min #cores, for which the hyper-threading becomes useful for the latency case - const auto num_big_cores = - custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back())); - num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? 
num_big_cores : num_big_cores_phys; - } - // if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here - if (!bLatencyCase && (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads)) { - UpdateHybridCustomThreads(streamExecutorConfig); - } - OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "(" - << streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams + - streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams - << ") -- PCore: " << streamExecutorConfig._big_core_streams << "(" - << streamExecutorConfig._threads_per_stream_big - << ") ECore: " << streamExecutorConfig._small_core_streams << "(" - << streamExecutorConfig._threads_per_stream_small << ")"; - } -#endif - const auto hwCores = - !bLatencyCase && numaNodesNum == 1 - // throughput case on a single-NUMA node machine uses all available cores - ? (streamExecutorConfig._enable_hyper_thread ? parallel_get_max_threads() : num_cores_default) - // in the rest of cases: - // multi-node machine - // or - // latency case, single-node yet hybrid case that uses - // all core types - // or - // big-cores only, but the #cores is "enough" (pls see the logic above) - // it is usually beneficial not to use the hyper-threading (which is default) - : num_cores_default; - const auto threads = - streamExecutorConfig._threads ? streamExecutorConfig._threads : (envThreads ? envThreads : hwCores); - streamExecutorConfig._threadsPerStream = - streamExecutorConfig._streams ? std::max(1, threads / streamExecutorConfig._streams) : threads; - streamExecutorConfig._threads = - (!bLatencyCase && ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) - ? 
streamExecutorConfig._big_core_streams * streamExecutorConfig._threads_per_stream_big + - streamExecutorConfig._small_core_streams * streamExecutorConfig._threads_per_stream_small - : streamExecutorConfig._threadsPerStream * streamExecutorConfig._streams; - return streamExecutorConfig; + return make_default_multi_threaded(initial); } } // namespace InferenceEngine diff --git a/src/inference/src/threading/ie_itask_executor.cpp b/src/inference/src/threading/ie_itask_executor.cpp index f75279dfa449ab..8e6bf89f389981 100644 --- a/src/inference/src/threading/ie_itask_executor.cpp +++ b/src/inference/src/threading/ie_itask_executor.cpp @@ -12,27 +12,7 @@ namespace InferenceEngine { void ITaskExecutor::runAndWait(const std::vector& tasks) { - std::vector> packagedTasks; - std::vector> futures; - for (std::size_t i = 0; i < tasks.size(); ++i) { - packagedTasks.emplace_back([&tasks, i] { - tasks[i](); - }); - futures.emplace_back(packagedTasks.back().get_future()); - } - for (std::size_t i = 0; i < tasks.size(); ++i) { - run([&packagedTasks, i] { - packagedTasks[i](); - }); - } - // std::future::get will rethrow exception from task. - // We should wait all tasks before any exception is thrown. 
- // So wait() and get() for each future moved to separate loops - for (auto&& future : futures) { - future.wait(); - } - for (auto&& future : futures) { - future.get(); - } + run_and_wait(tasks); } + } // namespace InferenceEngine diff --git a/src/inference/tests/unit/cpu_map_parser.cpp b/src/inference/tests/unit/cpu_map_parser.cpp index d2693c87ff9983..20f8ace1862eb7 100644 --- a/src/inference/tests/unit/cpu_map_parser.cpp +++ b/src/inference/tests/unit/cpu_map_parser.cpp @@ -10,7 +10,7 @@ #include "streams_executor.hpp" using namespace testing; -using namespace InferenceEngine; +using namespace ov; namespace { @@ -36,12 +36,12 @@ class LinuxCpuMapParserTests : public CommonTestUtils::TestsCommon, std::vector> test_proc_type_table; std::vector> test_cpu_mapping_table; - InferenceEngine::parse_processor_info_linux(test_data._processors, - test_data.system_info_table, - test_sockets, - test_cores, - test_proc_type_table, - test_cpu_mapping_table); + ov::parse_processor_info_linux(test_data._processors, + test_data.system_info_table, + test_sockets, + test_cores, + test_proc_type_table, + test_cpu_mapping_table); ASSERT_EQ(test_data._sockets, test_sockets); ASSERT_EQ(test_data._cores, test_cores); @@ -629,13 +629,13 @@ class WinCpuMapParserTests : public CommonTestUtils::TestsCommon, std::vector> test_proc_type_table; std::vector> test_cpu_mapping_table; - parse_processor_info_win(test_info_ptr, - len, - test_data._processors, - test_sockets, - test_cores, - test_proc_type_table, - test_cpu_mapping_table); + ov::parse_processor_info_win(test_info_ptr, + len, + test_data._processors, + test_sockets, + test_cores, + test_proc_type_table, + test_cpu_mapping_table); ASSERT_EQ(test_data._sockets, test_sockets); ASSERT_EQ(test_data._cores, test_cores); diff --git a/src/inference/tests/unit/ie_executor_manager_tests.cpp b/src/inference/tests/unit/ie_executor_manager_tests.cpp index 42035ac2a5f389..a419777c4c1d60 100644 --- 
a/src/inference/tests/unit/ie_executor_manager_tests.cpp +++ b/src/inference/tests/unit/ie_executor_manager_tests.cpp @@ -4,36 +4,34 @@ #include -#include +#include "openvino/runtime/threading/executor_manager.hpp" using namespace ::testing; -using namespace std; -using namespace InferenceEngine; TEST(ExecutorManagerTests, canCreateSingleExecutorManager) { - auto executorManager1 = executorManager(); + auto executorManager1 = ov::threading::executor_manager(); - auto executorManager2 = executorManager(); + auto executorManager2 = ov::threading::executor_manager(); ASSERT_EQ(executorManager1, executorManager2); } TEST(ExecutorManagerTests, createDifferentExecutorsForDifferentDevices) { - auto executorMgr = executorManager(); - auto executor1 = executorMgr->getExecutor("CPU"); - auto executor2 = executorMgr->getExecutor("GPU"); + auto executorMgr = ov::threading::executor_manager(); + auto executor1 = executorMgr->get_executor("CPU"); + auto executor2 = executorMgr->get_executor("GPU"); ASSERT_NE(executor1, executor2); - ASSERT_EQ(2, executorMgr->getExecutorsNumber()); + ASSERT_EQ(2, executorMgr->get_executors_number()); } TEST(ExecutorManagerTests, returnTheSameExecutorForTheSameDevice) { - auto executorMgr = executorManager(); - auto executor1 = executorMgr->getExecutor("CPU"); - auto executor2 = executorMgr->getExecutor("GPU"); + auto executorMgr = ov::threading::executor_manager(); + auto executor1 = executorMgr->get_executor("CPU"); + auto executor2 = executorMgr->get_executor("GPU"); - auto executor = executorMgr->getExecutor("GPU"); + auto executor = executorMgr->get_executor("GPU"); ASSERT_EQ(executor, executor2); - ASSERT_EQ(2, executorMgr->getExecutorsNumber()); + ASSERT_EQ(2, executorMgr->get_executors_number()); } diff --git a/src/plugins/auto/CMakeLists.txt b/src/plugins/auto/CMakeLists.txt index fbca1d5c43f19b..ed24e998a5f421 100644 --- a/src/plugins/auto/CMakeLists.txt +++ b/src/plugins/auto/CMakeLists.txt @@ -38,6 +38,7 @@ endif() 
set_ie_threading_interface_for(${TARGET_NAME}) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/auto/infer_request.cpp b/src/plugins/auto/infer_request.cpp index 3a0fbe1052b5ae..2453ce60c52881 100644 --- a/src/plugins/auto/infer_request.cpp +++ b/src/plugins/auto/infer_request.cpp @@ -91,12 +91,14 @@ void MultiDeviceInferRequest::SetBlob(const std::string& name, const InferenceEn IInferRequestInternal::SetBlob(name, blob); } +IE_SUPPRESS_DEPRECATED_START void MultiDeviceInferRequest::SetBlob(const std::string& name, const Blob::Ptr& blob, const PreProcessInfo& info) { if (_sharedRequest) _sharedRequest->SetBlob(name, blob, info); else IInferRequestInternal::SetBlob(name, blob, info); } +IE_SUPPRESS_DEPRECATED_END InferenceEngine::Blob::Ptr MultiDeviceInferRequest::GetBlob(const std::string& name) { if (_sharedRequest) @@ -124,8 +126,4 @@ std::vector> MultiDevic IE_THROW(NotImplemented); } -void MultiDeviceInferRequest::InferImpl() { - IE_THROW(NotImplemented); -} - } // namespace MultiDevicePlugin diff --git a/src/plugins/auto/infer_request.hpp b/src/plugins/auto/infer_request.hpp index 1502b42e6c3fa6..d540fd686ba8c6 100644 --- a/src/plugins/auto/infer_request.hpp +++ b/src/plugins/auto/infer_request.hpp @@ -38,8 +38,11 @@ class MultiDeviceInferRequest : public InferenceEngine::IInferRequestInternal { const InferenceEngine::SoIInferRequestInternal & request_to_share_blobs_with, InferenceEngine::RemoteContext::Ptr ctx = nullptr); std::map GetPerformanceCounts() const override; - void InferImpl() override; void SetBlob(const std::string& name, const InferenceEngine::Blob::Ptr& blob) override; + /** + * @deprecated This method will be removed in 2024.1 release + * @brief Sets blob with a pre-process information + */ void SetBlob(const std::string& name, const 
InferenceEngine::Blob::Ptr& blob, const InferenceEngine::PreProcessInfo& info) override; diff --git a/src/plugins/auto/plugin.cpp b/src/plugins/auto/plugin.cpp index ffcef3822a9ce8..4d643363aa8edd 100644 --- a/src/plugins/auto/plugin.cpp +++ b/src/plugins/auto/plugin.cpp @@ -961,7 +961,9 @@ std::vector MultiDeviceInferencePlugin::FilterDeviceByNetwork }); // If CPU is in candidate list, load dynamic network to CPU first - if ((model->is_dynamic() || isStateful()) && cpuiter != metaDevices.end()) { + // For MULTI do not only load stateful network to CPU + // For AUTO CTPUT only load stateful network to CPU + if ((model->is_dynamic() || (isStateful() && _LogTag != "MULTI")) && cpuiter != metaDevices.end()) { filterDevice.push_back(*cpuiter); return filterDevice; } diff --git a/src/plugins/auto/plugin.hpp b/src/plugins/auto/plugin.hpp index 41bf2957b92411..2f91b536bd56f8 100644 --- a/src/plugins/auto/plugin.hpp +++ b/src/plugins/auto/plugin.hpp @@ -70,7 +70,7 @@ class MultiDeviceInferencePlugin : public InferenceEngine::IInferencePlugin { std::vector FilterDevice(const std::vector& metaDevices, const std::map& config); std::vector FilterDeviceByNetwork(const std::vector& metaDevices, - InferenceEngine::CNNNetwork network); + InferenceEngine::CNNNetwork network); std::string GetLogTag() const noexcept; static std::mutex _mtx; static std::map> _priorityMap; diff --git a/src/plugins/auto_batch/CMakeLists.txt b/src/plugins/auto_batch/CMakeLists.txt index 9b34bdcc2a405e..edd4e619b59e0b 100644 --- a/src/plugins/auto_batch/CMakeLists.txt +++ b/src/plugins/auto_batch/CMakeLists.txt @@ -20,6 +20,7 @@ ie_add_plugin(NAME ${TARGET_NAME} target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/hetero/CMakeLists.txt 
b/src/plugins/hetero/CMakeLists.txt index 17035a9a3e4052..da48fcc6e88bdd 100644 --- a/src/plugins/hetero/CMakeLists.txt +++ b/src/plugins/hetero/CMakeLists.txt @@ -24,6 +24,7 @@ ie_faster_build(${TARGET_NAME} target_link_libraries(${TARGET_NAME} PRIVATE openvino::pugixml) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index a54bd48b6332ca..380dfa1ff4daac 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -12,7 +12,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) ie_add_compiler_flags(-Wno-sign-compare) ie_add_compiler_flags(-Wno-sequence-point) ie_add_compiler_flags(-Wno-strict-aliasing) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-class-memaccess") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") # C4267, 4244 issues from mkl-dnn headers conversion from 'XXX' to 'YYY', possible loss of data ie_add_compiler_flags(/wd4267) @@ -78,6 +77,7 @@ cross_compiled_file(${TARGET_NAME} NAMESPACE InferenceEngine::Extensions::Cpu::XARCH ) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) # add test object library diff --git a/src/plugins/intel_cpu/src/utils/bfloat16.hpp b/src/plugins/intel_cpu/src/utils/bfloat16.hpp index 6fbd2875a22ac8..7e190e756c9ace 100644 --- a/src/plugins/intel_cpu/src/utils/bfloat16.hpp +++ b/src/plugins/intel_cpu/src/utils/bfloat16.hpp @@ -20,10 +20,7 @@ namespace intel_cpu { class bfloat16_t { public: - constexpr bfloat16_t() - : m_value{0} - { - } + bfloat16_t() = default; bfloat16_t(float value) noexcept : m_value{ #if defined BFLOAT16_ROUND_MODE_TO_NEAREST diff --git a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp index 
607a37478a4d5b..1eb82f84becef8 100644 --- a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp @@ -78,14 +78,6 @@ namespace ov { namespace intel_cpu { -void shape_inference(ov::Node* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map& constant_data) { - auto shapeInfer = make_shape_inference(op->shared_from_this()); - output_shapes = shapeInfer->infer(input_shapes, constant_data); -} - class entryBase : public IShapeInferCommon { public: using iface_type = IShapeInferCommon; diff --git a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp index 56f00c6460b256..9e307e6fc871ff 100644 --- a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp +++ b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp @@ -13,11 +13,6 @@ namespace ov { namespace intel_cpu { -void shape_inference(ov::Node* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map& constant_data = {}); - class IShapeInferCommon { public: virtual std::vector infer(const std::vector& input_shapes, diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp index 7a160f0cd04bdb..20b71e49eb627c 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp @@ -10,44 +10,37 @@ using namespace BehaviorTestsDefinitions; namespace { -InferenceEngine::CNNNetwork getNetwork() { - ngraph::Shape shape = {1, 200}; - ngraph::element::Type type = ngraph::element::f32; - - auto input = std::make_shared(type, shape); - auto mem_i1 = 
std::make_shared(type, shape, 0); - auto mem_r1 = std::make_shared(mem_i1, "r_1-3"); - auto mul1 = std::make_shared(mem_r1, input); - - auto mem_i2 = std::make_shared(type, shape, 0); - auto mem_r2 = std::make_shared(mem_i2, "c_1-3"); - auto mul2 = std::make_shared(mem_r2, mul1); - auto mem_w2 = std::make_shared(mul2, "c_1-3"); - - auto mem_w1 = std::make_shared(mul2, "r_1-3"); - auto sigm = std::make_shared(mul2); - sigm->set_friendly_name("sigmod_state"); - mem_r1->set_friendly_name("Memory_1"); - mem_w1->add_control_dependency(mem_r1); - sigm->add_control_dependency(mem_w1); - - mem_r2->set_friendly_name("Memory_2"); - mem_w2->add_control_dependency(mem_r2); - sigm->add_control_dependency(mem_w2); - - auto function = std::make_shared(ngraph::NodeVector{sigm}, ngraph::ParameterVector{input}, "addOutput"); - return InferenceEngine::CNNNetwork{function}; -} - std::vector memoryStateTestCases = { - memoryStateParams(getNetwork(), {"c_1-3", "r_1-3"}, CommonTestUtils::DEVICE_CPU, {}), - memoryStateParams(getNetwork(), {"c_1-3", "r_1-3"}, CommonTestUtils::DEVICE_AUTO, - {{MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_CPU}}), - memoryStateParams(getNetwork(), {"c_1-3", "r_1-3"}, CommonTestUtils::DEVICE_HETERO, - {{MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_CPU}}) -}; - -INSTANTIATE_TEST_SUITE_P(smoke_VariableStateBasic, InferRequestVariableStateTest, + memoryStateParams(InferRequestVariableStateTest::getNetwork(), {"c_1-3", "r_1-3"}, CommonTestUtils::DEVICE_CPU, {}), + memoryStateParams(InferRequestVariableStateTest::getNetwork(), + {"c_1-3", "r_1-3"}, + CommonTestUtils::DEVICE_HETERO, + {{MULTI_CONFIG_KEY(DEVICE_PRIORITIES), CommonTestUtils::DEVICE_CPU}})}; + +std::vector memoryStateAutoTestCases = { + memoryStateParams(InferRequestVariableStateTest::getNetwork(), + {"c_1-3", "r_1-3"}, + CommonTestUtils::DEVICE_AUTO, + {{MULTI_CONFIG_KEY(DEVICE_PRIORITIES), CommonTestUtils::DEVICE_CPU}})}; + +std::vector memoryStateMultiTestCases = { 
+ memoryStateParams(InferRequestVariableStateTest::getNetwork(), + {"c_1-3", "r_1-3"}, + CommonTestUtils::DEVICE_MULTI, + {{MULTI_CONFIG_KEY(DEVICE_PRIORITIES), CommonTestUtils::DEVICE_CPU}})}; + +INSTANTIATE_TEST_SUITE_P(smoke_VariableStateBasic, + InferRequestVariableStateTest, ::testing::ValuesIn(memoryStateTestCases), InferRequestVariableStateTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, + InferRequestVariableStateTest, + ::testing::ValuesIn(memoryStateAutoTestCases), + InferRequestVariableStateTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, + InferRequestVariableStateTest, + ::testing::ValuesIn(memoryStateMultiTestCases), + InferRequestVariableStateTest::getTestCaseName); } // namespace diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp index 441a029d21a0fd..9500ca8138f5cd 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp index 1dfffe43b38484..311e43dc634bbf 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp @@ -2,14 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/augru_cell.hpp" - #include -#include -#include -#include -#include +#include "ov_ops/augru_cell.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp index 962bd6402c20fd..55cb4958110d27 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp @@ -2,14 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/augru_sequence.hpp" - #include -#include -#include -#include -#include +#include "ov_ops/augru_sequence.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference.cpp deleted file mode 100644 index bb168118f82861..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include -#include -#include - -using namespace ov; -using namespace ov::intel_cpu; - -static std::shared_ptr make_batch_to_space( - PartialShape data_shape = PartialShape::dynamic(ov::Rank(2)), - PartialShape block_shape = PartialShape::dynamic(), - PartialShape crops_begin_shape = PartialShape::dynamic(), - PartialShape crops_end_shape = PartialShape::dynamic()) { - auto data = std::make_shared(element::f32, data_shape); - auto block = std::make_shared(element::i32, block_shape); - auto crops_begin = std::make_shared(element::i32, crops_begin_shape); - auto crops_end = std::make_shared(element::i32, crops_end_shape); - - const auto batch_to_space = std::make_shared(data, block, crops_begin, crops_end); - return batch_to_space; -} - -TEST(StaticShapeInferenceTest, BatchToSpaceWithHostTensorData) { - auto space_to_batch = 
make_batch_to_space(); - int32_t block_val[] = {1, 6, 5, 1, 16}; - int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; - int32_t pads_end_val[] = {0, 2, 1, 0, 0}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - auto crops_begin = std::make_shared(element::i32, ov::Shape{5}, pads_begin_val); - auto crops_end = std::make_shared(element::i32, ov::Shape{5}, pads_end_val); - - const std::vector input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - std::map> constant_data; - constant_data[1] = block; - constant_data[2] = crops_begin; - constant_data[3] = crops_end; - - shape_inference(space_to_batch.get(), input_shapes, output_shapes, constant_data); - ASSERT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); -} - -TEST(StaticShapeInferenceTest, BatchToSpaceWithMissingTensorData) { - auto batch_to_space = make_batch_to_space(); - int32_t block_val[] = {1, 6, 5, 1, 16}; - int32_t pads_end_val[] = {0, 2, 1, 0, 0}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - auto crops_end = std::make_shared(element::i32, ov::Shape{5}, pads_end_val); - - const std::vector input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - std::map> constant_data; - constant_data[1] = block; - constant_data[3] = crops_end; - - EXPECT_THROW(shape_inference(batch_to_space.get(), input_shapes, output_shapes, constant_data), NodeValidationFailure); -} - -TEST(StaticShapeInferenceTest, batch_to_space_output_with_const_inputs) { - auto data = std::make_shared(element::f32, ov::PartialShape{-1, -1, -1, -1}); - auto block_shape = std::make_shared(element::i64, ov::Shape{4}, std::vector{1, 10, 5, 1}); - auto crops_begin = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 3, 1, 0}); - auto crops_end = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 3, 0, 0}); - const auto batch_to_space = 
std::make_shared(data, block_shape, crops_begin, crops_end); - std::vector input_shapes = {{100, 7, 13, 3}, {4}, {4}, {4}}; - std::vector output_shapes = {{}}; - shape_inference(batch_to_space.get(), input_shapes, output_shapes); - - ASSERT_EQ(output_shapes[0], (StaticShape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference_test.cpp new file mode 100644 index 00000000000000..a79f3fd98a41d6 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference_test.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class BatchToSpaceV1StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } + + std::shared_ptr make_batch_to_space_dynamic() { + const auto data = std::make_shared(element::f32, PartialShape::dynamic()); + const auto block = std::make_shared(element::i32, PartialShape::dynamic()); + const auto crops_begin = std::make_shared(element::i32, PartialShape::dynamic()); + const auto crops_end = std::make_shared(element::i32, PartialShape::dynamic()); + + return make_op(data, block, crops_begin, crops_end); + } +}; + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t crops_begin_val[] = {0, 2, 0, 0, 0}; + int32_t crops_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, crops_begin_val)}, + {3, 
std::make_shared(element::i32, Shape{5}, crops_end_val)}}; + + input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); +} + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, blocks_crops_in_constant_map) { + op = make_batch_to_space_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t crops_begin_val[] = {0, 2, 0, 0, 0}; + int32_t crops_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, crops_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, crops_end_val)}}; + + input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; + + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); +} + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, blocs_crops_as_constants) { + auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + auto block_shape = std::make_shared(element::i64, Shape{4}, std::vector{1, 10, 5, 1}); + auto crops_begin = std::make_shared(element::i64, Shape{4}, std::vector{0, 3, 1, 0}); + auto crops_end = std::make_shared(element::i64, Shape{4}, std::vector{0, 3, 0, 0}); + + op = make_op(data, block_shape, crops_begin, crops_end); + input_shapes = {{100, 7, 13, 3}, {4}, {4}, {4}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes[0], (StaticShape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); +} + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, missing_tensor_data) { + auto op = make_batch_to_space_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t crops_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + 
std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {3, std::make_shared(element::i32, Shape{5}, crops_end_val)}}; + + input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; + + EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes, constant_data), NodeValidationFailure); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp index 02091859c317b2..263062e4eced41 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp @@ -4,12 +4,7 @@ #include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp index 516bc25e1575c6..e1800a1999aa6c 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp @@ -4,14 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference.cpp deleted file mode 100644 index 04b17fbb0bfd66..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include - -using 
namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, DepthToSpaceTest) { - auto A = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(ov::Rank(4))); - auto depth_to_space = - std::make_shared(A, ov::op::v0::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - const std::vector input_shapes = {StaticShape{1, 16, 3, 1080, 1616}}; - std::vector output_shapes = {StaticShape{}}; - shape_inference(depth_to_space.get(), input_shapes, output_shapes); - ASSERT_EQ(output_shapes[0], (StaticShape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference_test.cpp new file mode 100644 index 00000000000000..d7fb9d9f4e676a --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference_test.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class DepthToSpaceV0StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + input_shapes = {StaticShape{1, 16, 3, 1080, 1616}}; + output_shapes.resize(1); + } +}; + +TEST_F(DepthToSpaceV0StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + op->set_block_size(2); + + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); +} + +TEST_F(DepthToSpaceV0StaticShapeInferenceTest, block_first) { + const auto data = std::make_shared(element::f32, PartialShape::dynamic(4)); + const auto op = make_op(data, op_type::DepthToSpaceMode::BLOCKS_FIRST, 2); + + shape_inference(op.get(), input_shapes, 
output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp index 8e1ef2a216eb76..c91c8879d83472 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp @@ -4,13 +4,8 @@ #include -#include -#include -#include - -#include "utils/shape_inference/static_shape.hpp" #include "detection_output_shape_inference.hpp" - +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp index a7eb81a5db9cb4..b77b1330ea4d31 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp @@ -4,13 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp index a8782e7fd5da47..8d4d069ec6f306 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp @@ -5,9 +5,8 @@ #include #include -#include -#include -#include + +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp index 49b22ba7ad3000..447dd142e2df9e 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp index 7d46c113ca1139..3d82a65dd453de 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp @@ -4,13 +4,7 @@ #include -#include -#include -#include -#include - -#include "utils/shape_inference/shape_inference.hpp" -#include "utils/shape_inference/static_shape.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp index 7bf7862559b46d..7ab1b7ad681034 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace 
ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp index d1c252b6b7f59f..e0fea3663e23dd 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp @@ -4,13 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp index 32c2f02c60a49e..5fdaf6680ec600 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp index 6ec856afec6b87..e55da4d19e937b 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp index c8d95fb9537aca..e5016f585118a8 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp +++ 
b/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp @@ -4,12 +4,7 @@ #include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp index 91273fc85f2577..330ed81d67ebe4 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp index 75b6c8a10cb1ca..de44f9eb384fd0 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp @@ -3,13 +3,7 @@ // #include -#include -#include -#include -#include -#include -#include - +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; using namespace testing; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp index 7fbf5d273a9d26..7e7ecaf10f54b5 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp @@ -6,11 +6,7 @@ #include "common_test_utils/test_assertions.hpp" #include "one_hot_shape_inference.hpp" - -#include "openvino/op/ops.hpp" -#include "openvino/op/parameter.hpp" -#include 
"utils/shape_inference/shape_inference.hpp" -#include "utils/shape_inference/static_shape.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp index 4a21796c0b53f4..6eee193c8fafeb 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp index bcd0a9b3c59cc7..43426ca1f2b6fd 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp @@ -3,10 +3,7 @@ // #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp index 11eaf813e2966f..9dd98765257977 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp @@ -4,12 +4,7 @@ #include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp index 16a7aba6d3c148..f9e4475a374913 100644 --- 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference.cpp deleted file mode 100644 index 33a2f0c38a9170..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include -#include - -using namespace ov; -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, ScatterElementsUpdateTest) { - auto data_shape = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); - auto indices_shape = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); - auto updates_shape = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); - auto axis_shape = std::make_shared(element::i32, PartialShape::dynamic()); - - auto scatter_elements = - std::make_shared(data_shape, indices_shape, updates_shape, axis_shape); - - int32_t axis_shape_val[] = {2}; - std::map> constant_data; - constant_data[3] = - std::make_shared(ngraph::element::Type_t::i32, Shape{1}, axis_shape_val); - std::vector input_shapes = {StaticShape{1000, 256, 7, 7}, - StaticShape{125, 20, 7, 6}, - StaticShape{125, 20, 7, 6}, - StaticShape{1}}, - output_shapes = {StaticShape{}}; - shape_inference(scatter_elements.get(), input_shapes, output_shapes, constant_data); - - ASSERT_EQ(output_shapes[0], StaticShape({1000, 256, 7, 7})); -} diff --git 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference_test.cpp new file mode 100644 index 00000000000000..4ea2cf3fef8eb8 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference_test.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/test_assertions.hpp" +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::opset10; +using namespace ov::intel_cpu; +using namespace testing; + +class ScatterElementsUpdateV3StaticShapeInferenceTest + : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } +}; + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + + int32_t axis = 1; + const auto const_data = + std::map{{3, std::make_shared(element::i32, Shape{1}, &axis)}}; + + input_shapes = ShapeVector{{1000, 256, 10, 13}, {25, 125, 3, 1}, {25, 125, 3, 1}, {1}}; + shape_inference(op.get(), input_shapes, output_shapes, const_data); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], StaticShape({1000, 256, 10, 13})); +} + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, correct_inputs_axis_as_constant) { + const auto d = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); + const auto i = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); + const auto u = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); + const auto a = std::make_shared(element::i64, Shape{}, -2); + + const auto op = make_op(d, i, u, a); + + input_shapes = ShapeVector{{2, 5, 10, 15}, {2, 1, 10, 15}, {2, 1, 10, 15}, {}}; + shape_inference(op.get(), input_shapes, output_shapes); + + 
EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], StaticShape({2, 5, 10, 15})); +} + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, params_are_dynamic_rank_axis_in_const_map) { + const auto d = std::make_shared(element::i32, PartialShape::dynamic()); + const auto i = std::make_shared(element::i32, PartialShape::dynamic()); + const auto u = std::make_shared(element::i32, PartialShape::dynamic()); + const auto a = std::make_shared(element::u32, PartialShape::dynamic()); + + const auto op = make_op(d, i, u, a); + + uint32_t axis = 2; + const auto const_data = + std::map{{3, std::make_shared(element::u32, Shape{}, &axis)}}; + + input_shapes = ShapeVector{{5000, 256, 10, 15}, {30, 25, 3, 3}, {30, 25, 3, 3}, {}}; + shape_inference(op.get(), input_shapes, output_shapes, const_data); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], StaticShape({5000, 256, 10, 15})); +} + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, incorrect_axis_value) { + const auto d = std::make_shared(element::i32, PartialShape::dynamic()); + const auto i = std::make_shared(element::i32, PartialShape::dynamic()); + const auto u = std::make_shared(element::i32, PartialShape::dynamic()); + const auto a = std::make_shared(element::u32, PartialShape::dynamic()); + + const auto op = make_op(d, i, u, a); + + uint32_t axis = 4; + const auto const_data = + std::map{{3, std::make_shared(element::u32, Shape{}, &axis)}}; + + input_shapes = ShapeVector{{5000, 256, 10, 15}, {30, 25, 3, 3}, {30, 25, 3, 3}, {}}; + OV_EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes, const_data), + AssertFailure, + HasSubstr("Parameter axis 4 out of the tensor rank range [-4, 3]")); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp index f37a64de77381e..d88c8a20f46bb9 100644 --- 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp index e6f41d87ecd937..8242f81777d091 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include - -#include "utils/shape_inference/static_shape.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp index 3bd3887fb99385..2662faae88f490 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp @@ -4,15 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference.cpp deleted file mode 100644 index c47c72db8ba6d5..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include 
-#include -#include - -#include "utils/shape_inference/static_shape.hpp" - -using namespace ov; -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, ShuffleChannelsTest) { - const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1}); - const auto axis = -1; - const auto group = 3; - const auto shuffle_channels = std::make_shared(data, axis, group); - - std::vector static_input_shapes = {StaticShape{5, 4, 9}}; - std::vector static_output_shapes = {StaticShape{}}; - shape_inference(shuffle_channels.get(), static_input_shapes, static_output_shapes); - - ASSERT_EQ(static_output_shapes[0], static_input_shapes[0]); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference_test.cpp new file mode 100644 index 00000000000000..f9b9b2fdd151bd --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference_test.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class ShuffleChannelsV0StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } +}; + +TEST_F(ShuffleChannelsV0StaticShapeInferenceTest, default_ctor) { + op = make_op(); + op->set_axis(-2); + op->set_group(2); + + input_shapes = {StaticShape{5, 4, 9}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], input_shapes[0]); +} + +TEST_F(ShuffleChannelsV0StaticShapeInferenceTest, correct_shape_infer) { + const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1}); + op = make_op(data, -1, 3); + + 
input_shapes = {StaticShape{5, 4, 9}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes[0], input_shapes[0]); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference.cpp deleted file mode 100644 index 36d0017af5cc18..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include -#include -#include - -using namespace ov; -using namespace ov::intel_cpu; - -static std::shared_ptr build_space_to_batch( - PartialShape data_shape = PartialShape::dynamic(ov::Rank(2)), - PartialShape block_shape = PartialShape::dynamic(), - PartialShape pads_begin_shape = PartialShape::dynamic(), - PartialShape pad_end_shape = PartialShape::dynamic()) { - auto data = std::make_shared(element::f32, data_shape); - auto block = std::make_shared(element::i32, block_shape); - auto pads_begin = std::make_shared(element::i32, pads_begin_shape); - auto pads_end = std::make_shared(element::i32, pad_end_shape); - - auto space_to_batch = std::make_shared(data, block, pads_begin, pads_end); - return space_to_batch; -} - -TEST(StaticShapeInferenceTest, SpaceToBatchTest) { - auto space_to_batch = build_space_to_batch(); - int32_t block_val[] = {1, 6, 5, 1, 16}; - int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; - int32_t pads_end_val[] = {0, 2, 1, 0, 0}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - auto pads_begin = std::make_shared(element::i32, ov::Shape{5}, pads_begin_val); - auto pads_end = std::make_shared(element::i32, ov::Shape{5}, pads_end_val); - - const std::vector input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - 
std::map> constant_data; - constant_data[1] = block; - constant_data[2] = pads_begin; - constant_data[3] = pads_end; - - shape_inference(space_to_batch.get(), input_shapes, output_shapes, constant_data); - ASSERT_EQ(output_shapes[0], (StaticShape{2 * 6 * 5 * 16, (32 + 2 + 2) / 6, (64 + 1) / 5, 128, 256 / 16})); -} - -TEST(StaticShapeInferenceTest, SpaceToBatchThrowExceptionWithoutHostTensorData) { - auto space_to_batch = build_space_to_batch(); - - std::map> constant_data; - const std::vector input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - EXPECT_THROW(shape_inference(space_to_batch.get(), input_shapes, output_shapes), NodeValidationFailure); -} - -TEST(StaticShapeInferenceTest, SpaceToBatchThrowExceptionWithMissingPadsHostTensorData) { - auto space_to_batch = build_space_to_batch(); - - int32_t block_val[] = {1, 6, 5, 1, 16}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - - std::map> constant_data; - constant_data[1] = block; - - const std::vector input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - EXPECT_THROW(shape_inference(space_to_batch.get(), input_shapes, output_shapes), NodeValidationFailure); -} - -TEST(StaticShapeInferenceTest, space_to_batch_output_with_const_inputs) { - auto data = std::make_shared(element::f32, ov::PartialShape{-1, -1, -1, -1}); - auto block_shape = std::make_shared(element::i64, ov::Shape{4}, std::vector{1, 12, 100, 2}); - auto pads_begin = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 3, 38, 1}); - auto pads_end = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 5, 38, 0}); - const auto space_to_batch = std::make_shared(data, block_shape, pads_begin, pads_end); - std::vector input_shapes = {{2, 100, 1024, 3}, {4}, {4}, {4}}; - std::vector output_shapes = {{}}; - shape_inference(space_to_batch.get(), input_shapes, output_shapes); - - ASSERT_EQ(output_shapes[0], (StaticShape{2 * 
12 * 100 * 2, (100 + 3 + 5) / 12, (1024 + 38 + 38) / 100, (3 + 1) / 2})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference_test.cpp new file mode 100644 index 00000000000000..fd6969e0622983 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference_test.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class SpaceToBatchV1StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } + + std::shared_ptr make_space_to_batch_dynamic() { + const auto data = std::make_shared(element::f32, PartialShape::dynamic()); + const auto block = std::make_shared(element::i32, PartialShape::dynamic()); + const auto pads_begin = std::make_shared(element::i32, PartialShape::dynamic()); + const auto pads_end = std::make_shared(element::i32, PartialShape::dynamic()); + + return make_op(data, block, pads_begin, pads_end); + } +}; + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; + int32_t pads_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, pads_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, pads_end_val)}}; + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], 
(StaticShape{2 * 6 * 5 * 16, (32 + 2 + 2) / 6, (64 + 1) / 5, 128, 256 / 16})); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, blocks_pads_as_constants) { + const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + const auto block_shape = std::make_shared(element::i64, Shape{4}, std::vector{1, 12, 100, 2}); + const auto pads_begin = std::make_shared(element::i64, Shape{4}, std::vector{0, 3, 38, 1}); + const auto pads_end = std::make_shared(element::i64, Shape{4}, std::vector{0, 5, 38, 0}); + + const auto op = make_op(data, block_shape, pads_begin, pads_end); + + input_shapes = {{2, 100, 1024, 3}, {4}, {4}, {4}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes[0], + (StaticShape{2 * 12 * 100 * 2, (100 + 3 + 5) / 12, (1024 + 38 + 38) / 100, (3 + 1) / 2})); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, blocks_pads_in_constant_map) { + const auto op = make_space_to_batch_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; + int32_t pads_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, pads_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, pads_end_val)}}; + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{2 * 6 * 5 * 16, (32 + 2 + 2) / 6, (64 + 1) / 5, 128, 256 / 16})); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, throw_no_data_const_map) { + const auto op = make_space_to_batch_dynamic(); + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes), NodeValidationFailure); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, exception_missing_pads_data_in_const_map) { + const 
auto op = make_space_to_batch_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}}; + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + + EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes), NodeValidationFailure); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference.cpp deleted file mode 100644 index 1466e73b34f9cc..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include - -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, SpaceToDepthTest) { - auto A = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(ov::Rank(4))); - auto space_to_depth = - std::make_shared(A, ov::op::v0::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST, 2); - const std::vector input_shapes = {StaticShape{1, 12, 4, 1080, 1616}}; - std::vector output_shapes = {StaticShape{}}; - shape_inference(space_to_depth.get(), input_shapes, output_shapes); - ASSERT_EQ(output_shapes[0], (StaticShape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference_test.cpp new file mode 100644 index 00000000000000..da8851751ee92c --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference_test.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using 
namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class SpaceToDepthV0StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } +}; + +TEST_F(SpaceToDepthV0StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + op->set_block_size(2); + + input_shapes = {StaticShape{1, 12, 4, 1080, 1616}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); +} + +TEST_F(SpaceToDepthV0StaticShapeInferenceTest, depth_first_block_2) { + const auto data = std::make_shared(element::f32, PartialShape::dynamic(4)); + const auto op = make_op(data, op_type::SpaceToDepthMode::DEPTH_FIRST, 2); + + input_shapes = {StaticShape{1, 12, 4, 1080, 1616}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp index 85ead85909447d..546ffd7a9c1302 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp @@ -13,6 +13,19 @@ #pragma once +namespace ov { +namespace intel_cpu { +template +void shape_inference(ov::Node* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + const auto shape_infer = make_shape_inference(op->shared_from_this()); + output_shapes = shape_infer->infer(input_shapes, constant_data); +} +} // namespace intel_cpu +} // namespace ov + struct TestTensor { std::shared_ptr tensor; ov::intel_cpu::StaticShape static_shape; @@ -90,6 +103,8 @@ using ShapeVector = std::vector; template class 
OpStaticShapeInferenceTest : public testing::Test { protected: + using op_type = TOp; + ShapeVector input_shapes, output_shapes; ov::intel_cpu::StaticShape exp_shape; std::shared_ptr op; diff --git a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt index b9875eb09b8f2c..b2494db8986396 100644 --- a/src/plugins/intel_cpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_cpu/thirdparty/CMakeLists.txt @@ -21,7 +21,7 @@ function(ie_add_onednn) set(DNNL_ENABLE_ITT_TASKS OFF CACHE BOOL "" FORCE) endif() set(DNNL_ENABLE_CONCURRENT_EXEC ON CACHE BOOL "" FORCE) - set(DNNL_ENABLE_PRIMITIVE_CACHE OFF CACHE BOOL "" FORCE) ## TODO: try it later + set(DNNL_ENABLE_PRIMITIVE_CACHE ON CACHE BOOL "" FORCE) # Enable primitive cache for global sharing set(DNNL_ENABLE_MAX_CPU_ISA ON CACHE BOOL "" FORCE) set(DNNL_LIBRARY_TYPE "STATIC" CACHE STRING "" FORCE) set(DNNL_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) diff --git a/src/plugins/intel_gna/CMakeLists.txt b/src/plugins/intel_gna/CMakeLists.txt index f6b358a2f13574..08d32a4771cca5 100644 --- a/src/plugins/intel_gna/CMakeLists.txt +++ b/src/plugins/intel_gna/CMakeLists.txt @@ -71,6 +71,7 @@ target_compile_definitions(${TARGET_NAME} _NO_MKL_ ) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) # @@ -139,5 +140,3 @@ if(NOT BUILD_SHARED_LIBS) endif() add_subdirectory(tests) - - diff --git a/src/plugins/intel_gna/cmake/libGNAConfig.cmake b/src/plugins/intel_gna/cmake/libGNAConfig.cmake index 2daac435defcae..0b53d5be514eeb 100644 --- a/src/plugins/intel_gna/cmake/libGNAConfig.cmake +++ b/src/plugins/intel_gna/cmake/libGNAConfig.cmake @@ -2,8 +2,25 @@ # SPDX-License-Identifier: Apache-2.0 # -# module to locate GNA libraries - +# +# The module defines several imported targets: +# +# - (Optional) libGNA::API +# - (Optional) libGNA::KERNEL +# +# And high-level imported interface target: +# +# - libGNA +# +# And the following variables: +# +# - 
libGNA_API_FOUND +# - libGNA_KERNEL_FOUND +# +# The example usage: +# +# find_package(libGNA COMPONENTS API KERNEL) +# set(libGNA_FOUND TRUE) @@ -27,7 +44,17 @@ if(libGNA_FIND_REQUIRED_KERNEL) if(GNA_KERNEL_LIBRARY) add_library(libGNA::KERNEL SHARED IMPORTED) - set_target_properties(libGNA::KERNEL PROPERTIES IMPORTED_LOCATION ${GNA_KERNEL_LIBRARY}) + set_property(TARGET libGNA::KERNEL APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) + if(WIN32) + set(gna_dll "${CMAKE_SHARED_LIBRARY_PREFIX}${GNA_KERNEL_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}") + set_target_properties(libGNA::KERNEL PROPERTIES + IMPORTED_LOCATION_RELEASE "${libGNA_LIBRARIES_BASE_PATH}/${gna_dll}" + IMPORTED_IMPLIB_RELEASE "${GNA_KERNEL_LIBRARY}") + else() + set_target_properties(libGNA::KERNEL PROPERTIES + IMPORTED_LOCATION_RELEASE "${GNA_KERNEL_LIBRARY}" + INTERFACE_LINK_OPTIONS "-Wl,-rpath-link,${libGNA_LIBRARIES_BASE_PATH}") + endif() set(libGNA_KERNEL_FOUND TRUE) else() message(SEND_ERROR "GNA KERNEL library (${GNA_KERNEL_LIB_NAME}) was not found in ${libGNA_LIBRARIES_BASE_PATH}") @@ -40,7 +67,7 @@ if(libGNA_FIND_REQUIRED_API) NO_CMAKE_FIND_ROOT_PATH) if(libGNA_INCLUDE_DIRS) add_library(libGNA::API INTERFACE IMPORTED) - set_target_properties(libGNA::API PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${libGNA_INCLUDE_DIRS}) + set_target_properties(libGNA::API PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${libGNA_INCLUDE_DIRS}") set(libGNA_API_FOUND TRUE) else() message(SEND_ERROR "GNA API headers (gna2-api.h) was not found in ${GNA_EXT_DIR}/include") @@ -48,15 +75,6 @@ if(libGNA_FIND_REQUIRED_API) endif() add_library(libGNA INTERFACE IMPORTED) -foreach(_lib_name ${libGNA_FIND_COMPONENTS}) +foreach(_lib_name IN LISTS libGNA_FIND_COMPONENTS) set_property(TARGET libGNA APPEND PROPERTY INTERFACE_LINK_LIBRARIES libGNA::${_lib_name}) endforeach(_lib_name) - -if (WIN32) - if(libGNA_FIND_REQUIRED_KERNEL) - set_target_properties(libGNA::KERNEL PROPERTIES - IMPORTED_IMPLIB ${GNA_KERNEL_LIBRARY}) - endif() -else() - 
set_target_properties(libGNA PROPERTIES INTERFACE_LINK_OPTIONS "-Wl,-rpath-link,${libGNA_LIBRARIES_BASE_PATH}") -endif () diff --git a/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp b/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp index a8a1fbad30bfbb..4a9495b7b274a2 100644 --- a/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp +++ b/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp @@ -57,7 +57,7 @@ bool IfNodeHasGatherInputs(const Output& output) { namespace { bool HasDynamicRankInput(NodePtr node) { - for (auto& input_node : node->input_values()) { + for (const auto& input_node : node->input_values()) { const Rank output_rank = input_node.get_partial_shape().rank(); if (output_rank.is_dynamic()) return true; @@ -148,7 +148,7 @@ bool CanPropagateGatherForwardThrough(Node* node) { #undef CHECK_GATHER_SINKING_SUPPORTED bool CanGatherPropagateForward(NodePtr node) { - for (auto output : node->outputs()) { + for (const auto& output : node->outputs()) { for (auto& consumer_input : output.get_target_inputs()) { if (!CanPropagateGatherForwardThrough(consumer_input.get_node())) return false; @@ -209,7 +209,7 @@ GatherInfo GetGatherInfo(Node* node) { } Node* FindFirstConsumer(NodePtr node) { - for (auto output : node->outputs()) { + for (const auto& output : node->outputs()) { auto inputs = output.get_target_inputs(); if (inputs.empty()) continue; diff --git a/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt b/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt index 2f13e3123c0cb1..303537eb48e8e7 100644 --- a/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt +++ b/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt @@ -37,8 +37,6 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/" target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime::dev 
inference_engine_legacy openvino::pugixml openvino::itt) -ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) - if(WIN32) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) endif() @@ -47,6 +45,9 @@ if(BUILD_SHARED_LIBS) target_link_libraries(${TARGET_NAME} PRIVATE inference_engine) endif() +# must be called after all target_link_libraries +ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + # code style add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME}) diff --git a/src/plugins/intel_gna/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp b/src/plugins/intel_gna/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp index 157415305ad1c6..88dc8b4eddd196 100644 --- a/src/plugins/intel_gna/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp +++ b/src/plugins/intel_gna/tests/functional/shared_tests_instances/behavior/infer_request/memory_states.cpp @@ -12,39 +12,10 @@ using namespace BehaviorTestsDefinitions; namespace { -InferenceEngine::CNNNetwork getNetwork() { - ngraph::Shape shape = {1, 200}; - ngraph::element::Type type = ngraph::element::f32; - - auto input = std::make_shared(type, shape); - auto mem_i1 = std::make_shared(type, shape, 0); - auto mem_r1 = std::make_shared(mem_i1, "r_1-3"); - auto mul1 = std::make_shared(mem_r1, input); - - auto mem_i2 = std::make_shared(type, shape, 0); - auto mem_r2 = std::make_shared(mem_i2, "c_1-3"); - auto mul2 = std::make_shared(mem_r2, mul1); - auto mem_w2 = std::make_shared(mul2, "c_1-3"); - - auto mem_w1 = std::make_shared(mul2, "r_1-3"); - auto sigm = std::make_shared(mul2); - - sigm->set_friendly_name("sigmod_state"); - mem_r1->set_friendly_name("Memory_1"); - mem_w1->add_control_dependency(mem_r1); - sigm->add_control_dependency(mem_w1); - - mem_r2->set_friendly_name("Memory_2"); - mem_w2->add_control_dependency(mem_r2); - sigm->add_control_dependency(mem_w2); 
- - auto function = - std::make_shared(ngraph::NodeVector{sigm}, ngraph::ParameterVector{input}, "addOutput"); - return InferenceEngine::CNNNetwork{function}; -} - -std::vector memoryStateTestCases = { - memoryStateParams(getNetwork(), {"c_1-3", "r_1-3"}, CommonTestUtils::DEVICE_GNA, {})}; +std::vector memoryStateTestCases = {memoryStateParams(InferRequestVariableStateTest::getNetwork(), + {"c_1-3", "r_1-3"}, + CommonTestUtils::DEVICE_GNA, + {})}; INSTANTIATE_TEST_SUITE_P(smoke_VariableStateBasic, InferRequestVariableStateTest, diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index ab4d7618c30a25..306b56987d70c8 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -70,4 +70,5 @@ if(ENABLE_TESTS) endif() # Failed because of OpenCL +# must be called after all target_link_libraries # ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 0fef9af07ec39b..008b174b644f94 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -231,16 +231,8 @@ struct network { /// Returns memory state @p variable_id of stateful network VariableState& get_variable_memory(const std::string &variable_id); - /// Return kernels_cache - kernels_cache& get_kernels_cache() const { return *_kernels_cache; } - - /// Return implentations_cache - ImplementationsCache& get_implementations_cache() const { return *_impls_cache; } - /// Return in_mem_kernels_cache KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; } - - ICompilationContext& get_compilation_context() const { return *_compilation_context; } std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; } const ExecutionConfig& get_config() const { return _config; } @@ -267,12 +259,13 @@ struct network 
{ std::list> _data_outputs; variables_states_map _variables_states; std::vector> _variable_state_primitives; + program::primitives_info _prims_info; + std::map _ext_id_mapping; std::unordered_map _events; output_chains_map _output_chains; mutable std::mutex _in_mem_cache_mutex; - std::unique_ptr _compilation_context; void build_exec_order(); void allocate_primitive_instance(program_node const& node); @@ -284,11 +277,8 @@ struct network { void add_default_output_chains(); output_chains_map::iterator add_output_chain(std::shared_ptr& p_inst); - std::unique_ptr _kernels_cache; // Move from cldnn::program to cldnn::network for multi-threads issue. - std::unique_ptr _impls_cache; std::unique_ptr _in_mem_kernels_cache; - const size_t _impls_cache_capacity = 10000; const size_t _in_mem_kernels_cache_capacity = 10000; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index d764f1de5e7adf..2c21bc1694daa4 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -26,6 +26,7 @@ class pass_manager; class base_pass; class program_wrapper; class kernels_cache; +class ICompilationContext; struct program { @@ -252,6 +253,10 @@ struct program { void query_local_block_io_supported(); void calc_nodes_hash(); + ImplementationsCache& get_implementations_cache() const { return *_impls_cache; } + ICompilationContext& get_compilation_context() const { return *_compilation_context; } + void cancel_compilation_context(); + private: uint32_t prog_id = 0; engine& _engine; @@ -266,6 +271,9 @@ struct program { std::unique_ptr pm; bool is_body_program; int8_t is_subgroup_local_block_io_supported; + std::unique_ptr _impls_cache; + const size_t _impls_cache_capacity = 10000; + std::unique_ptr _compilation_context; std::map> nodes_map; std::list optimized_out; @@ -305,7 +313,9 @@ struct program { void cleanup(); void 
transfer_memory_to_device(); + InferenceEngine::CPUStreamsExecutor::Config make_task_executor_config(const ExecutionConfig& config, std::string tags = "") const; std::shared_ptr make_task_executor(const ExecutionConfig& config) const; + /* ** Analysis functions */ @@ -343,6 +353,8 @@ struct program { // old_node - node which will be replaced // new_node - node which will replace the old one void replace(program_node& old_node, program_node& new_node); + + void init_program(); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp index 7880b79a85eb4d..182865306e4611 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp @@ -14,7 +14,8 @@ namespace cldnn { class BinaryOutputBuffer : public OutputBuffer { public: - BinaryOutputBuffer(std::ostream& stream) : OutputBuffer(this), stream(stream) {} + BinaryOutputBuffer(std::ostream& stream) + : OutputBuffer(this), stream(stream), _impl_params(nullptr) {} void write(void const * data, std::streamsize size) { auto const written_size = stream.rdbuf()->sputn(reinterpret_cast(data), size); @@ -32,7 +33,8 @@ class BinaryOutputBuffer : public OutputBuffer { class BinaryInputBuffer : public InputBuffer { public: - BinaryInputBuffer(std::istream& stream, engine& engine) : InputBuffer(this, engine), stream(stream) {} + BinaryInputBuffer(std::istream& stream, engine& engine) + : InputBuffer(this, engine), stream(stream), _impl_params(nullptr), _network(nullptr) {} void read(void* const data, std::streamsize size) { auto const read_size = stream.rdbuf()->sgetn(reinterpret_cast(data), size); @@ -42,6 +44,8 @@ class BinaryInputBuffer : public InputBuffer { void setKernlImplParams(void* impl_params) { _impl_params = impl_params; } void* getKernlImplParams() const { return 
_impl_params; } + void setNetwork(void* network) { _network = network; } + void* getNetwork() const { return _network; } std::streampos tellg() { return stream.tellg(); } void seekg(std::streampos pos) { stream.seekg(pos); } @@ -49,6 +53,7 @@ class BinaryInputBuffer : public InputBuffer { private: std::istream& stream; void* _impl_params; + void* _network; }; template diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp index 979e203cab51dd..20fb79db8664ae 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp @@ -46,9 +46,6 @@ class CompiledModel : public InferenceEngine::ExecutableNetworkThreadSafeDefault ExecutionConfig m_config; InferenceEngine::ITaskExecutor::Ptr m_taskExecutor; InferenceEngine::ITaskExecutor::Ptr m_waitExecutor; - -private: - bool is_serializable(); }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp index 007e55e7fb3f6c..9e0f8941527139 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp @@ -18,7 +18,7 @@ namespace intel_gpu { class Plugin : public InferenceEngine::IInferencePlugin { struct impl; std::shared_ptr _impl; - bool isModelCachingEnabled = false; + bool isModelCachingEnabled = true; std::string default_device_id = "0"; // key: device_id, value: cldnn device diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp index 6d354cd8d8f8f6..81e0dbcf774ee7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include "kernel.hpp" @@ 
-30,15 +32,15 @@ class LruCache { } /** - * @brief Get the least recently used element object in the cache + * @brief Get the least recently used element with key and value pair in the cache * * @return Value */ - Value get_lru_element() const { + std::pair get_lru_element() const { if (_lru_data_list.size()) { - return _lru_data_list.back().second; + return _lru_data_list.back(); } else { - return Value(); + return std::make_pair(Key(), Value()); } } @@ -164,6 +166,46 @@ class LruCache { } }; -using ImplementationsCache = cldnn::LruCache>; using KernelsCache = cldnn::LruCache; + +template +class LruCacheThreadSafe : LruCache { +public: + using parent = LruCache; + using FuncRemoveItem = std::function&)>; + + explicit LruCacheThreadSafe(size_t caps) : parent(caps) { } + + bool add(const Key& key, const Value& value) { + std::lock_guard lock(_mutex); + auto popped_item = parent::get_lru_element(); + auto ret = parent::add(key, value); + if (ret && _remove_popped_item) { + _remove_popped_item(popped_item); + } + return ret; + } + + bool has(const Key& key) const { + std::lock_guard lock(_mutex); + return parent::has(key); + } + + Value get(const Key& key) { + std::lock_guard lock(_mutex); + return parent::get(key); + } + + void set_remove_item_callback(FuncRemoveItem callback) { + _remove_popped_item = callback; + } + +private: + FuncRemoveItem _remove_popped_item; + mutable std::mutex _mutex; +}; + + +using ImplementationsCache = cldnn::LruCacheThreadSafe>; + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/compilation_context.cpp b/src/plugins/intel_gpu/src/graph/compilation_context.cpp index 2aa02dbda0ebc9..75c6b3a65b8b12 100644 --- a/src/plugins/intel_gpu/src/graph/compilation_context.cpp +++ b/src/plugins/intel_gpu/src/graph/compilation_context.cpp @@ -1,91 +1,76 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2022-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // + #include "compilation_context.hpp" -#include 
"threading/ie_thread_safe_containers.hpp" -#include "kernel_selector/kernel_base.h" +#include +#include +#include +#include "intel_gpu/runtime/utils.hpp" namespace cldnn { -class CompilationTaskQueue { - using CompilationTaskData = std::pair; - +class CompilationContext : public ICompilationContext { public: - void push_task(size_t task_key, ICompilationContext::Task&& task) { - std::lock_guard lock(_mutex); - if (_queue_keymap.find(task_key) == _queue_keymap.end()) { - auto insert_it = _queue.insert(_queue.end(), {task_key, task}); - _queue_keymap.insert({task_key, insert_it}); - } + CompilationContext(InferenceEngine::CPUStreamsExecutor::Config task_executor_config) : _task_executor_config(task_executor_config) { + _task_executor_config._streams = 4; + _task_executor = std::make_shared(_task_executor_config); } - bool pop_front_task(size_t& task_key, ICompilationContext::Task& task) { + void push_task(size_t key, Task&& task) override { + if (_stop_compilation) + return; + std::lock_guard lock(_mutex); - if (!_queue.empty()) { - auto front = _queue.front(); - task = front.second; - task_key = front.first; - _queue.pop_front(); - return true; + if (_task_keys.find(key) == _task_keys.end()) { + _task_keys.insert(key); + if (_task_executor != nullptr) + _task_executor->run(task); } - return false; } - void erase_task_key(size_t removed_key) { + void remove_keys(std::vector&& keys) override { std::lock_guard lock(_mutex); - if (_queue_keymap.find(removed_key) != _queue_keymap.end()) { - _queue_keymap.erase(removed_key); + if (!_task_keys.empty()) { + for (auto key : keys) { + if (_task_keys.find(key) != _task_keys.end()) { + _task_keys.erase(key); + } + } } } -private: - std::deque _queue; - std::unordered_map::iterator> _queue_keymap; - std::mutex _mutex; -}; - -class CompilationContext : public ICompilationContext { -public: - CompilationContext(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id) { - _kernels_cache = cldnn::make_unique(engine, 
config, program_id, nullptr, kernel_selector::KernelBase::get_db().get_batch_header_str()); - _worker = std::thread([this](){ - while (!_stop_compilation) { - CompilationContext::Task task; - size_t task_key; - bool success = _queue.pop_front_task(task_key, task); - if (success) { - task(*_kernels_cache); - _queue.erase_task_key(task_key); - } else { - std::chrono::milliseconds ms{1}; - std::this_thread::sleep_for(ms); - } - } - }); + ~CompilationContext() noexcept { + cancel(); } - void push_task(size_t key, ICompilationContext::Task&& task) override { - _queue.push_task(key, std::move(task)); + bool is_stopped() override { + return _stop_compilation; } void cancel() noexcept override { + if (_stop_compilation) + return; + _stop_compilation = true; - if (_worker.joinable()) - _worker.join(); + { + std::lock_guard lock(_mutex); + if (_task_executor != nullptr) + _task_executor.reset(); + _task_keys.clear(); + } } - ~CompilationContext() noexcept { cancel(); } - private: - std::unique_ptr _kernels_cache; - std::thread _worker; + InferenceEngine::CPUStreamsExecutor::Config _task_executor_config; + InferenceEngine::CPUStreamsExecutor::Ptr _task_executor; + std::mutex _mutex; + std::unordered_set _task_keys; std::atomic_bool _stop_compilation{false}; - - CompilationTaskQueue _queue; }; -std::unique_ptr ICompilationContext::create(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id) { - return cldnn::make_unique(engine, config, program_id); +std::unique_ptr ICompilationContext::create(InferenceEngine::CPUStreamsExecutor::Config task_executor_config) { + return cldnn::make_unique(task_executor_config); } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/data.cpp b/src/plugins/intel_gpu/src/graph/data.cpp index 10be2a3504e81a..16e0edb6d2d033 100644 --- a/src/plugins/intel_gpu/src/graph/data.cpp +++ b/src/plugins/intel_gpu/src/graph/data.cpp @@ -85,15 +85,24 @@ void data_inst::load(BinaryInputBuffer& ib) { size_t data_size; ib >> 
make_data(&data_size, sizeof(size_t)); - _outputs[0] = get_network().get_memory_pool().get_memory(output_layout, _allocation_type, false); - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { - ib >> make_data(_outputs[0]->buffer_ptr(), data_size); + if (ib.getNetwork()) { + const network* primary_network = reinterpret_cast(ib.getNetwork()); + _outputs[0] = primary_network->get_primitive(id())->output_memory_ptr(); + auto pos = ib.tellg(); + pos += data_size; + ib.seekg(pos); } else { - std::vector _buf; - _buf.resize(data_size); - ib >> make_data(_buf.data(), data_size); - _outputs[0]->copy_from(get_network().get_stream(), _buf.data()); + _outputs[0] = get_network().get_memory_pool().get_memory(output_layout, _allocation_type, false); + + if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { + ib >> make_data(_outputs[0]->buffer_ptr(), data_size); + } else { + std::vector _buf; + _buf.resize(data_size); + ib >> make_data(_buf.data(), data_size); + _outputs[0]->copy_from(get_network().get_stream(), _buf.data()); + } } } diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index fc7bf008f372b1..d00a36c676b3b9 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -39,8 +39,9 @@ bool is_batch_after_spatial(const std::string order) { } format::type get_preferred_format(fully_connected_node const& node, const kernel_impl_params& impl_param) { - if (node.get_preferred_impl_type() == impl_types::onednn) - return format::bfyx; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + return node.get_preferred_output_fmt(); + } auto input_layout = impl_param.get_input_layout(); diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index 
6d2cd3d76f6c83..b15be72fc4d3d4 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -78,6 +78,10 @@ layout gemm_inst::calc_output_layout(gemm_node const& node, kernel_impl_params c auto output_format = input0_layout.format; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + output_format = node.get_preferred_output_fmt(); + } + return layout(output_shape, output_type, output_format, prim->output_paddings[0]); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index 3b563a4af907af..66af672ed97ffa 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -54,7 +54,7 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { // Don't run impl selection to avoid double compilation of reorder kernels // in main program and internal program for constant propagation - if (!g_node.is_constant()) { + if ((!g_node.is_constant()) && (!reorder.second)) { g_node.set_selected_impl(g_node.type()->choose_impl(g_node)); if (auto impl = g_node.get_selected_impl()) { auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source()); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 06d2b2852d666c..2bfb2e9bcb2f7d 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -169,6 +169,18 @@ void remove_redundant_reorders::run(program& p) { !r_node.get_primitive()->has_surface_input(); if (remove_dep) { + // for chains like + // b_fs_yx_fsv16 -> reorder(ofmt:bfyx) -> bfyx -> 
reorder(ofmt:any) -> bfyx + // if output_format of current node is format::any, input format of the dependency node is propagated as it is + // b_fs_yx_fsv16 -> reorder(ofmt:any) -> b_fs_yx_fsv16 + // so output format of dependency node must be stored in output_format of current node + // b_fs_yx_fsv16 -> reorder(ofmt:bfyx) -> bfyx + auto output_layout = r_dep_node.get_output_layout(); + auto prim = std::const_pointer_cast(r_node.get_primitive()); + if (prim->output_format == format::any) + prim->output_format = output_layout.format; + + LOG_NODE_REMOVAL(r_dep_node.id()); r_dep_node.can_be_optimized(true); p.add_optimized_primitive_info(r_dep_node.id()); p.extract_and_remove(r_dep_node); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp index 5f3f741d7cb115..c0d5a734e2ffa6 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp @@ -5,6 +5,7 @@ #include "pass_manager.h" #include "data_inst.h" #include "mutable_data_inst.h" +#include "gemm_inst.h" #include "program_node.h" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -44,6 +45,8 @@ void select_preferred_formats::run(program& p) { dnnl::primitive_attr(), dnnl::memory::format_tag::any); _lo.select_preferred_formats_for_onednn(*n, *prim_desc); + } else if (n->is_type() || n->is_type()) { + _lo.select_preferred_formats_for_onednn(*n); } } catch(std::exception &exception) { GPU_DEBUG_INFO << "WARNING(select_preferred_formats): " << exception.what() << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp index 7131f2e8a4dfc1..acf400f3b7e9b1 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp @@ -26,19 +26,60 @@ 
struct crop_impl : typed_primitive_impl_ocl { } public: - static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { - auto params = get_default_params(impl_param); + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); auto optional_params = get_default_optional_params(impl_param.get_program()); params.operations.push_back({{kernel_selector::eltwise_params::InputType::Buffer(0)}, kernel_selector::eltwise_mode::ASSIGN}); - params.inputs[0] = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]); + if (impl_param.get_program().get_node(primitive->id).is_dynamic()) { + // WA to always match compiled dynamic kernel with dispatch data + // W/O enforcing this option we may generate kernel for "broadcast" scneario due to umatched tensor dimensions + // but in runtime dispatch data will be generated for non-broadcast case as shapes are actually same. 
+ params.broadcast = true; + } else { + params.inputs[0] = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]); + } return {params, optional_params}; } + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param, true); + auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset(); + kernel_selector::ScalarDescriptor s; + s.t = kernel_selector::ScalarDescriptor::Types::UINT32; + s.v.u32 = runtime_offset; + OPENVINO_ASSERT(_kernel_data.kernels[0].params.scalars.size() == 1, + "[GPU] Scalar field for runtime offset is not added for crop shape agnostic impl"); + _kernel_data.kernels[0].params.scalars[0] = s; + (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); + update_kernels_list_to_skip(); + } }; namespace detail { attach_crop_impl::attach_crop_impl() { + auto dyn_types = { + data_types::f32, + data_types::f16, + data_types::i8, + data_types::u8, + data_types::i32, + data_types::i64 + }; + + auto dyn_formats = { + format::bfyx, + format::bfzyx, + format::bfwzyx + }; + + implementation_map::add(impl_types::ocl, + shape_types::dynamic_shape, + typed_primitive_impl_ocl::create, + dyn_types, + dyn_formats); + implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, { std::make_tuple(data_types::f32, format::yxfb), std::make_tuple(data_types::f16, format::yxfb), diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp index 7ee17e191b2e6d..907c8ecb38e073 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_tree.cpp @@ -39,7 +39,12 @@ struct gather_tree_impl : typed_primitive_impl_ocl { namespace detail { attach_gather_tree_impl::attach_gather_tree_impl() { - auto types = {data_types::i32, data_types::f32}; + auto types = { 
+ data_types::f32, + data_types::f16, + data_types::i32 + }; + auto formats = { format::yxfb, format::bfyx, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 90010f902e694e..10c1ee1cf40b3f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -276,6 +276,19 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { (std::accumulate(gws.begin(), gws.end(), 1, std::multiplies()) == 0); } } + + void set_kernels(std::map& kernels) override { + if (is_cpu()) + return; + + _kernel_ids.clear(); + _kernels.clear(); + _kernels.reserve(kernels.size()); + for (auto& k : kernels) { + _kernel_ids.push_back(k.first); + _kernels.emplace_back(std::move(k.second)); + } + } }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index 4e404518d6659d..5e8c03dd0c67fd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -324,10 +324,8 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { void build_primitive(const ExecutionConfig& config) { auto cache_outpath = get_cache_directory(config); - if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) { - if (env_p[0] == '1') { - cache_outpath = ""; - } + if (!config.get_property(ov::intel_gpu::allow_new_shape_infer)) { + cache_outpath = ""; } if (cache_outpath.empty()) { diff --git a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp b/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp index abb686ac2d9874..f26aa904004630 100644 --- a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp +++ b/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp @@ -1,10 +1,10 @@ 
-// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2022-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include "kernels_cache.hpp" +#include #include #include @@ -12,12 +12,14 @@ namespace cldnn { class ICompilationContext { public: - using Task = std::function; + using Task = std::function; virtual void push_task(size_t key, Task&& task) = 0; - virtual void cancel() noexcept = 0; + virtual void remove_keys(std::vector&& keys) = 0; virtual ~ICompilationContext() = default; + virtual bool is_stopped() = 0; + virtual void cancel() = 0; - static std::unique_ptr create(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id); + static std::unique_ptr create(InferenceEngine::CPUStreamsExecutor::Config task_executor_config); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index edf78887cc33f3..21fd41a59c8863 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -210,7 +210,7 @@ class layout_optimizer { bool should_select_b_fs_yx_fsv16_layout(convolution_node const& node, layout const& output_or_weights_layout); #ifdef ENABLE_ONEDNN_FOR_GPU - void select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc); + void select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc = dnnl::primitive_desc()); #endif }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index f30c1db46dd601..c21ad7d3906b45 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -63,7 +63,6 @@ struct primitive_impl { } virtual std::vector> get_kernels_source() { return {}; } virtual void reset_kernels_source() {} - virtual 
void set_kernels(std::vector) {} virtual std::vector get_kernels() const { return {}; } virtual void set_kernel_ids(std::vector kernel_ids) {} virtual void save(cldnn::BinaryOutputBuffer& ob) const {} @@ -80,6 +79,8 @@ struct primitive_impl { OPENVINO_ASSERT(false, "[GPU] update_dispatch_data is not implemented for dynamic implemenation ", _kernel_name); } + virtual void set_kernels(std::map& kernels) {} + protected: std::string _kernel_name; bool _is_dynamic = false; diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index e9537f6869e707..d3b5baed5e928a 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1693,7 +1693,7 @@ format layout_optimizer::get_preferred_format(program_node& node) { } // In case of input -> ... -> quantize -> concat - if (expected == format::any + if (layout.is_static() && expected == format::any && (node.get_users().size() == 1 && node.get_users().front()->is_type()) && (layout.batch() < 4 && layout.feature() < 4)) { expected = format::get_default_format(layout.get_rank(), false, false); @@ -1757,6 +1757,10 @@ format layout_optimizer::get_preferred_format(program_node& node) { // Set default format for issue 92967/98750 // TODO: will remove when arg_max_min_ref supports blocked format expected = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false); + } else if (node.is_type() || node.is_type()) { + if (use_onednn_impls) { + expected = node.get_preferred_output_fmt(); + } } if (allow_new_shape_infer && node.get_preferred_input_fmt() != format::any) { @@ -1862,6 +1866,19 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt) << " For index : " << idx << std::endl; } + } else if (node.is_type() || node.is_type()) { + for 
(size_t idx = 0 ; idx < node.get_dependencies().size() ; idx++) { + if (node.get_dependency(idx).is_constant()) + continue; + node.set_preferred_input_fmt(idx, cldnn::format::bfyx); + + if (node.get_preferred_output_fmt() == format::any) { + for (size_t usr = 0; usr < std::max(1, node.get_users().size()); usr++) + node.set_preferred_output_fmt(usr, cldnn::format::bfyx); + } + GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(cldnn::format::bfyx) << " --> " << fmt_to_str(cldnn::format::bfyx) + << " For index : " << idx << std::endl; + } } return; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index eff3f4d80b1ed4..bf30cb3bc7a59b 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -331,14 +331,7 @@ network::network(program::ptr program, const ExecutionConfig& config, stream::pt if (is_dynamic()) { GPU_DEBUG_DEFINE_MEM_LOGGER("dynamic_network_initialization"); - _kernels_cache = std::unique_ptr(new kernels_cache(program->get_engine(), - program->get_config(), - program->get_id(), - program->get_task_executor(), - kernel_selector::KernelBase::get_db().get_batch_header_str())); - _impls_cache = std::unique_ptr(new ImplementationsCache(_impls_cache_capacity)); _in_mem_kernels_cache = std::unique_ptr(new KernelsCache(_in_mem_kernels_cache_capacity)); - _compilation_context = ICompilationContext::create(program->get_engine(), program->get_config(), program->get_id()); } } @@ -468,11 +461,45 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st } add_default_output_chains(); + + size_t prims_info_size; + ib >> prims_info_size; + + for (size_t i = 0; i < prims_info_size; i++) { + primitive_id original_id; + std::string type_id; + primitive::primitive_id_arr c_dependencies; + primitive::primitive_id_arr c_users; + primitive::primitive_id_arr c_fused_ids; + layout output_layout; + std::string layout_str; + 
std::string kernel_id; + data_types runtime_precision; + bool is_cpu; + int exec_id; + + ib >> original_id; + ib >> type_id; + ib >> c_dependencies; + ib >> c_users; + ib >> c_fused_ids; + ib >> output_layout; + ib >> layout_str; + ib >> kernel_id; + ib >> make_data(&runtime_precision, sizeof(data_types)); + ib >> is_cpu; + ib >> exec_id; + primitive_info prim_info(original_id, type_id, c_dependencies, c_users, c_fused_ids, + output_layout, layout_str, kernel_id, runtime_precision, is_cpu, exec_id); + _prims_info.emplace_back(prim_info); + } + + ib >> _ext_id_mapping; } network::~network() { - if (_compilation_context) - _compilation_context->cancel(); + if (_program != nullptr) + _program->cancel_compilation_context(); _memory_pool->clear_pool_for_network(net_id); GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) { @@ -551,6 +578,24 @@ void network::save(cldnn::BinaryOutputBuffer& ob) { for (const auto& p_inst : _variable_state_primitives) { ob << p_inst->id(); } + + auto& prims_info = get_primitives_info(); + ob << prims_info.size(); + for (auto& prim_info : prims_info) { + ob << prim_info.original_id; + ob << prim_info.type_id; + ob << prim_info.c_dependencies; + ob << prim_info.c_users; + ob << prim_info.c_fused_ids; + ob << prim_info.output_layout; + ob << prim_info.layout_str; + ob << prim_info.kernel_id; + ob << make_data(&prim_info.runtime_precision, sizeof(data_types)); + ob << prim_info.is_cpu; + ob << prim_info.exec_id; + } + + ob << get_ext_id_mapping(); } network::ptr network::allocate_network(stream::ptr stream, program::ptr program, bool is_internal, bool is_primary_stream) { @@ -1130,7 +1175,7 @@ std::vector network::get_all_primitive_org_ids() const { } const program::primitives_info& network::get_primitives_info() const { - return _program->get_primitives_info(); + return (_program == nullptr) ? 
_prims_info : _program->get_primitives_info(); } const program::graph_optimizer_info& network::get_optimizer_passes_info() const { @@ -1138,6 +1183,10 @@ const program::graph_optimizer_info& network::get_optimizer_passes_info() const } std::map network::get_ext_id_mapping() const { + if (_program == nullptr) { + return _ext_id_mapping; + } + std::map result; for (auto& prim : _primitives) { result.emplace(prim.first, prim.second->get_node().get_primitive()->origin_op_name); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 9f7f4d5c42b445..2eb6b3ec55a776 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -321,10 +321,9 @@ bool primitive_inst::update_impl() { // Update param if fake_alignment is available auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params); auto impl_key = get_impl_key(updated_params); - auto& cache = get_network().get_implementations_cache(); + auto& cache = get_network().get_program()->get_implementations_cache(); bool has_cached_impl = false; { - std::lock_guard lock(get_network().get_impl_cache_mutex()); has_cached_impl = cache.has(impl_key); if (has_cached_impl) { _impl = cache.get(impl_key)->clone(); @@ -337,11 +336,13 @@ bool primitive_inst::update_impl() { } if (!has_cached_impl) { if (_dynamic_impl) { - auto& compilation_context = get_network().get_compilation_context(); - compilation_context.push_task(impl_key, [this, updated_params, impl_key](kernels_cache& kc) { - auto& cache = get_network().get_implementations_cache(); + auto& compilation_context = get_network().get_program()->get_compilation_context(); + compilation_context.push_task(impl_key, [this, &compilation_context, updated_params, impl_key]() { + if (compilation_context.is_stopped()) + return; + auto _program = get_network().get_program(); + auto& cache = _program->get_implementations_cache(); { - std::lock_guard 
lock(get_network().get_impl_cache_mutex()); // Check existense in the cache one more time as several iterations of model execution could happens and multiple compilation // tasks created for same shapes if (cache.has(impl_key)) @@ -349,13 +350,8 @@ bool primitive_inst::update_impl() { } auto impl = _node->type()->choose_impl(*_node, updated_params); - auto kernel_ids = kc.add_kernels_source(impl->get_kernels_source()); - impl->set_kernel_ids(kernel_ids); - kc.compile(); - impl->init_kernels(kc); - kc.reset(); - - std::lock_guard lock(get_network().get_impl_cache_mutex()); + auto kernels = _program->get_kernels_cache().compile(impl->get_kernels_source()); + impl->set_kernels(kernels); cache.add(impl_key, impl->clone()); }); _impl = _dynamic_impl->clone(); @@ -364,13 +360,9 @@ bool primitive_inst::update_impl() { update_shape_info(*_impl_params); } else { _impl = _node->type()->choose_impl(*_node, updated_params); - auto& kernels_cache = get_network().get_kernels_cache(); - auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source()); - _impl->set_kernel_ids(kernel_ids); - kernels_cache.compile(); - _impl->init_kernels(kernels_cache); - kernels_cache.reset(); - std::lock_guard lock(get_network().get_impl_cache_mutex()); + auto& kernels_cache = get_network().get_program()->get_kernels_cache(); + auto kernels = kernels_cache.compile(_impl->get_kernels_source()); + _impl->set_kernels(kernels); cache.add(impl_key, _impl->clone()); auto new_impl_str = _impl != nullptr ? 
_impl->get_kernel_name() : "nullptr"; @@ -526,7 +518,7 @@ void primitive_inst::rebuild_exec_deps( primitive_inst::primitive_inst(network& network) : _network(network) , _node(nullptr) - , _impl_params(nullptr) + , _impl_params(make_unique()) , _impl(nullptr) , _dynamic_impl(nullptr) , _outputs({memory::ptr()}) @@ -707,12 +699,11 @@ event::ptr primitive_inst::update_weights() { } else { GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string() << " to " << expected_layout.to_short_string() << std::endl; - auto& kernels_cache = get_network().get_kernels_cache(); - auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false); - kernels_cache.compile(); - kernel = kernels_cache.get_kernel(kernel_id); + auto& kernels_cache = get_network().get_program()->get_kernels_cache(); + auto kernels = kernels_cache.compile({weights_params.clKernel->code.kernelString}); + OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue"); + kernel = kernels.begin()->second; cache.add(kernel_key, kernel); - kernels_cache.reset(); } auto& stream = get_network().get_stream(); @@ -1158,8 +1149,6 @@ int32_t primitive_inst::get_index_in_deps(memory::cptr arg) const { } void primitive_inst::load(cldnn::BinaryInputBuffer& ib) { - _impl_params.release(); - _impl_params = make_unique(); _impl_params->load(ib); ib.setKernlImplParams(_impl_params.get()); diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 1462f80eb18a26..db9c4436d0f27c 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -18,6 +18,7 @@ #include "program_dump_graph.h" #include "sliding_window_utils.hpp" #include "program_helpers.h" +#include "compilation_context.hpp" #include "matrix_nms_inst.h" #include "roi_pooling_inst.h" @@ -109,17 +110,11 @@ program::program(engine& engine_ref, processing_order(), 
is_body_program(is_body_program), is_subgroup_local_block_io_supported(-1) { + _config.apply_user_properties(_engine.get_device_info()); init_primitives(); - set_options(); - query_local_block_io_supported(); - _task_executor = make_task_executor(_config); - GPU_DEBUG_INFO << "Program config\n" << config.to_string(); - - pm = std::unique_ptr(new pass_manager(*this)); + init_program(); prepare_nodes(topology); - _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_header_str())); program_node::reset_unique_id(); if (no_optimizations) { @@ -141,15 +136,9 @@ program::program(engine& engine_ref, _task_executor(task_executor), processing_order(), is_subgroup_local_block_io_supported(-1) { + _config.apply_user_properties(_engine.get_device_info()); init_primitives(); - set_options(); - query_local_block_io_supported(); - - _task_executor = make_task_executor(_config); - - _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_header_str())); - pm = std::unique_ptr(new pass_manager(*this)); + init_program(); prepare_nodes(nodes); build_program(is_internal); calc_nodes_hash(); @@ -160,11 +149,35 @@ program::program(engine& engine) _stream(_engine.create_stream({})), _config(), processing_order(), - is_subgroup_local_block_io_supported(-1) { } + is_subgroup_local_block_io_supported(-1) { + _config.apply_user_properties(_engine.get_device_info()); + } + program::~program() { query_local_block_io_supported(); } +void program::init_program() { + set_options(); + query_local_block_io_supported(); + + pm = std::unique_ptr(new pass_manager(*this)); + + _task_executor = make_task_executor(_config); + _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, + kernel_selector::KernelBase::get_db().get_batch_header_str())); + + _compilation_context = 
ICompilationContext::create(make_task_executor_config(_config, + "Task executor config for CompilationContext in GPU plugin")); + + _impls_cache = cldnn::make_unique(_impls_cache_capacity); + // Remove items of compilation context's internal queue when some impl is popped in kernels_cache + // compilation context's queue check duplication of inserted task + _impls_cache->set_remove_item_callback([this](std::pair>& item) { + get_compilation_context().remove_keys({item.first}); + }); +} + void program::init_primitives() { static bool is_initialized = false; if (!is_initialized) { @@ -198,8 +211,8 @@ static void adjust_num_cores(InferenceEngine::CPUStreamsExecutor::Config& config config._streams = std::min(config._streams, num_cores); } -std::shared_ptr program::make_task_executor(const ExecutionConfig& config) const { - InferenceEngine::CPUStreamsExecutor::Config task_executor_config("CPU Tasks executor for GPU plugin", 1); +InferenceEngine::CPUStreamsExecutor::Config program::make_task_executor_config(const ExecutionConfig& config, std::string tags) const { + InferenceEngine::CPUStreamsExecutor::Config task_executor_config(tags, 1); task_executor_config._streams = config.get_property(ov::compilation_num_threads); auto priority = config.get_property(ov::intel_gpu::hint::host_task_priority); switch (priority) { @@ -211,6 +224,11 @@ std::shared_ptr program::make_task_executor adjust_num_cores(task_executor_config); + return task_executor_config; +} + +std::shared_ptr program::make_task_executor(const ExecutionConfig& config) const { + InferenceEngine::CPUStreamsExecutor::Config task_executor_config = make_task_executor_config(config, "CPU Tasks executor for GPU plugin"); return std::make_shared(task_executor_config); } @@ -1713,3 +1731,8 @@ std::pair program::get_estimated_device_mem_usage() { void program::remove_kernel(kernel_id id) { _kernels_cache->remove_kernel(id); } + +void program::cancel_compilation_context() { + if (_compilation_context != nullptr) + 
_compilation_context->cancel(); +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gather_tree_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gather_tree_gpu_ref.cl index 0fe1b3d5f75f1e..f560d7655fca19 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gather_tree_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gather_tree_gpu_ref.cl @@ -26,8 +26,8 @@ KERNEL(gather_tree_gpu_ref)( } for (int parent = beam; time >= 0; time--) { - output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = step_input[INPUT0_GET_INDEX(time, batch, parent, 0)]; - parent = parent_input[INPUT1_GET_INDEX(time, batch, parent, 0)]; + output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(step_input[INPUT0_GET_INDEX(time, batch, parent, 0)]); + parent = (int)parent_input[INPUT1_GET_INDEX(time, batch, parent, 0)]; } bool finished = false; for (int time = 0; time < max_sequence_in_beam; time++) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl index 68904715da8d6d..948a737622b09d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl @@ -19,6 +19,9 @@ KERNEL(eltwise)( #if HAS_FUSED_OPS_DECLS , FUSED_OPS_DECLS #endif +#if IS_DYNAMIC_CROP + , int runtime_offset +#endif ) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl index 3dc30c7a88ffa9..d7c86c5bed361c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl @@ -4,27 +4,28 @@ #include "common.cl" -#define GET_FILTER_OS_IS_YX_ISV16_OSV16_INDEX(prefix, 
o, i, y, x, sub_group_size) \ - CAT(prefix, _OFFSET) + \ - ((o) % (sub_group_size)) + \ - (sub_group_size)*( \ - (x)*(sub_group_size)*CAT(prefix, _X_PITCH) + \ - (y)*(sub_group_size)*CAT(prefix, _Y_PITCH) + \ - ((i) % (sub_group_size)) + \ - ((i) / (sub_group_size))*(sub_group_size)*CAT(prefix, _IFM_PITCH) + \ - ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ +#define GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(prefix, o, i, y, x, osv, isv) \ + get_os_is_zyx_isv_osv_index( \ + o, i, 0, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + 1, \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + osv, \ + isv \ ) -#define GET_FILTER_OS_IS_ZYX_ISV16_OSV16_INDEX(prefix, o, i, z, y, x, sub_group_size) \ - CAT(prefix, _OFFSET) + \ - ((o) % (sub_group_size)) + \ - (sub_group_size)*( \ - (x)*(sub_group_size)*CAT(prefix, _X_PITCH) + \ - (y)*(sub_group_size)*CAT(prefix, _Y_PITCH) + \ - (z)*(sub_group_size)*CAT(prefix, _Z_PITCH) + \ - ((i) % (sub_group_size)) + \ - ((i) / (sub_group_size))*(sub_group_size)*CAT(prefix, _IFM_PITCH) + \ - ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ +#define GET_FILTER_OS_IS_ZYX_ISV_OSV_INDEX(prefix, o, i, z, y, x, osv, isv) \ + get_os_is_zyx_isv_osv_index( \ + o, i, z, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _SIZE_Z), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + osv, \ + isv \ ) #define GET_FILTER_IS_OS_ZYX_ISV16_OSV16_INDEX(prefix, o, i, z, y, x, sub_group_size) \ @@ -85,6 +86,32 @@ CAT(prefix, _OFFSET) \ ) +inline uint get_os_is_zyx_isv_osv_index(uint o, uint i, uint z, uint y, uint x, + uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size) +{ + const uint isv = i % isv_size; + const uint osv = o % osv_size; + const uint is = i / isv_size; + const uint os = o / osv_size; + + const uint x_pitch = osv_size * isv_size; + const uint y_pitch = x_pitch * x_size; + const uint z_pitch = y_pitch * y_size; + const uint is_pitch = z_pitch * z_size; + const 
uint os_pitch = is_pitch * ((i_size + isv_size - 1) / isv_size); + + const uint output_offset = + osv + + isv * osv_size + + x * x_pitch + + y * y_pitch + + z * z_pitch + + is * is_pitch + + os * os_pitch; + + return output_offset; +} + inline uint get_os_is_zyx_osv_isv_index(uint o, uint i, uint z, uint y, uint x, uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size) { @@ -329,7 +356,7 @@ inline uint get_os_zyxi_osv16_index(uint o, uint i, uint z, uint y, uint x, uint #define GET_FILTER_INDEX_5D_SAFE(prefix, g, o, i, z, y, x) GET_FILTER_GOIZYX_SAFE(prefix, g, o, i, z, y, x) -#define GET_FILTER_OS_IYX_OSV8_INDEX(prefix, o, i, y, x, sub_group_size) \ +#define GET_FILTER_OS_IYX_OSV_INDEX(prefix, o, i, y, x, sub_group_size) \ CAT(prefix, _OFFSET) + \ ((o) % (sub_group_size)) + \ (sub_group_size)*( \ @@ -339,7 +366,7 @@ inline uint get_os_zyxi_osv16_index(uint o, uint i, uint z, uint y, uint x, uint ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ ) -#define GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(prefix, o, i, y, x, sub_group_size) \ +#define GET_FILTER_OS_IYX_OSV_ROTATE_180_INDEX(prefix, o, i, y, x, sub_group_size) \ CAT(prefix, _OFFSET) + \ ((o) % (sub_group_size)) + \ (sub_group_size)*( \ @@ -1495,16 +1522,6 @@ inline uint get_os_i_yxs_osv_yxsv4_index(uint o, uint i, uint y, uint x, uint i_ CAT(prefix, _SIZE_Y), \ 4) -#define GET_FILTER_OS_IYX_OSV32__AI32_INDEX(prefix, o, i, y, x, sub_group_size) \ - CAT(prefix, _OFFSET) + \ - ((o) % (sub_group_size)) + \ - (sub_group_size)*( \ - (x)*CAT(prefix, _X_PITCH) + \ - (y)*CAT(prefix, _Y_PITCH) + \ - (i)*CAT(prefix, _IFM_PITCH) + \ - ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ - ) - #define GET_FILTER_G_OS_IYX_OSV16(prefix, g, o, i, y, x, sub_group_size) \ CAT(prefix, _OFFSET) + \ (g * CAT(prefix, _GROUPS_PITCH)) + \ diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl 
index 582c2f6c6c74df..147ab43e837ee0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl @@ -25,19 +25,20 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x return GET_FILTER_INDEX_5D(INPUT0, 0, o, i, z, y, x); #elif defined INPUT0_LAYOUT_OS_IYX_OSV16 || \ defined INPUT0_LAYOUT_OS_I_OSV16 || \ - defined INPUT0_LAYOUT_OS_I_OSV8__AI8 || \ defined INPUT0_LAYOUT_OS_I_OSV16__AI8 - return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 16); +#elif defined INPUT0_LAYOUT_OS_I_OSV8__AI8 + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 8); #elif defined INPUT0_LAYOUT_IYX_OSV32 - return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 32); #elif defined INPUT0_LAYOUT_OS_IYX_OSV32__AI32 - return GET_FILTER_OS_IYX_OSV32__AI32_INDEX(OUTPUT, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 32); #elif defined INPUT0_LAYOUT_O_IS_YX_ISV16 return GET_FILTER_O_IS_YX_ISV16_INDEX(INPUT0, o, i, y, x, 16); #elif defined INPUT0_LAYOUT_IYX_OSV64 - return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, 64); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 64); #elif defined INPUT0_LAYOUT_OS_IYX_OSV16_ROTATE_180 - return GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_ROTATE_180_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_I_YXS_OS_YXSV2_OSV16 return GET_FILTER_I_YXS_OS_YXSV2_OSV_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_IY_XS_OS_XSV2_OSV16__AO32 || defined OUTPUT_LAYOUT_IY_XS_OS_XSV2_OSV8__AO32 @@ -61,11 +62,11 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x #elif defined INPUT0_LAYOUT_OS_IS_Y_X8_OSV8_ISV4_SWIZZLED_BY_4 return 
GET_FILTER_OS_IS_Y_X8_OSV8_ISV4_SWIZZLED_BY_4(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_OS_IS_YX_ISV16_OSV16 - return GET_FILTER_OS_IS_YX_ISV16_OSV16_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(INPUT0, o, i, y, x, 16, 16); #elif defined INPUT0_LAYOUT_OIYX_O16 return GET_FILTER_OIYX_O16(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISV16_OSV16 - return GET_FILTER_OS_IS_ZYX_ISV16_OSV16_INDEX(INPUT0, o, i, z, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_ZYX_ISV_OSV_INDEX(INPUT0, o, i, z, y, x, 16, 16); #elif defined INPUT0_LAYOUT_IS_OS_ZYX_ISV16_OSV16 return GET_FILTER_IS_OS_ZYX_ISV16_OSV16_INDEX(INPUT0, o, i, z, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_IS_OS_YX_ISV16_OSV16 @@ -219,19 +220,20 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint return GET_FILTER_INDEX_5D(OUTPUT, 0, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV16 || \ defined OUTPUT_LAYOUT_OS_I_OSV16 || \ - defined OUTPUT_LAYOUT_OS_I_OSV8__AI8 || \ defined OUTPUT_LAYOUT_OS_I_OSV16__AI8 - return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 16); +#elif defined OUTPUT_LAYOUT_OS_I_OSV8__AI8 + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 8); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV32 - return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 32); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV32__AI32 - return GET_FILTER_OS_IYX_OSV32__AI32_INDEX(OUTPUT, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 32); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV64 - return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, 64); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 64); #elif defined OUTPUT_LAYOUT_O_IS_YX_ISV16 return GET_FILTER_O_IS_YX_ISV16_INDEX(OUTPUT, o, i, y, x, 16); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV16_ROTATE_180 - return 
GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_ROTATE_180_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_I_YXS_OS_YXSV2_OSV16 return GET_FILTER_I_YXS_OS_YXSV2_OSV_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_IY_XS_OS_XSV2_OSV16__AO32 || defined OUTPUT_LAYOUT_IY_XS_OS_XSV2_OSV8__AO32 @@ -313,11 +315,11 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4 return GET_FILTER_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_YX_ISV16_OSV16 - return GET_FILTER_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(OUTPUT, o, i, y, x, 16, 16); #elif defined OUTPUT_LAYOUT_OS_YXI_OSV16 return GET_FILTER_OS_YXI_OSV16(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISV16_OSV16 - return GET_FILTER_OS_IS_ZYX_ISV16_OSV16_INDEX(OUTPUT, o, i, z, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_ZYX_ISV_OSV_INDEX(OUTPUT, o, i, z, y, x, 16, 16); #elif defined OUTPUT_LAYOUT_IS_OS_ZYX_ISV16_OSV16 return GET_FILTER_IS_OS_ZYX_ISV16_OSV16_INDEX(OUTPUT, o, i, z, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_IS_OS_YX_ISV16_OSV16 diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp index 1851baae28cf00..bd33f857d18ee3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_base_opencl.cpp @@ -217,7 +217,12 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel, kernel.code.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; - kernel.params.arguments 
= GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims, number_of_outputs, is_dynamic); + kernel.params.arguments = GetArgsDesc(number_of_inputs, + weights, + bias, + number_of_inputs_for_fused_prims, + number_of_outputs, + is_dynamic); } bool KernelBaseOpenCL::layout_is_one_of(const MultiDataTensor& tensors, const std::vector& allowed_layouts) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_base.cpp index 7a801b098abdc3..240dae16b3c993 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_base.cpp @@ -311,9 +311,13 @@ JitConstants EltwiseKernelBase::MakeLoadJitConstants(const eltwise_params& param bool useVload8) const { JitConstants jit = {}; std::string vload_decls; + for (size_t op_num = 0; op_num < params.operations.size(); op_num++) { const std::string op_num_str = toCodeString(op_num); const auto &ew = params.operations[op_num]; + bool is_dynamic_crop_kernel = params.is_shape_agnostic && params.operations[op_num].mode == EltwiseMode::ASSIGN; + if (is_dynamic_crop_kernel) + jit.AddConstant(MakeJitConstant("IS_DYNAMIC_CROP", 1)); for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { const auto &input = ew.inputs[input_idx]; const std::string name = "INPUT_" + op_num_str + "_" + toCodeString(input_idx); @@ -330,7 +334,7 @@ JitConstants EltwiseKernelBase::MakeLoadJitConstants(const eltwise_params& param jit.AddConstant(MakeJitConstant(name, "input" + toCodeString(input.index) + "[GET_INDEX(INPUT, " + toCodeString(input.index) + - "," + idx_order + ")]")); + "," + idx_order + ") " + (is_dynamic_crop_kernel ? 
"+ runtime_offset]" : "]"))); break; case EltwiseInputMode::OUTPUT_BUFFER: jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT,,OUTPUT_IDX_ORDER)]")); @@ -711,7 +715,13 @@ KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const GetFusedPrimitiveInputsCount(params), 1, is_dynamic); - + if (params.is_shape_agnostic && newParams.operations[0].mode == EltwiseMode::ASSIGN) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + kernel_selector::ScalarDescriptor s; + s.t = kernel_selector::ScalarDescriptor::Types::UINT32; + s.v.u32 = 0; + kernel.params.scalars.push_back(s); + } return {kd}; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather_tree/gather_tree_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather_tree/gather_tree_kernel_ref.cpp index f8676f9128b846..93a22d110699c7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather_tree/gather_tree_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather_tree/gather_tree_kernel_ref.cpp @@ -14,9 +14,13 @@ ParamsKey GatherTreeKernelRef::GetSupportedKey() const { k.EnableInputDataType(Datatype::INT32); k.EnableOutputDataType(Datatype::INT32); + k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index b2370fa7951287..c699b379984c94 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -6,6 +6,7 @@ #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/graph/serialization/string_serializer.hpp" #include "intel_gpu/graph/serialization/utils.hpp" +#include 
"intel_gpu/graph/serialization/vector_serializer.hpp" #include "intel_gpu/plugin/graph.hpp" #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/plugin/infer_request.hpp" @@ -96,11 +97,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote std::string name; std::string precision; std::string layout; + InferenceEngine::SizeVector dims; ib >> name; ib >> precision; ib >> layout; + ib >> dims; DataPtr input = std::make_shared(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout)); + input->setDims(dims); InputInfo::Ptr infoNew = std::make_shared(); infoNew->setInputData(input); inputs.emplace(std::make_pair(name, infoNew)); @@ -115,11 +119,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote std::string name; std::string precision; std::string layout; + InferenceEngine::SizeVector dims; ib >> name; ib >> precision; ib >> layout; + ib >> dims; DataPtr output = std::make_shared(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout)); + output->setDims(dims); outputs.emplace(std::make_pair(name, output)); } @@ -234,6 +241,9 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote ib.seekg(pos); auto graph = std::make_shared(ib, context_impl, m_config, n); m_graphs.push_back(graph); + if (n == 0) { + ib.setNetwork(graph->GetNetwork().get()); + } } } @@ -317,14 +327,6 @@ IInferRequestInternal::Ptr CompiledModel::CreateInferRequest() { _callbackExecutor); } -bool CompiledModel::is_serializable() { - // Dynamic model serialization is not yet supported. 
- if (m_graphs[0]->GetNetwork()->is_dynamic()) - return false; - - return true; -} - // Cache blob format: // [ ConstInputsDataMap / ConstOutputsDataMap ] // [ ov::Node::Input/ ov::Node::Output ] @@ -334,9 +336,6 @@ void CompiledModel::Export(std::ostream& networkModel) { if (m_graphs.empty()) IE_THROW(NetworkNotLoaded); - if (!is_serializable()) - return; - cldnn::BinaryOutputBuffer ob(networkModel); // InputsInfo and OutputsInfo for CNNNetwork @@ -350,6 +349,7 @@ void CompiledModel::Export(std::ostream& networkModel) { std::stringstream ss; ss << in.second->getInputData()->getLayout(); ob << ss.str(); + ob << in.second->getTensorDesc().getDims(); } ob << GetOutputsInfo().size(); @@ -361,6 +361,7 @@ void CompiledModel::Export(std::ostream& networkModel) { std::stringstream ss; ss << out.second->getLayout(); ob << ss.str(); + ob << out.second->getTensorDesc().getDims(); } } diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index 986511d9f87439..0250c70e5358bc 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -82,6 +82,23 @@ Graph::Graph(cldnn::BinaryInputBuffer &ib, RemoteContextImpl::Ptr context, const m_program->AddVariableStateInfo(variablesStateInfo.first, *variablesStateInfo.second.begin()); } ib >> primitiveIDs; + ib >> prevPrimitiveIDs; + ib >> profilingIDs; + { + size_t perfMap_size; + ib >> perfMap_size; + for (size_t i = 0; i < perfMap_size; ++i) { + cldnn::primitive_id prim_id; + ib >> prim_id; + perfMap[prim_id].first = prim_id; + auto& perfEntry = perfMap[prim_id].second; + ib >> perfEntry.layerType; + ib >> cldnn::make_data(&perfEntry.status, sizeof(InferenceEngine::InferenceEngineProfileInfo::LayerStatus)); + perfEntry.cpu_uSec = perfEntry.realTime_uSec = 0; + ib >> perfEntry.isCPU; + ib >> perfEntry.parentPrimitive; + } + } ib >> outputDims; size_t num_networks; @@ -502,6 +519,18 @@ void Graph::Export(cldnn::BinaryOutputBuffer &ob) { ob << 
m_program->inputLayouts; ob << m_program->GetVariablesStatesInfo(); ob << primitiveIDs; + ob << prevPrimitiveIDs; + ob << profilingIDs; + { + ob << perfMap.size(); + for (auto& perf_item : perfMap) { + ob << perf_item.first; + ob << perf_item.second.second.layerType; + ob << cldnn::make_data(&perf_item.second.second.status, sizeof(InferenceEngine::InferenceEngineProfileInfo::LayerStatus)); + ob << perf_item.second.second.isCPU; + ob << perf_item.second.second.parentPrimitive; + } + } ob << outputDims; ob << m_networks.size(); @@ -597,6 +626,13 @@ std::map Graph::GetPer auto executedPrimitives = GetNetwork()->get_executed_primitives(); auto primitivesInfo = GetNetwork()->get_primitives_info(); auto extIdMap = GetNetwork()->get_ext_id_mapping(); + std::map implementation_info; + + if (GetNetwork()->get_program() == nullptr) { + for (auto& pi : primitivesInfo) { + implementation_info[pi.original_id] = pi.kernel_id; + } + } auto getUpperCaseName = [](std::string name) { std::vector res; @@ -641,7 +677,16 @@ std::map Graph::GetPer static const std::string cpuExecType("CPU"); cpuExecType.copy(extPerfEntry.exec_type, cpuExecType.length()); // Override execType as CPU } else { - std::string impl = GetNetwork()->get_implementation_info(primId); + std::string impl; + if (GetNetwork()->get_program() != nullptr) { + impl = GetNetwork()->get_implementation_info(primId); + } else { + if (implementation_info.find(primId) != implementation_info.end()) { + impl = implementation_info[primId]; + } else { + impl = "undef"; + } + } impl.copy(extPerfEntry.exec_type, impl.length()); } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 54630c5384aa40..aaced7fdc61dc6 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -142,12 +142,6 @@ Plugin::Plugin() : m_default_contexts({}) { m_default_contexts.insert({device.first, ctx}); } } - - if (const char* env_p = 
std::getenv("OV_GPU_CACHE_MODEL")) { - if (env_p[0] == '1') { - isModelCachingEnabled = true; - } - } } auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) { @@ -204,6 +198,9 @@ IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const InferenceEngine { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::LoadExeNetworkImpl::CreateExeNetwork"); CompiledModel::Ptr exeNetwork = std::make_shared(transformedNetwork, context, config); + if (exeNetwork->m_graphs[0]->GetNetwork()->is_dynamic()) { + isModelCachingEnabled = false; + } update_memory_statistics(context->get_impl()); return exeNetwork; } diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp index 7ca5f1acb3c0cd..6aaa2f7385df4c 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp @@ -54,7 +54,7 @@ std::string reorder_options(const std::string& org_options) { } // namespace namespace cldnn { - +std::atomic kernels_cache::_kernel_idx{0}; std::mutex kernels_cache::_mutex; std::string kernels_cache::get_cache_path() const { @@ -70,10 +70,8 @@ std::string kernels_cache::get_cache_path() const { } bool kernels_cache::is_cache_enabled() const { - if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) { - if (env_p[0] == '1') { - return false; - } + if (!_config.get_property(ov::intel_gpu::allow_new_shape_infer)) { + return false; } return !_config.get_property(ov::cache_dir).empty(); @@ -191,7 +189,7 @@ static std::vector getProgramBinaries(cl::Program program) { } // TODO: This build_batch method should be backend specific -void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch) { +void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch, std::map& compiled_kernels) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch"); auto& cl_build_engine = 
dynamic_cast(build_engine); @@ -288,7 +286,7 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& cl_context context = cl_build_engine.get_cl_context().get(); kernel::ptr kernel = kernels_factory::create(_engine, context, kern, entry_point); const auto& kmap = std::make_pair(k_id->second, kernel); - _kernels.insert(kmap); + compiled_kernels.insert(kmap); } else { throw std::runtime_error("Could not find entry point"); } @@ -393,7 +391,7 @@ void kernels_cache::build_all() { auto& batch = batches[idx]; tasks.push_back([this, &_build_engine, &batch, &exception] { try { - build_batch(_build_engine, batch); + build_batch(_build_engine, batch, _kernels); } catch(...) { exception = std::current_exception(); } @@ -407,7 +405,7 @@ void kernels_cache::build_all() { } } else { for (size_t idx = 0; idx < batches.size(); idx++) { - build_batch(_build_engine, batches[idx]); + build_batch(_build_engine, batches[idx], _kernels); } } @@ -438,10 +436,7 @@ std::vector kernels_cache::add_kernels_source(std::vector lock(_mutex); auto kernel_string = kernel_sources[i]; - // we need unique id in order to avoid conflict across topologies. 
- const auto kernel_num = _kernels.size() + (_kernel_idx++); - kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num); - + kernel_id id = gen_kernel_id(kernel_string->entry_point); auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program); assert(_kernels.find(id) == _kernels.end()); @@ -459,37 +454,10 @@ void kernels_cache::add_kernels(const std::vector& kernel_ids, cons for (size_t i = 0; i < kernel_ids.size(); i++) { const auto& kmap = std::make_pair(kernel_ids[i], kernels[i]); _kernels.insert(kmap); + _kernel_idx++; } } -void kernels_cache::compile() { - OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll"); - - std::unique_ptr _build_engine = nullptr; - if (_engine.type() == engine_types::ocl) { - _build_engine = std::unique_ptr(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl)); - } - - // create batches - std::vector batches; - get_program_source(_kernels_code, &batches); - - // build batches - for (size_t idx = 0; idx < batches.size(); idx++) { - build_batch(*_build_engine, batches[idx]); - } - - _kernels_code.clear(); - _pending_compilation = false; -#if defined(__unix__) && !defined(__ANDROID__) - // NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system thought they are freed. - // (It is at least 500 MB when we perform parallel compilation) - // It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory. - // Also, this is not happening in Windows. - // So, added malloc_trim for linux build until we figure out a better solution. 
- malloc_trim(0); -#endif -} void kernels_cache::save(BinaryOutputBuffer& ob) const { OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type"); @@ -572,6 +540,7 @@ void kernels_cache::load(BinaryInputBuffer& ib) { cl_context cl_context = build_engine->get_cl_context().get(); kernel::ptr kernel = kernels_factory::create(_engine, cl_context, cl_kernel, entry_point); _kernels.insert({k_id->second, kernel}); + _kernel_idx++; } } } @@ -584,4 +553,41 @@ void kernels_cache::load(BinaryInputBuffer& ib) { } } +std::map kernels_cache::compile(std::vector> kernel_sources, + bool dump_custom_program) { + OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::Compile_ThreadSafe"); + kernels_code t_kernels_code; + + // Get kernels code from kernel sources + for (size_t idx = 0; idx < kernel_sources.size(); ++idx) { + auto kernel_string = kernel_sources[idx]; + kernel_id id = gen_kernel_id(kernel_string->entry_point); + t_kernels_code.emplace(kernel_string, id, dump_custom_program); + } + + ocl::ocl_engine& _build_engine = downcast(_engine); + + // Create batches + std::vector batches; + get_program_source(t_kernels_code, &batches); + + std::map output_kernels; + // Build batches + for (size_t idx = 0; idx < batches.size(); ++idx) { + build_batch(_build_engine, batches[idx], output_kernels); + } + + t_kernels_code.clear(); +#if defined(__unix__) && !defined(__ANDROID__) + // NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system thought they are freed. + // (It is at least 500 MB when we perform parallel compilation) + // It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory. + // Also, this is not happening in Windows. + // So, added malloc_trim for linux build until we figure out a better solution. 
+ malloc_trim(0); +#endif + + return output_kernels; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp index aa2e1187a3b5f5..79f9ad625d3197 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp @@ -22,6 +22,7 @@ #include "ocl/ocl_engine.hpp" namespace cldnn { + class kernels_cache { public: using source_code = std::vector; @@ -81,18 +82,23 @@ class kernels_cache { ExecutionConfig _config; uint32_t _prog_id = 0; kernels_code _kernels_code; - size_t _kernel_idx = 0; + static std::atomic _kernel_idx; std::atomic _pending_compilation{false}; std::map _kernels; std::vector batch_header_str; void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; - void build_batch(const engine& build_engine, const batch_program& batch); + void build_batch(const engine& build_engine, const batch_program& batch, std::map& compiled_kernels); std::string get_cache_path() const; bool is_cache_enabled() const; size_t get_max_kernels_per_batch() const; + inline std::string gen_kernel_id(std::string entry_point) { + // we need unique id in order to avoid conflict across topologies. 
+ return entry_point + "_" + std::to_string((_kernel_idx++)); + } + public: explicit kernels_cache(engine& engine, const ExecutionConfig& config, @@ -116,9 +122,9 @@ class kernels_cache { } std::vector add_kernels_source(std::vector> kernel_sources, bool dump_custom_program = false); void add_kernels(const std::vector& kernel_ids, const std::vector& kernels); - void compile(); void save(BinaryOutputBuffer& ob) const; void load(BinaryInputBuffer& ib); + std::map compile(std::vector> kernel_sources, bool dump_custom_program = false); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp index 50b928dc25ef98..f4c9e20844abe7 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp @@ -211,6 +211,10 @@ clEnqueueMemFillINTEL_fn)( #define CL_DEVICE_UUID_KHR 0x106A +#endif // cl_khr_device_uuid + +#ifndef OV_GPU_USE_OPENCL_HPP + // for C++ wrappers using uuid_array = std::array; @@ -220,7 +224,7 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array) } // namespace detail } // namespace cl -#endif // cl_khr_device_uuid +#endif // OV_GPU_USE_OPENCL_HPP /*************************************************************** * cl_intel_device_attribute_query diff --git a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp index 59950f81cc9296..d88a740f441c80 100644 --- a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp @@ -1717,14 +1717,14 @@ TEST_P(conv_swap_xy_with_eltwise_diff_sizes, basic) { // in_shape; out_shape; eltw_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; #define CASE_CONV_ELTW_FP16_SWAP_XY_1 { 1, 16, 1, 5 }, { 1, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, 
{ 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx #define CASE_CONV_ELTW_FP16_SWAP_XY_2 { 1, 16, 1, 5 }, { 1, 32, 1, 7 }, { 1, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx -#define CASE_CONV_ELTW_FP32_SWAP_XY_1 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_SWAP_XY_2 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 3, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP16_SWAP_XY_3 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx +#define CASE_CONV_ELTW_FP16_SWAP_XY_4 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 3, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_swap_xy_with_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_1, 3, 3, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_2, 3, 3, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_SWAP_XY_1, 3, 3, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_SWAP_XY_2, 3, 3, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_1, 3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_2, 3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_3, 3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_4, 3, 2, 4 }, })); class 
conv_scale_activation_eltwise_fp32_quantize_i8 : public ConvEltwTest {}; diff --git a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp index e6becbacb9e6c6..0b3ebc78d1ada9 100644 --- a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp @@ -151,6 +151,9 @@ class FullyConnectedFusingTestOneDNN : public BaseFusingTest{ + fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 }, +})); + class fc_int8_quantize_u8 : public FullyConnectedFusingTest {}; TEST_P(fc_int8_quantize_u8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input", get_input_layout(p)), @@ -272,7 +307,7 @@ TEST_P(fc_int8_quantize_u8, basic) { execute(p); } -INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesIn(std::vector{ +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_quantize_u8, ::testing::ValuesIn(std::vector{ fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 }, fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 }, fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 }, @@ -283,6 +318,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesI class fc_int8_eltwise_quantize_i8 : public FullyConnectedFusingTest {}; TEST_P(fc_int8_eltwise_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input", get_input_layout(p)), @@ -315,6 +353,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
fc_int8_eltwise_quantize_i8, ::testing::Va class fc_int8_eltwise_activation_quantize_i8 : public FullyConnectedFusingTest {}; TEST_P(fc_int8_eltwise_activation_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input", get_input_layout(p)), diff --git a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp index 96f37db8fba43d..78a5781e93bf85 100644 --- a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp @@ -34,6 +34,10 @@ class BaseFusingTest : public ::testing::TestWithParam { cfg_fused.set_property(ov::intel_gpu::optimize_data(true)); cfg_not_fused.set_property(ov::intel_gpu::optimize_data(false)); cfg_not_fused.set_property(ov::intel_gpu::allow_static_input_reorder(true)); + if (engine.get_device_info().supports_immad) { + cfg_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); + cfg_not_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); + } } void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) { @@ -74,20 +78,15 @@ class BaseFusingTest : public ::testing::TestWithParam { ASSERT_EQ(outputs_ref.size(), outputs_fused.size()); ASSERT_EQ(outputs_ref.size(), size_t(1)); - auto output_not_fused_prim = outputs_ref.begin()->second.get_memory(); - auto output_fused_prim = outputs_fused.begin()->second.get_memory(); - if (output_not_fused_prim->get_layout().data_type == data_types::f32) { - cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); - cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); - for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { - ASSERT_NEAR(ref[i], output_ptr[i], tolerance) << "i = " << i; - } - } else { - cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); - cldnn::mem_lock 
output_ptr(output_fused_prim, get_test_stream()); - for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { - ASSERT_NEAR(half_to_float(ref[i]), half_to_float(output_ptr[i]), tolerance) << "i = " << i; - } + auto val_ref=get_output_values_to_float(not_fused, outputs_ref.begin()->first); + auto val_opt=get_output_values_to_float(fused, outputs_fused.begin()->first); + ASSERT_EQ(val_ref.size(), val_opt.size()); + for (size_t i = 0; i < val_ref.size(); i++) { + ASSERT_NEAR(val_ref[i], val_opt[i], tolerance) + << "tolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << val_ref[i] + << "\nopt[i] = " << val_opt[i]; } } diff --git a/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp index 33dd5009be0367..40bb22589ccb84 100644 --- a/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp @@ -74,11 +74,6 @@ class GemmFusingTest : public ::BaseFusingTest { } layout get_per_channel_layout(gemm_test_params& p) { - // WA: per channel binary post-operation is not supported for onednn gemm. Use single value for such case. - if (engine.get_device_info().supports_immad){ - std::cout << "per_channel layout for onednn gemm not supported." 
<< std::endl; - return layout{p.default_type, p.default_format, tensor{1, 1, 1, 1}}; - } return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shapes.at(0).feature[0], 1, 1 } }; } @@ -129,6 +124,9 @@ class GemmFusingTest : public ::BaseFusingTest { class gemm_3in_quantize_i8 : public GemmFusingTest {}; TEST_P(gemm_3in_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input0", get_input_layout(p, 0)), @@ -279,6 +277,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_scale, ::testing::ValuesIn(std::v class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; TEST_P(gemm_2in_act_scale_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input0", get_input_layout(p, 0)), diff --git a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp index 2c9d7d87750bba..736392016e2730 100644 --- a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp @@ -120,6 +120,9 @@ class ReduceFusingTest : public ::BaseFusingTest { class reduce_eltwise_activation_quantize : public ReduceFusingTest {}; TEST_P(reduce_eltwise_activation_quantize, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); update_out_shape(p); create_topologies( diff --git a/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp index cad2927bbbfabf..3aa5d30d3c780e 100644 --- a/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp @@ -21,6 +21,7 @@ struct strided_slice_test_params { data_types input_type; 
format input_format; size_t expected_fused_primitives; + size_t expected_fused_primitives_onednn; size_t expected_not_fused_primitives; std::vector> activation_func_list; }; @@ -64,6 +65,8 @@ TEST_P(strided_slice_activation, basic) { std::vector strides_data = { 1, 1, 1, 1 }; auto p = GetParam(); + if (engine.get_device_info().supports_immad) + p.expected_fused_primitives = p.expected_fused_primitives_onednn; create_topologies( input_layout("input", get_input_layout(p)), strided_slice("strided_slice", input_info("input"), begin_data, end_data, strides_data, {}, {}, {}, {}, {}, { 1, 8, 1, 1 }) @@ -84,7 +87,7 @@ TEST_P(strided_slice_activation, basic) { } INSTANTIATE_TEST_SUITE_P(fusings_gpu, strided_slice_activation, ::testing::ValuesIn(std::vector{ - strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 4, {{ activation_func::clamp, { } }, { activation_func::exp, { } }} }, - strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 3, {{ activation_func::logistic, { } } } }, - strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 3, {{ activation_func::hyperbolic_tan, { } } } }, + strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 2, 4, {{ activation_func::clamp, { } }, { activation_func::exp, { } }} }, + strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 2, 3, {{ activation_func::logistic, { } } } }, + strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 3, 3, {{ activation_func::hyperbolic_tan, { } } } }, })); diff --git a/src/plugins/intel_gpu/tests/passes/remove_redundant_reorders_tests.cpp b/src/plugins/intel_gpu/tests/passes/remove_redundant_reorders_tests.cpp new file mode 100644 index 00000000000000..b2c62a5506b302 --- /dev/null +++ b/src/plugins/intel_gpu/tests/passes/remove_redundant_reorders_tests.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include "intel_gpu/runtime/engine.hpp" + +#include "intel_gpu/graph/network.hpp" +#include 
"intel_gpu/graph/program.hpp" +#include "data_inst.h" +#include "convolution_inst.h" +#include "reorder_inst.h" +#include "softmax_inst.h" + +#include "pass_manager.h" +#include "to_string_utils.h" + +#include "program_wrapper.h" + +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(remove_redundant_reorders, remove_dep_dynamic) { + // Topology: + // convolution -> reorder -> softmax + // + // Expectation: + // The preferred format of convolution should be selected as b_fs_yx_fsv16 (reorder_inputs) + // A new reorder that converts to bfyx should be inserted after convolution (reorder_inputs) + // In reorders, output format of dependency reorder should be saved as output_format of orginial reorder (remove_redundant_reorders) + + auto& engine = get_test_engine(); + auto input_layout_dynamic = layout{ov::PartialShape{1, 3, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + data_types::f16, format::bfyx}; + auto input = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 3, 224, 224 } }); + auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 64, 3, 7, 7 } }); + + topology topology; + topology.add(data("weights", weights)); + topology.add(input_layout("input", input_layout_dynamic)); + topology.add(convolution("conv", input_info("input"), { "weights" })); + topology.add(reorder("reorder", input_info("conv"), format::any, data_types::f32)); + topology.add(softmax("softmax", input_info("reorder"), 1)); + + ExecutionConfig config; + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + + network.execute(); + + auto prog = network.get_program(); + ASSERT_NE(prog, nullptr); + auto& softmax_node = prog->get_node("softmax"); + auto softmax_layout = softmax_node.get_output_layout(); + + ASSERT_EQ(softmax_layout.format.value, format::bfyx); +} diff --git 
a/src/plugins/intel_gpu/tests/test_cases/crop_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/crop_gpu_test.cpp index 88fca81e8d1b71..0c147209c0b241 100644 --- a/src/plugins/intel_gpu/tests/test_cases/crop_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/crop_gpu_test.cpp @@ -8,6 +8,8 @@ #include #include +#include "crop_inst.h" + using namespace cldnn; using namespace ::tests; @@ -1342,9 +1344,9 @@ TEST(crop_gpu, dynamic_in1x4x1x1_split) { topology.add(crop("crop1", { input_info("input"), input_info("data") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) }, op_mode, 0, num_splits)); topology.add(crop("crop2", { input_info("input"), input_info("data") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) }, op_mode, 1, num_splits)); - std::vector input_vec = { -1, 2, -3, 4 }; - std::vector out1 = { -1, 2 }; - std::vector out2 = { -3, 4 }; + std::vector input_vec = { -1.0f, 2.0f, -3.0f, 4.0f }; + std::vector out1 = { -1.0f, 2.0f }; + std::vector out2 = { -3.0f, 4.0f }; set_values(input_mem, input_vec); ExecutionConfig config; config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); @@ -1355,14 +1357,21 @@ TEST(crop_gpu, dynamic_in1x4x1x1_split) { network.set_input_data("input", input_mem); auto outputs = network.execute(); - auto output = outputs.at("crop1").get_memory(); - cldnn::mem_lock output_ptr(output, get_test_stream()); + auto impl1 = network.get_primitive("crop1")->get_impl(); + ASSERT_TRUE(impl1 != nullptr); + ASSERT_TRUE(impl1->is_dynamic()); + auto impl2 = network.get_primitive("crop2")->get_impl(); + ASSERT_TRUE(impl2 != nullptr); + ASSERT_TRUE(impl2->is_dynamic()); + + auto output1 = outputs.at("crop1").get_memory(); + cldnn::mem_lock output_ptr_1(output1, get_test_stream()); for (size_t i = 0; i < out1.size(); i++) - 
ASSERT_EQ(output_ptr[i], out1[i]); + ASSERT_EQ(output_ptr_1[i], out1[i]); auto output_2 = outputs.at("crop2").get_memory(); - cldnn::mem_lock output_ptr_2(output_2, get_test_stream()); + cldnn::mem_lock output_ptr_2(output_2, get_test_stream()); for (size_t i = 0; i < out2.size(); i++) ASSERT_EQ(output_ptr_2[i], out2[i]); @@ -1399,9 +1408,9 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) { topology.add(crop("crop1", { input_info("input"), input_info("axis"), input_info("splits_length") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_1)), { tensor(feature(feature_offset_1), spatial(0,0),batch(0)) }, op_mode, 0)); topology.add(crop("crop2", { input_info("input"), input_info("axis"), input_info("splits_length") }, tensor(batch(crop_batch_num), spatial(crop_x_size, crop_y_size), feature(crop_feature_num_2)), { tensor(feature(feature_offset_2), spatial(0,0),batch(0)) }, op_mode, 1)); - std::vector input_vec = { -1, 2, -3, 4 }; - std::vector out1 = { -1, 2, -3 }; - std::vector out2 = { 4 }; + std::vector input_vec = { -1.0f, 2.0f, -3.0f, 4.0f }; + std::vector out1 = { -1.0f, 2.0f, -3.0f }; + std::vector out2 = { 4.0f }; std::vector splits_vec = {3, 1}; set_values(input_mem, input_vec); @@ -1417,14 +1426,21 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) { network.set_input_data("input", input_mem); auto outputs = network.execute(); + auto impl1 = network.get_primitive("crop1")->get_impl(); + ASSERT_TRUE(impl1 != nullptr); + ASSERT_TRUE(impl1->is_dynamic()); + auto impl2 = network.get_primitive("crop2")->get_impl(); + ASSERT_TRUE(impl2 != nullptr); + ASSERT_TRUE(impl2->is_dynamic()); + auto output = outputs.at("crop1").get_memory(); - cldnn::mem_lock output_ptr(output, get_test_stream()); + cldnn::mem_lock output_ptr(output, get_test_stream()); for (size_t i = 0; i < out1.size(); i++) ASSERT_EQ(output_ptr[i], out1[i]); auto output_2 = outputs.at("crop2").get_memory(); - cldnn::mem_lock output_ptr_2(output_2, 
get_test_stream()); + cldnn::mem_lock output_ptr_2(output_2, get_test_stream()); for (size_t i = 0; i < out2.size(); i++) ASSERT_EQ(output_ptr_2[i], out2[i]); diff --git a/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp index 052f7b206b9527..22ff33c3fd5c0c 100644 --- a/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp @@ -23,7 +23,7 @@ TEST(lru_cache, basic_data_type) input_values.push_back(std::make_pair(i, i + 10)); } - ASSERT_EQ(ca.get_lru_element(), int()); + ASSERT_EQ(ca.get_lru_element().second, int()); std::vector expected_hitted = {false, false, false, false, true, true, false}; for (size_t i = 0; i < input_values.size(); i++) { @@ -95,7 +95,7 @@ TEST(lru_cache, custom_data_type) { std::vector expected_hitted = {false, false, false, false, true, true, true, false}; - ASSERT_EQ(ca.get_lru_element(), std::shared_ptr()); + ASSERT_EQ(ca.get_lru_element().second, std::shared_ptr()); for (size_t i = 0; i < inputs.size(); i++) { auto& in = inputs[i]; std::shared_ptr p_data; diff --git a/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp index b8f9396d137931..e71a2b32bc53e3 100644 --- a/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp +++ b/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp @@ -287,7 +287,12 @@ std::vector> generic_test::generate_generic_test_pa } std::shared_ptr create_test_engine() { - return cldnn::engine::create(engine_types::ocl, runtime_types::ocl); + auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl); +#ifdef ENABLE_ONEDNN_FOR_GPU + if(ret->get_device_info().supports_immad) + ret->create_onednn_engine({}); +#endif + return ret; } cldnn::engine& get_test_engine() { diff --git a/src/plugins/intel_gpu/tests/test_utils/test_utils.h b/src/plugins/intel_gpu/tests/test_utils/test_utils.h index 
6993939d3e8c1d..d46935aee5384d 100644 --- a/src/plugins/intel_gpu/tests/test_utils/test_utils.h +++ b/src/plugins/intel_gpu/tests/test_utils/test_utils.h @@ -589,6 +589,26 @@ std::vector get_output_values_to_float(network& net, const primitive_id& ret.push_back(mem[i]); return ret; } + +inline std::vector get_output_values_to_float(network& net, const primitive_id& output_id, size_t max_cnt = std::numeric_limits::max()) { + switch(net.get_output_layout(output_id).data_type){ + case data_types::f16: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::f32: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::i8: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::u8: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::i32: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::i64: + return get_output_values_to_float(net, output_id, max_cnt); + default: + IE_THROW() << "Unknown output data_type"; + } +} + double default_tolerance(data_types dt); // inline void print_bin_blob(cldnn::memory& mem, std::string name) // { diff --git a/src/plugins/template/backend/CMakeLists.txt b/src/plugins/template/backend/CMakeLists.txt index 320fec922ecbae..04b16c14885a93 100644 --- a/src/plugins/template/backend/CMakeLists.txt +++ b/src/plugins/template/backend/CMakeLists.txt @@ -28,7 +28,6 @@ add_library(openvino::interpreter_backend ALIAS interpreter_backend) if(CMAKE_COMPILER_IS_GNUCXX) ie_add_compiler_flags(-Wno-missing-declarations) - ie_add_compiler_flags(-Wno-sign-compare) endif() ie_faster_build(interpreter_backend UNITY) diff --git a/src/plugins/template/src/async_infer_request.cpp b/src/plugins/template/src/async_infer_request.cpp index 74d3cfae77a10c..29532650b9d9f9 100644 --- a/src/plugins/template/src/async_infer_request.cpp +++ b/src/plugins/template/src/async_infer_request.cpp @@ -4,15 +4,16 @@ #include "async_infer_request.hpp" 
-#include "infer_request.hpp" #include "openvino/runtime/iinfer_request.hpp" +#include "sync_infer_request.hpp" #include "template_itt.hpp" // ! [async_infer_request:ctor] -TemplatePlugin::AsyncInferRequest::AsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& wait_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor) +ov::template_plugin::AsyncInferRequest::AsyncInferRequest( + const std::shared_ptr& request, + const std::shared_ptr& task_executor, + const std::shared_ptr& wait_executor, + const std::shared_ptr& callback_executor) : ov::IAsyncInferRequest(request, task_executor, callback_executor), m_wait_executor(wait_executor) { // In current implementation we have CPU only tasks and no needs in 2 executors @@ -46,7 +47,7 @@ TemplatePlugin::AsyncInferRequest::AsyncInferRequest(const std::shared_ptr -#include "infer_request.hpp" #include "openvino/runtime/iasync_infer_request.hpp" #include "openvino/runtime/iinfer_request.hpp" +#include "sync_infer_request.hpp" -namespace TemplatePlugin { +namespace ov { +namespace template_plugin { // ! [async_infer_request:header] class AsyncInferRequest : public ov::IAsyncInferRequest { public: AsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& wait_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor); + const std::shared_ptr& task_executor, + const std::shared_ptr& wait_executor, + const std::shared_ptr& callback_executor); ~AsyncInferRequest(); private: - InferenceEngine::ITaskExecutor::Ptr m_wait_executor; + std::shared_ptr m_wait_executor; }; // ! 
[async_infer_request:header] -} // namespace TemplatePlugin +} // namespace template_plugin +} // namespace ov diff --git a/src/plugins/template/src/compiled_model.cpp b/src/plugins/template/src/compiled_model.cpp index 6ea87593020f2f..4d420d09304620 100644 --- a/src/plugins/template/src/compiled_model.cpp +++ b/src/plugins/template/src/compiled_model.cpp @@ -9,78 +9,17 @@ #include "async_infer_request.hpp" #include "ie_ngraph_utils.hpp" #include "ie_plugin_config.hpp" +#include "openvino/runtime/properties.hpp" #include "plugin.hpp" #include "template/config.hpp" #include "template_itt.hpp" #include "transformations/utils/utils.hpp" -using namespace TemplatePlugin; - -namespace { - -InferenceEngine::SizeVector get_dims(const ov::Output& port) { - InferenceEngine::SizeVector dims = {}; - const auto& p_shape = port.get_partial_shape(); - if (p_shape.is_static()) - dims = p_shape.get_shape(); - return dims; -} - -} // namespace - -namespace ov { -namespace legacy_convert { - -void fill_input_info(const ov::Output& input, InferenceEngine::InputInfo::Ptr& input_info) { - if (!input_info) { - // Create input info - auto param_name = input.get_node()->get_friendly_name(); - auto dims = get_dims(input); - InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(input.get_element_type()), - dims, - InferenceEngine::TensorDesc::getLayoutByDims(dims)); - auto data = std::make_shared(param_name, desc); - input_info = std::make_shared(); - input_info->setInputData(data); - } - auto& rt_info = input.get_rt_info(); - auto it = rt_info.find("ie_legacy_preproc"); - if (it != rt_info.end()) { - input_info->getPreProcess() = it->second.as(); - } - it = rt_info.find("ie_legacy_td"); - if (it != rt_info.end()) { - auto td = it->second.as(); - input_info->getInputData()->reshape(td.getDims(), td.getLayout()); - input_info->setPrecision(td.getPrecision()); - } -} -void fill_output_info(const ov::Output& output, InferenceEngine::DataPtr& output_info) { - if 
(!output_info) { - // Create input info - const auto& res_name = ov::op::util::create_ie_output_name(output); - auto dims = get_dims(output); - InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(output.get_element_type()), - dims, - InferenceEngine::TensorDesc::getLayoutByDims(dims)); - output_info = std::make_shared(res_name, desc); - } - auto& rt_info = output.get_rt_info(); - auto it = rt_info.find("ie_legacy_td"); - if (it != rt_info.end()) { - auto td = it->second.as(); - output_info->reshape(td.getDims(), td.getLayout()); - output_info->setPrecision(td.getPrecision()); - } -} -} // namespace legacy_convert -} // namespace ov - // ! [executable_network:ctor_cnnnetwork] -TemplatePlugin::CompiledModel::CompiledModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const Configuration& cfg) +ov::template_plugin::CompiledModel::CompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const std::shared_ptr& task_executor, + const Configuration& cfg) : ov::ICompiledModel(model, plugin, task_executor), // Disable default threads creation _cfg(cfg), m_model(model) { @@ -89,8 +28,9 @@ TemplatePlugin::CompiledModel::CompiledModel(const std::shared_ptr& m // In this case, _waitExecutor should also be created per device. try { compile_model(m_model); - } catch (const InferenceEngine::Exception&) { - throw; + } catch (const InferenceEngine::Exception& e) { + // Some transformations can throw legacy exception + throw ov::Exception(e.what()); } catch (const std::exception& e) { OPENVINO_ASSERT(false, "Standard exception from compilation library: ", e.what()); } catch (...) 
{ @@ -103,7 +43,7 @@ TemplatePlugin::CompiledModel::CompiledModel(const std::shared_ptr& m // forward declaration void transform_model(const std::shared_ptr& model); -void TemplatePlugin::CompiledModel::compile_model(const std::shared_ptr& model) { +void ov::template_plugin::CompiledModel::compile_model(const std::shared_ptr& model) { // apply plugins transformations transform_model(model); // Perform any other steps like allocation and filling backend specific memory handles and so on @@ -111,44 +51,44 @@ void TemplatePlugin::CompiledModel::compile_model(const std::shared_ptr TemplatePlugin::CompiledModel::create_infer_request() const { +std::shared_ptr ov::template_plugin::CompiledModel::create_infer_request() const { auto internal_request = create_sync_infer_request(); - auto async_infer_request = - std::make_shared(std::static_pointer_cast(internal_request), - get_task_executor(), - get_template_plugin()->_waitExecutor, - get_callback_executor()); + auto async_infer_request = std::make_shared( + std::static_pointer_cast(internal_request), + get_task_executor(), + get_template_plugin()->_waitExecutor, + get_callback_executor()); return async_infer_request; } -std::shared_ptr TemplatePlugin::CompiledModel::create_sync_infer_request() const { +std::shared_ptr ov::template_plugin::CompiledModel::create_sync_infer_request() const { return std::make_shared( - std::static_pointer_cast(shared_from_this())); + std::static_pointer_cast(shared_from_this())); } // ! 
[executable_network:create_infer_request] -void TemplatePlugin::CompiledModel::set_property(const ov::AnyMap& properties) { +void ov::template_plugin::CompiledModel::set_property(const ov::AnyMap& properties) { OPENVINO_NOT_IMPLEMENTED; } -ov::RemoteContext TemplatePlugin::CompiledModel::get_context() const { +ov::RemoteContext ov::template_plugin::CompiledModel::get_context() const { OPENVINO_NOT_IMPLEMENTED; } -std::shared_ptr TemplatePlugin::CompiledModel::get_runtime_model() const { +std::shared_ptr ov::template_plugin::CompiledModel::get_runtime_model() const { return m_model; } -std::shared_ptr TemplatePlugin::CompiledModel::get_template_plugin() const { +std::shared_ptr ov::template_plugin::CompiledModel::get_template_plugin() const { auto plugin = get_plugin(); OPENVINO_ASSERT(plugin); - auto template_plugin = std::static_pointer_cast(plugin); + auto template_plugin = std::static_pointer_cast(plugin); OPENVINO_ASSERT(template_plugin); return template_plugin; } // ! [executable_network:get_config] -InferenceEngine::Parameter TemplatePlugin::CompiledModel::get_property(const std::string& name) const { +ov::Any ov::template_plugin::CompiledModel::get_property(const std::string& name) const { const auto& add_ro_properties = [](const std::string& name, std::vector& properties) { properties.emplace_back(ov::PropertyName{name, ov::PropertyMutability::RO}); }; @@ -179,7 +119,9 @@ InferenceEngine::Parameter TemplatePlugin::CompiledModel::get_property(const std return to_string_vector(metrics); } else if (EXEC_NETWORK_METRIC_KEY(SUPPORTED_CONFIG_KEYS) == name) { auto configs = default_rw_properties(); - auto streamExecutorConfigKeys = InferenceEngine::IStreamsExecutor::Config{}.SupportedKeys(); + auto streamExecutorConfigKeys = ov::threading::IStreamsExecutor::Config{} + .get_property(ov::supported_properties.name()) + .as>(); for (auto&& configKey : streamExecutorConfigKeys) { configs.emplace_back(configKey); } @@ -208,7 +150,7 @@ InferenceEngine::Parameter 
TemplatePlugin::CompiledModel::get_property(const std // ! [executable_network:get_config] // ! [executable_network:export] -void TemplatePlugin::CompiledModel::export_model(std::ostream& modelStream) const { +void ov::template_plugin::CompiledModel::export_model(std::ostream& modelStream) const { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::Export"); std::stringstream xmlFile, binFile; diff --git a/src/plugins/template/src/compiled_model.hpp b/src/plugins/template/src/compiled_model.hpp index 82e4455cf18582..ffc632708e4ae5 100644 --- a/src/plugins/template/src/compiled_model.hpp +++ b/src/plugins/template/src/compiled_model.hpp @@ -10,7 +10,8 @@ #include "openvino/runtime/tensor.hpp" #include "template_config.hpp" -namespace TemplatePlugin { +namespace ov { +namespace template_plugin { class Plugin; class InferRequest; @@ -24,7 +25,7 @@ class CompiledModel : public ov::ICompiledModel { public: CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, + const std::shared_ptr& task_executor, const Configuration& cfg); // Methods from a base class ov::ICompiledModel @@ -55,4 +56,5 @@ class CompiledModel : public ov::ICompiledModel { }; // ! [executable_network:header] -} // namespace TemplatePlugin +} // namespace template_plugin +} // namespace ov diff --git a/src/plugins/template/src/plugin.cpp b/src/plugins/template/src/plugin.cpp index 41cc097551e79b..3b2765172ed193 100644 --- a/src/plugins/template/src/plugin.cpp +++ b/src/plugins/template/src/plugin.cpp @@ -19,15 +19,13 @@ #include "transformations/op_conversions/convert_reduce_to_pooling.hpp" #include "transformations/template_pattern_transformation.hpp" -using namespace TemplatePlugin; - namespace { static constexpr const char* wait_executor_name = "TemplateWaitExecutor"; static constexpr const char* stream_executor_name = "TemplateStreamsExecutor"; } // namespace // ! 
[plugin:ctor] -Plugin::Plugin() { +ov::template_plugin::Plugin::Plugin() { // TODO: fill with actual device name, backend engine set_device_name("TEMPLATE"); @@ -35,12 +33,12 @@ Plugin::Plugin() { _backend = ngraph::runtime::Backend::create(); // create default stream executor with a given name - _waitExecutor = get_executor_manager()->getIdleCPUStreamsExecutor({wait_executor_name}); + _waitExecutor = get_executor_manager()->get_idle_cpu_streams_executor({wait_executor_name}); } // ! [plugin:ctor] // ! [plugin:dtor] -Plugin::~Plugin() { +ov::template_plugin::Plugin::~Plugin() { // Plugin should remove executors from executor cache to avoid threads number growth in the whole application get_executor_manager()->clear(stream_executor_name); get_executor_manager()->clear(wait_executor_name); @@ -49,11 +47,11 @@ Plugin::~Plugin() { } // ! [plugin:dtor] -ov::RemoteContext TemplatePlugin::Plugin::create_context(const ov::AnyMap& remote_properties) const { +ov::RemoteContext ov::template_plugin::Plugin::create_context(const ov::AnyMap& remote_properties) const { OPENVINO_NOT_IMPLEMENTED; } -ov::RemoteContext TemplatePlugin::Plugin::get_default_context(const ov::AnyMap& remote_properties) const { +ov::RemoteContext ov::template_plugin::Plugin::get_default_context(const ov::AnyMap& remote_properties) const { OPENVINO_NOT_IMPLEMENTED; } @@ -85,32 +83,34 @@ void transform_model(const std::shared_ptr& model) { // ! [plugin:transform_network] // ! 
[plugin:load_exe_network_impl] -std::shared_ptr TemplatePlugin::Plugin::compile_model(const std::shared_ptr& model, - const ov::AnyMap& properties) const { +std::shared_ptr ov::template_plugin::Plugin::compile_model( + const std::shared_ptr& model, + const ov::AnyMap& properties) const { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::compile_model"); auto fullConfig = Configuration{properties, _cfg}; auto streamsExecutorConfig = - InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(fullConfig._streamsExecutorConfig); + ov::threading::IStreamsExecutor::Config::make_default_multi_threaded(fullConfig._streamsExecutorConfig); streamsExecutorConfig._name = stream_executor_name; auto compiled_model = std::make_shared(model->clone(), shared_from_this(), - get_executor_manager()->getIdleCPUStreamsExecutor(streamsExecutorConfig), + get_executor_manager()->get_idle_cpu_streams_executor(streamsExecutorConfig), fullConfig); return compiled_model; } -std::shared_ptr TemplatePlugin::Plugin::compile_model(const std::shared_ptr& model, - const ov::AnyMap& properties, - const ov::RemoteContext& context) const { +std::shared_ptr ov::template_plugin::Plugin::compile_model( + const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::RemoteContext& context) const { OPENVINO_NOT_IMPLEMENTED; } // ! [plugin:load_exe_network_impl] // ! 
[plugin:import_network] -std::shared_ptr TemplatePlugin::Plugin::import_model(std::istream& model, - const ov::AnyMap& properties) const { +std::shared_ptr ov::template_plugin::Plugin::import_model(std::istream& model, + const ov::AnyMap& properties) const { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::import_model"); auto fullConfig = Configuration{properties, _cfg}; @@ -131,26 +131,26 @@ std::shared_ptr TemplatePlugin::Plugin::import_model(std::is auto ov_model = get_core()->read_model(xmlString, weights); auto streamsExecutorConfig = - InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(fullConfig._streamsExecutorConfig); + ov::threading::IStreamsExecutor::Config::make_default_multi_threaded(fullConfig._streamsExecutorConfig); streamsExecutorConfig._name = stream_executor_name; auto compiled_model = std::make_shared(ov_model, shared_from_this(), - get_executor_manager()->getIdleCPUStreamsExecutor(streamsExecutorConfig), + get_executor_manager()->get_idle_cpu_streams_executor(streamsExecutorConfig), fullConfig); return compiled_model; } -std::shared_ptr TemplatePlugin::Plugin::import_model(std::istream& model, - const ov::RemoteContext& context, - const ov::AnyMap& properties) const { +std::shared_ptr ov::template_plugin::Plugin::import_model(std::istream& model, + const ov::RemoteContext& context, + const ov::AnyMap& properties) const { OPENVINO_NOT_IMPLEMENTED; } // ! [plugin:import_network] // ! 
[plugin:query_network] -ov::SupportedOpsMap TemplatePlugin::Plugin::query_model(const std::shared_ptr& model, - const ov::AnyMap& properties) const { +ov::SupportedOpsMap ov::template_plugin::Plugin::query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { OV_ITT_SCOPED_TASK(TemplatePlugin::itt::domains::TemplatePlugin, "Plugin::query_model"); Configuration fullConfig{properties, _cfg, false}; @@ -194,13 +194,13 @@ ov::SupportedOpsMap TemplatePlugin::Plugin::query_model(const std::shared_ptr& properties) { properties.emplace_back(ov::PropertyName{name, ov::PropertyMutability::RO}); }; @@ -236,7 +236,9 @@ ov::Any TemplatePlugin::Plugin::get_property(const std::string& name, const ov:: return to_string_vector(metrics); } else if (METRIC_KEY(SUPPORTED_CONFIG_KEYS) == name) { auto configs = default_rw_properties(); - auto streamExecutorConfigKeys = InferenceEngine::IStreamsExecutor::Config{}.SupportedKeys(); + auto streamExecutorConfigKeys = ov::threading::IStreamsExecutor::Config{} + .get_property(ov::supported_properties.name()) + .as>(); for (auto&& configKey : streamExecutorConfigKeys) { if (configKey != InferenceEngine::PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) { configs.emplace_back(configKey); @@ -281,5 +283,5 @@ ov::Any TemplatePlugin::Plugin::get_property(const std::string& name, const ov:: // ! [plugin:create_plugin_engine] static const ov::Version version = {CI_BUILD_NUMBER, "openvino_template_plugin"}; -OV_DEFINE_PLUGIN_CREATE_FUNCTION(Plugin, version) +OV_DEFINE_PLUGIN_CREATE_FUNCTION(ov::template_plugin::Plugin, version) // ! 
[plugin:create_plugin_engine] diff --git a/src/plugins/template/src/plugin.hpp b/src/plugins/template/src/plugin.hpp index 1c45317522bc6b..0c04798e3a1c8d 100644 --- a/src/plugins/template/src/plugin.hpp +++ b/src/plugins/template/src/plugin.hpp @@ -8,10 +8,12 @@ #include "compiled_model.hpp" #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/iplugin.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" #include "template_config.hpp" //! [plugin:header] -namespace TemplatePlugin { +namespace ov { +namespace template_plugin { class Plugin : public ov::IPlugin { public: @@ -45,13 +47,14 @@ class Plugin : public ov::IPlugin { const ov::AnyMap& properties) const override; private: - friend class TemplatePlugin::CompiledModel; + friend class CompiledModel; friend class InferRequest; std::shared_ptr _backend; Configuration _cfg; - InferenceEngine::ITaskExecutor::Ptr _waitExecutor; + std::shared_ptr _waitExecutor; }; -} // namespace TemplatePlugin +} // namespace template_plugin +} // namespace ov //! [plugin:header] diff --git a/src/plugins/template/src/infer_request.cpp b/src/plugins/template/src/sync_infer_request.cpp similarity index 89% rename from src/plugins/template/src/infer_request.cpp rename to src/plugins/template/src/sync_infer_request.cpp index 8d2863c224cdb4..b6f2a6edf2c0f0 100644 --- a/src/plugins/template/src/infer_request.cpp +++ b/src/plugins/template/src/sync_infer_request.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "infer_request.hpp" +#include "sync_infer_request.hpp" #include #include @@ -31,7 +31,7 @@ void allocate_tensor_impl(ov::Tensor& tensor, const ov::element::Type& element_t } // namespace // ! 
[infer_request:ctor] -TemplatePlugin::InferRequest::InferRequest(const std::shared_ptr& model) +ov::template_plugin::InferRequest::InferRequest(const std::shared_ptr& model) : ov::ISyncInferRequest(model) { // TODO: allocate infer request device and host buffers if needed, fill actual list of profiling tasks @@ -75,23 +75,24 @@ TemplatePlugin::InferRequest::InferRequest(const std::shared_ptr> TemplatePlugin::InferRequest::query_state() const { +std::vector> ov::template_plugin::InferRequest::query_state() const { OPENVINO_NOT_IMPLEMENTED; } -std::shared_ptr TemplatePlugin::InferRequest::get_template_model() const { +std::shared_ptr ov::template_plugin::InferRequest::get_template_model() + const { auto& compiled_model = get_compiled_model(); - auto template_model = std::dynamic_pointer_cast(compiled_model); + auto template_model = std::dynamic_pointer_cast(compiled_model); OPENVINO_ASSERT(template_model); return template_model; } // ! [infer_request:dtor] -TemplatePlugin::InferRequest::~InferRequest() = default; +ov::template_plugin::InferRequest::~InferRequest() = default; // ! [infer_request:dtor] // ! [infer_request:infer_impl] -void TemplatePlugin::InferRequest::infer() { +void ov::template_plugin::InferRequest::infer() { // TODO: fill with actual list of pipeline stages, which are executed synchronously for sync infer requests infer_preprocess(); start_pipeline(); @@ -101,7 +102,7 @@ void TemplatePlugin::InferRequest::infer() { // ! [infer_request:infer_impl] // ! [infer_request:infer_preprocess] -void TemplatePlugin::InferRequest::infer_preprocess() { +void ov::template_plugin::InferRequest::infer_preprocess() { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[Preprocess]); auto start = Time::now(); convert_batched_tensors(); @@ -168,7 +169,7 @@ void TemplatePlugin::InferRequest::infer_preprocess() { // ! [infer_request:infer_preprocess] // ! 
[infer_request:start_pipeline] -void TemplatePlugin::InferRequest::start_pipeline() { +void ov::template_plugin::InferRequest::start_pipeline() { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[StartPipeline]) auto start = Time::now(); m_executable->call(m_backend_output_tensors, m_backend_input_tensors); @@ -176,7 +177,7 @@ void TemplatePlugin::InferRequest::start_pipeline() { } // ! [infer_request:start_pipeline] -void TemplatePlugin::InferRequest::wait_pipeline() { +void ov::template_plugin::InferRequest::wait_pipeline() { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[WaitPipeline]) auto start = Time::now(); // TODO: Wait pipeline using driver API or other synchronizations methods @@ -185,7 +186,7 @@ void TemplatePlugin::InferRequest::wait_pipeline() { } // ! [infer_request:infer_postprocess] -void TemplatePlugin::InferRequest::infer_postprocess() { +void ov::template_plugin::InferRequest::infer_postprocess() { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, m_profiling_task[Postprocess]); auto start = Time::now(); OPENVINO_ASSERT(get_outputs().size() == m_backend_output_tensors.size()); @@ -206,8 +207,8 @@ void TemplatePlugin::InferRequest::infer_postprocess() { // ! [infer_request:infer_postprocess] // ! 
[infer_request:set_blobs_impl] -void TemplatePlugin::InferRequest::set_tensors_impl(const ov::Output port, - const std::vector& tensors) { +void ov::template_plugin::InferRequest::set_tensors_impl(const ov::Output port, + const std::vector& tensors) { for (const auto& input : get_inputs()) { if (input == port) { m_batched_tensors[input.get_tensor_ptr()] = tensors; @@ -219,7 +220,7 @@ void TemplatePlugin::InferRequest::set_tensors_impl(const ov::Output TemplatePlugin::InferRequest::get_profiling_info() const { +std::vector ov::template_plugin::InferRequest::get_profiling_info() const { std::vector info; const auto fill_profiling_info = [](const std::string& name, const std::chrono::duration& time) -> ov::ProfilingInfo { diff --git a/src/plugins/template/src/infer_request.hpp b/src/plugins/template/src/sync_infer_request.hpp similarity index 89% rename from src/plugins/template/src/infer_request.hpp rename to src/plugins/template/src/sync_infer_request.hpp index 35b9c6f75b5811..49b9bdaa790666 100644 --- a/src/plugins/template/src/infer_request.hpp +++ b/src/plugins/template/src/sync_infer_request.hpp @@ -16,7 +16,8 @@ #include "openvino/itt.hpp" #include "openvino/runtime/isync_infer_request.hpp" -namespace TemplatePlugin { +namespace ov { +namespace template_plugin { // forward declaration class CompiledModel; @@ -24,7 +25,7 @@ class CompiledModel; // ! [infer_request:header] class InferRequest : public ov::ISyncInferRequest { public: - explicit InferRequest(const std::shared_ptr& compiled_model); + explicit InferRequest(const std::shared_ptr& compiled_model); ~InferRequest(); void infer() override; @@ -54,4 +55,5 @@ class InferRequest : public ov::ISyncInferRequest { }; // ! 
[infer_request:header] -} // namespace TemplatePlugin +} // namespace template_plugin +} // namespace ov diff --git a/src/plugins/template/src/template_config.cpp b/src/plugins/template/src/template_config.cpp index 582b1c6d0589bb..d24afae28857a4 100644 --- a/src/plugins/template/src/template_config.cpp +++ b/src/plugins/template/src/template_config.cpp @@ -9,23 +9,24 @@ #include "template/config.hpp" -using namespace TemplatePlugin; +using namespace ov::template_plugin; Configuration::Configuration() {} Configuration::Configuration(const ConfigMap& config, const Configuration& defaultCfg, bool throwOnUnsupported) { *this = defaultCfg; - // If plugin needs to use InferenceEngine::StreamsExecutor it should be able to process its configuration - auto streamExecutorConfigKeys = _streamsExecutorConfig.SupportedKeys(); + // If plugin needs to use ov::threading::StreamsExecutor it should be able to process its configuration + auto streamExecutorConfigKeys = + _streamsExecutorConfig.get_property(ov::supported_properties.name()).as>(); for (auto&& c : config) { const auto& key = c.first; const auto& value = c.second; if (ov::template_plugin::throughput_streams == key) { - _streamsExecutorConfig.SetConfig(CONFIG_KEY(CPU_THROUGHPUT_STREAMS), value.as()); + _streamsExecutorConfig.set_property(CONFIG_KEY(CPU_THROUGHPUT_STREAMS), value); } else if (streamExecutorConfigKeys.end() != std::find(std::begin(streamExecutorConfigKeys), std::end(streamExecutorConfigKeys), key)) { - _streamsExecutorConfig.SetConfig(key, value.as()); + _streamsExecutorConfig.set_property(key, value); } else if (CONFIG_KEY(DEVICE_ID) == key) { deviceId = std::stoi(value.as()); if (deviceId > 0) { @@ -42,11 +43,12 @@ Configuration::Configuration(const ConfigMap& config, const Configuration& defau } } -InferenceEngine::Parameter Configuration::Get(const std::string& name) const { - auto streamExecutorConfigKeys = _streamsExecutorConfig.SupportedKeys(); +ov::Any Configuration::Get(const std::string& name) 
const { + auto streamExecutorConfigKeys = + _streamsExecutorConfig.get_property(ov::supported_properties.name()).as>(); if ((streamExecutorConfigKeys.end() != std::find(std::begin(streamExecutorConfigKeys), std::end(streamExecutorConfigKeys), name))) { - return _streamsExecutorConfig.GetConfig(name); + return _streamsExecutorConfig.get_property(name); } else if (name == CONFIG_KEY(DEVICE_ID)) { return {std::to_string(deviceId)}; } else if (name == CONFIG_KEY(PERF_COUNT)) { @@ -54,7 +56,7 @@ InferenceEngine::Parameter Configuration::Get(const std::string& name) const { } else if (name == ov::template_plugin::throughput_streams || name == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { return {std::to_string(_streamsExecutorConfig._streams)}; } else if (name == CONFIG_KEY(CPU_BIND_THREAD)) { - return const_cast(_streamsExecutorConfig).GetConfig(name); + return _streamsExecutorConfig.get_property(name); } else if (name == CONFIG_KEY(CPU_THREADS_NUM)) { return {std::to_string(_streamsExecutorConfig._threads)}; } else if (name == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { diff --git a/src/plugins/template/src/template_config.hpp b/src/plugins/template/src/template_config.hpp index 74b578546e7241..a0a2bf8b41d26e 100644 --- a/src/plugins/template/src/template_config.hpp +++ b/src/plugins/template/src/template_config.hpp @@ -4,13 +4,14 @@ #pragma once -#include #include -#include #include -#include -namespace TemplatePlugin { +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" + +namespace ov { +namespace template_plugin { // ! 
[configuration:header] using ConfigMap = std::map; @@ -26,15 +27,16 @@ struct Configuration { const Configuration& defaultCfg = {}, const bool throwOnUnsupported = true); - InferenceEngine::Parameter Get(const std::string& name) const; + ov::Any Get(const std::string& name) const; // Plugin configuration parameters int deviceId = 0; bool perfCount = true; - InferenceEngine::IStreamsExecutor::Config _streamsExecutorConfig; + ov::threading::IStreamsExecutor::Config _streamsExecutorConfig; ov::hint::PerformanceMode performance_mode = ov::hint::PerformanceMode::UNDEFINED; }; // ! [configuration:header] -} // namespace TemplatePlugin +} // namespace template_plugin +} // namespace ov diff --git a/src/plugins/template/src/template_itt.hpp b/src/plugins/template/src/template_itt.hpp index 50a410bb07d228..b376682439ba82 100644 --- a/src/plugins/template/src/template_itt.hpp +++ b/src/plugins/template/src/template_itt.hpp @@ -11,10 +11,12 @@ #include -namespace TemplatePlugin { +namespace ov { +namespace template_plugin { namespace itt { namespace domains { OV_ITT_DOMAIN(TemplatePlugin); } } // namespace itt -} // namespace TemplatePlugin +} // namespace template_plugin +} // namespace ov diff --git a/src/tests/functional/plugin/conformance/subgraphs_dumper/include/ops_cache.hpp b/src/tests/functional/plugin/conformance/subgraphs_dumper/include/ops_cache.hpp index 1bce86d3c91b51..41a91ea5ceeccf 100644 --- a/src/tests/functional/plugin/conformance/subgraphs_dumper/include/ops_cache.hpp +++ b/src/tests/functional/plugin/conformance/subgraphs_dumper/include/ops_cache.hpp @@ -12,6 +12,8 @@ #include "matchers/matchers_manager.hpp" #include "functional_test_utils/include/functional_test_utils/summary/op_info.hpp" +#include "utils/model_wrap_struct.hpp" + namespace SubgraphsDumper { class OPCache { @@ -23,9 +25,9 @@ class OPCache { return std::unique_ptr(new OPCache()); } - void update_ops_cache(const std::shared_ptr &op, const std::string &source_model = {}); + void 
update_ops_cache(const std::shared_ptr &op, const Model& source_model); - void update_ops_cache(const std::shared_ptr &func, const bool extract_body = true, const std::string &source_model = {}); + void update_ops_cache(const std::shared_ptr &func, const Model& source_model, const bool extract_body = true); void serialize_cached_ops(const std::string &serialization_dir); diff --git a/src/tests/functional/plugin/conformance/subgraphs_dumper/include/utils/model_wrap_struct.hpp b/src/tests/functional/plugin/conformance/subgraphs_dumper/include/utils/model_wrap_struct.hpp index 63b7dd0cbe9f06..f92bcf53112650 100644 --- a/src/tests/functional/plugin/conformance/subgraphs_dumper/include/utils/model_wrap_struct.hpp +++ b/src/tests/functional/plugin/conformance/subgraphs_dumper/include/utils/model_wrap_struct.hpp @@ -4,6 +4,7 @@ #pragma once +#include "common_test_utils/file_utils.hpp" #include "functional_test_utils/ov_plugin_cache.hpp" namespace SubgraphsDumper { @@ -11,11 +12,17 @@ namespace SubgraphsDumper { struct Model { std::string path; size_t size = 0; + std::string name; + size_t op_cnt = 0; Model(std::string model) { path = model; + auto pos = model.rfind(CommonTestUtils::FileSeparator); + name = pos == std::string::npos ? model : CommonTestUtils::replaceExt(model.substr(pos + 1), ""); try { - size = ov::test::utils::PluginCache::get().core()->read_model(path)->get_graph_size(); + auto ov_model = ov::test::utils::PluginCache::get().core()->read_model(path); + size = ov_model->get_graph_size(); + op_cnt = ov_model->get_ops().size() - (ov_model->inputs().size() + ov_model->outputs().size()); } catch (...) 
{ std::cout << "Impossible to read network: " << path << std::endl; } diff --git a/src/tests/functional/plugin/conformance/subgraphs_dumper/src/main.cpp b/src/tests/functional/plugin/conformance/subgraphs_dumper/src/main.cpp index 5d27cc08362bb2..3a5d1925a7d998 100644 --- a/src/tests/functional/plugin/conformance/subgraphs_dumper/src/main.cpp +++ b/src/tests/functional/plugin/conformance/subgraphs_dumper/src/main.cpp @@ -112,7 +112,7 @@ void cacheModels(std::unique_ptr &cache, ret_code = 1; continue; } - cache->update_ops_cache(function, extract_body, model.path); + cache->update_ops_cache(function, model, extract_body); successful_models_file << model.path << std::endl; } catch (std::exception &e) { not_fully_cached_models_file << model.path << std::endl; diff --git a/src/tests/functional/plugin/conformance/subgraphs_dumper/src/ops_cache.cpp b/src/tests/functional/plugin/conformance/subgraphs_dumper/src/ops_cache.cpp index 94a149ae34ddf2..ecc5de853db6ee 100644 --- a/src/tests/functional/plugin/conformance/subgraphs_dumper/src/ops_cache.cpp +++ b/src/tests/functional/plugin/conformance/subgraphs_dumper/src/ops_cache.cpp @@ -14,11 +14,12 @@ using namespace SubgraphsDumper; void OPCache::update_ops_cache(const std::shared_ptr &op, - const std::string &source_model) { + const Model& source_model) { const std::shared_ptr cachedOp = [&] { for (auto &&it : m_ops_cache) { if (manager.match_any(it.first, op, it.second)) { - it.second.found_in_models[source_model] += 1; + it.second.found_in_models[source_model.name].unique_op_cnt += 1; + it.second.found_in_models[source_model.name].model_paths.insert({{source_model.path, source_model.op_cnt}}); return it.first; } } @@ -28,7 +29,7 @@ void OPCache::update_ops_cache(const std::shared_ptr &op, auto saveOpToCash = [&] { try { const auto& clone_fn = SubgraphsDumper::ClonersMap::cloners.at(op->get_type_info()); - LayerTestsUtils::OPInfo meta(source_model); + LayerTestsUtils::OPInfo meta(source_model.name, source_model.path, 
source_model.op_cnt); const std::shared_ptr op_clone = clone_fn(op, meta); if (!op_clone) { return; @@ -62,7 +63,7 @@ void OPCache::update_ops_cache(const std::shared_ptr &op, } } -void OPCache::update_ops_cache(const std::shared_ptr &func, const bool extract_body, const std::string &source_model) { +void OPCache::update_ops_cache(const std::shared_ptr &func, const Model& source_model, const bool extract_body) { size_t cached_ops_count = m_ops_cache.size(); for (const auto &op : func->get_ordered_ops()) { if (std::dynamic_pointer_cast(op) || @@ -81,16 +82,16 @@ void OPCache::update_ops_cache(const std::shared_ptr &func, const boo std::vector> bodies; for (size_t i = 0; i < if_op->get_internal_subgraphs_size(); i++) { auto if_body = if_op->get_function(i); - update_ops_cache(if_body, extract_body, source_model); + update_ops_cache(if_body, source_model, extract_body); } } else if (std::dynamic_pointer_cast(op)) { auto loop = std::dynamic_pointer_cast(op); auto loop_body = loop->get_function(); - update_ops_cache(loop_body, extract_body, source_model); + update_ops_cache(loop_body, source_model, extract_body); } else if (std::dynamic_pointer_cast(op)) { auto ti = std::dynamic_pointer_cast(op); auto ti_body = ti->get_body(); - update_ops_cache(ti_body, extract_body, source_model); + update_ops_cache(ti_body, source_model, extract_body); } } update_ops_cache(op, source_model); @@ -122,12 +123,25 @@ void OPCache::serialize_meta_info(const LayerTestsUtils::OPInfo &info, const std pugi::xml_document doc; pugi::xml_node root = doc.append_child("meta_info"); pugi::xml_node models = root.append_child("models"); - models.append_child("initial_model").append_attribute("name").set_value(info.source_model.c_str()); + double k = 0; for (const auto &model : info.found_in_models) { pugi::xml_node model_node = models.append_child("model"); model_node.append_attribute("name").set_value(model.first.c_str()); - model_node.append_attribute("count").set_value(static_cast(model.second)); 
+ double model_k = model.second.unique_op_cnt; + model_node.append_attribute("count").set_value(static_cast(model.second.unique_op_cnt)); + size_t tmp = 0; + for (const auto& model_path : model.second.model_paths) { + if (model_path.second) { + model_node.append_child("path").append_attribute("model").set_value(model_path.first.c_str()); + tmp += model_path.second; + } + } + model_k /= tmp; + model_k /= model.second.model_paths.size(); + k += model_k; } + k *= info.found_in_models.size(); + root.append_child("graph_priority").append_attribute("value").set_value(k); auto ports_info = root.append_child("ports_info"); for (const auto &port : info.ports_info) { auto port_node = ports_info.append_child("port"); @@ -164,7 +178,7 @@ OPCache::serialize_function(const std::pair, LayerTest const std::string &serialization_dir) { try { std::cout << "Serializing function wrapping op " << op.first << std::endl; - std::cout << "Taken from model: " << op.second.source_model << std::endl; + std::cout << "Taken from model: " << op.second.found_in_models.begin()->first << std::endl; ov::ParameterVector params; bool is_dynamic = false; diff --git a/src/tests/functional/plugin/conformance/test_runner/conformance_infra/include/read_ir_test/read_ir.hpp b/src/tests/functional/plugin/conformance/test_runner/conformance_infra/include/read_ir_test/read_ir.hpp index 43add7e79c5e6d..b38ec2d3343db6 100644 --- a/src/tests/functional/plugin/conformance/test_runner/conformance_infra/include/read_ir_test/read_ir.hpp +++ b/src/tests/functional/plugin/conformance/test_runner/conformance_infra/include/read_ir_test/read_ir.hpp @@ -34,7 +34,7 @@ class ReadIRTest : public testing::WithParamInterface, void SetUp() override; private: - std::string path_to_model, path_to_cache, source_model; + std::string path_to_model, path_to_cache; std::vector> ocurance_in_models; }; } // namespace subgraph diff --git a/src/tests/functional/plugin/conformance/test_runner/conformance_infra/src/read_ir_test/read_ir.cpp 
b/src/tests/functional/plugin/conformance/test_runner/conformance_infra/src/read_ir_test/read_ir.cpp index d96f52888e1467..397dcb9e6762f8 100644 --- a/src/tests/functional/plugin/conformance/test_runner/conformance_infra/src/read_ir_test/read_ir.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/conformance_infra/src/read_ir_test/read_ir.cpp @@ -145,9 +145,16 @@ void ReadIRTest::SetUp() { pugi::xml_document doc; doc.load_file(metaFile.c_str()); auto models = doc.child("meta_info").child("models"); - source_model = models.child("initial_model").attribute("name").as_string(); + size_t model_len = 0, occurance = 0; for (const auto &model : models.children("model")) { ocurance_in_models.push_back({model.attribute("name").as_string(), model.attribute("count").as_uint()}); + model_len++; + occurance += model.attribute("count").as_uint(); + } + rel_influence_coef = doc.child("meta_info").child("graph_priority").attribute("value").as_double(); + // TODO: remove after cache update w/a + if (rel_influence_coef == 0) { + rel_influence_coef = 1.f; } auto portsInfo = doc.child("meta_info").child("ports_info"); auto getPortInfo = [&](size_t id) { @@ -264,6 +271,7 @@ void ReadIRTest::SetUp() { if (inputShapes.empty()) { GTEST_SKIP() << "The graph is constant. 
The case is not applicable for Operation conformance scenario"; } + std::cout << "[ CONFORMANCE ] Influence coefficient: " << rel_influence_coef << std::endl; init_input_shapes(inputShapes); is_report_stages = true; } diff --git a/src/tests/functional/plugin/gpu/CMakeLists.txt b/src/tests/functional/plugin/gpu/CMakeLists.txt index 1c17179c8c7ecd..5687994e9bef7b 100644 --- a/src/tests/functional/plugin/gpu/CMakeLists.txt +++ b/src/tests/functional/plugin/gpu/CMakeLists.txt @@ -45,7 +45,9 @@ if (ENABLE_INTEL_CPU) set_source_files_properties( "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/ov_plugin/life_time.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/ov_plugin/properties_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/multi/gpu_remote_blob_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/infer_request/memory_states.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/ov_executable_network/exec_net_base.cpp" PROPERTIES COMPILE_DEFINITIONS ENABLE_INTEL_CPU=1) endif() diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/memory_states.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/memory_states.cpp new file mode 100644 index 00000000000000..c0272699c46051 --- /dev/null +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/infer_request/memory_states.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "behavior/infer_request/memory_states.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "ngraph_functions/builders.hpp" + +using namespace BehaviorTestsDefinitions; + +namespace { +std::vector memoryStateTestCases = { +#ifdef ENABLE_INTEL_CPU + memoryStateParams(InferRequestVariableStateTest::getNetwork(), 
+ {"c_1-3", "r_1-3"}, + CommonTestUtils::DEVICE_MULTI, + {{MULTI_CONFIG_KEY(DEVICE_PRIORITIES), + CommonTestUtils::DEVICE_GPU + std::string(",") + CommonTestUtils::DEVICE_CPU}}) +#endif +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, + InferRequestQueryStateExceptionTest, + ::testing::ValuesIn(memoryStateTestCases), + InferRequestQueryStateExceptionTest::getTestCaseName); +} // namespace diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp index b9cb611839a800..3c4b21202a58e7 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp @@ -34,12 +34,14 @@ auto configs = []() { }; auto Multiconfigs = []() { - return std::vector{{ov::device::priorities(CommonTestUtils::DEVICE_GPU)}, + return std::vector{ + {ov::device::priorities(CommonTestUtils::DEVICE_GPU)}, #ifdef ENABLE_INTEL_CPU - {ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU), - ov::intel_auto::device_bind_buffer(false)}, - {ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU), - ov::intel_auto::device_bind_buffer(true)} + {ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU), ov::enable_profiling(true)}, + {ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU), + ov::intel_auto::device_bind_buffer(false)}, + {ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU), + ov::intel_auto::device_bind_buffer(true)} #endif }; }; @@ -85,4 +87,19 @@ INSTANTIATE_TEST_SUITE_P(smoke_AutoBatch_BehaviorTests, OVInferRequestPerfCounte ::testing::Values(CommonTestUtils::DEVICE_BATCH), ::testing::ValuesIn(AutoBatchConfigs())), 
OVInferRequestPerfCountersTest::getTestCaseName); + +auto MulticonfigsTest = []() { + return std::vector{ +#ifdef ENABLE_INTEL_CPU + {ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU), + ov::device::priorities(CommonTestUtils::DEVICE_CPU, CommonTestUtils::DEVICE_GPU)} +#endif + }; +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, + OVInferRequestPerfCountersExceptionTest, + ::testing::Combine(::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(MulticonfigsTest())), + OVInferRequestPerfCountersExceptionTest::getTestCaseName); } // namespace diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp index f178723e025e86..64468aa0cece75 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp @@ -48,15 +48,15 @@ namespace { INSTANTIATE_TEST_SUITE_P(smoke_KernelCachingSupportCase_GPU, CompiledKernelsCacheTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), - ::testing::Values(std::make_pair(ov::AnyMap{}, "cl_cache"))), + ::testing::Values(std::make_pair(ov::AnyMap{}, "blob"))), CompiledKernelsCacheTest::getTestCaseName); auto autoConfigs = []() { return std::vector>{ - std::make_pair(ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_GPU)}}, "cl_cache"), + std::make_pair(ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_GPU)}}, "blob"), std::make_pair( ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU)}}, - "blob,cl_cache"), + "blob"), std::make_pair( ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_CPU, CommonTestUtils::DEVICE_GPU)}}, "blob")}; diff --git 
a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp index 1c10cea5ffe8ca..92dc383ad49c82 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp @@ -46,7 +46,7 @@ namespace { INSTANTIATE_TEST_SUITE_P(smoke_KernelCachingSupportCase_GPU, LoadNetworkCompiledKernelsCacheTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), - ::testing::Values(std::make_pair(std::map(), "cl_cache"))), + ::testing::Values(std::make_pair(std::map(), "blob"))), LoadNetworkCompiledKernelsCacheTest::getTestCaseName); typedef std::map conftype; @@ -54,10 +54,10 @@ namespace { return std::vector>{ std::make_pair(conftype{{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, CommonTestUtils::DEVICE_GPU}}, - "cl_cache"), + "blob"), std::make_pair(conftype{{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, (std::string(CommonTestUtils::DEVICE_GPU) + "," + CommonTestUtils::DEVICE_CPU)}}, - "blob,cl_cache"), + "blob"), std::make_pair(conftype{{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, (std::string(CommonTestUtils::DEVICE_CPU) + "," + CommonTestUtils::DEVICE_GPU)}}, "blob")}; diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_tree.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_tree.cpp index 82a94204d3114c..f70fb29ac97c49 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_tree.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_tree.cpp @@ -13,6 +13,7 @@ namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16, 
InferenceEngine::Precision::I32 }; diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp index 176e1e7dd35cc5..cd11d6a444501e 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp @@ -118,7 +118,7 @@ std::vector disabledTestPatterns() { R"(.*smoke_select_CompareWithRefsNumpy_dynamic_range.*)", R"(.*CachingSupportCase.*LoadNetworkCacheTestBase.*CompareWithRefImpl.*)", #if defined(_WIN32) || defined(_WIN64) - R"(.*Auto_KernelCachingSupportCase.*CanCreateCacheDirAndDumpBinariesUnicodePath.*)", + R"(.*KernelCachingSupportCase.*CanCreateCacheDirAndDumpBinariesUnicodePath.*)", #endif R"(.*CachingSupportCase.*GPU.*CompileModelCacheTestBase.*CompareWithRefImpl.*)", // Currently 1D convolution has an issue diff --git a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp index f27d9691ba9e50..7e38844f08b9ca 100644 --- a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp +++ b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp @@ -16,7 +16,7 @@ namespace GPULayerTestsDefinitions { typedef std::tuple< size_t, // Num splits - size_t, // Axis + int64_t, // Axis ElementType, // Net precision InputShape, // Input shapes std::vector // Used outputs indices @@ -52,7 +52,8 @@ class SplitLayerGPUDynamicTest : public testing::WithParamInterface outIndices; ElementType netPrecision; @@ -127,7 +128,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_SplitsCheck6D, SplitLayerGPUDynamicTest, SplitLayerGPUDynamicTest::getTestCaseName); typedef std::tuple< - size_t, // Axis + int64_t, // Axis std::vector, // SplitLength ElementType, // Net precision InputShape // Input shapes @@ -138,7 +139,7 @@ class VariadicSplitLayerGPUDynamicTest : public 
testing::WithParamInterface obj) { std::ostringstream result; - size_t axis; + int64_t axis; std::vector splitLength; ElementType netPrecision; InputShape inputShape; @@ -159,7 +160,7 @@ class VariadicSplitLayerGPUDynamicTest : public testing::WithParamInterface splitLength; ElementType netPrecision; diff --git a/src/tests/functional/plugin/shared/include/base/ov_behavior_test_utils.hpp b/src/tests/functional/plugin/shared/include/base/ov_behavior_test_utils.hpp index cbc4ec8c94be9f..73fc178a7209e8 100644 --- a/src/tests/functional/plugin/shared/include/base/ov_behavior_test_utils.hpp +++ b/src/tests/functional/plugin/shared/include/base/ov_behavior_test_utils.hpp @@ -50,6 +50,7 @@ class APIBaseTest : public CommonTestUtils::TestsCommon { const std::unique_ptr crashHandler = std::unique_ptr(new CommonTestUtils::CrashHandler()); protected: + size_t k = 1; std::string target_device = ""; ov::test::utils::ov_entity api_entity = ov::test::utils::ov_entity::undefined; ov::test::utils::ApiSummary& api_summary = ov::test::utils::ApiSummary::getInstance(); @@ -61,7 +62,10 @@ class APIBaseTest : public CommonTestUtils::TestsCommon { void SetUp() override { set_api_entity(); - api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::CRASHED); + auto test_name = this->GetTestName(); + k = test_name.find("_mandatory") != std::string::npos || test_name.find("mandatory_") != std::string::npos ? 
1 : 0; + std::cout << "[ CONFORMANCE ] Influence coefficient: " << k << std::endl; + api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::CRASHED, k); #ifdef _WIN32 jmpRes = setjmp(CommonTestUtils::env); #else @@ -70,7 +74,7 @@ class APIBaseTest : public CommonTestUtils::TestsCommon { if (jmpRes == CommonTestUtils::JMP_STATUS::ok) { crashHandler->StartTimer(); } else if (jmpRes == CommonTestUtils::JMP_STATUS::alarmErr) { - api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::HANGED); + api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::HANGED, k); GTEST_FAIL(); } } @@ -80,11 +84,11 @@ class APIBaseTest : public CommonTestUtils::TestsCommon { set_api_entity(); } if (this->HasFailure()) { - api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::FAILED); + api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::FAILED, k); } else if (this->IsSkipped()) { - api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::SKIPPED); + api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::SKIPPED, k); } else { - api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::PASSED); + api_summary.updateStat(api_entity, target_device, ov::test::utils::PassRate::Statuses::PASSED, k); } } }; diff --git a/src/tests/functional/plugin/shared/include/behavior/infer_request/memory_states.hpp b/src/tests/functional/plugin/shared/include/behavior/infer_request/memory_states.hpp index 1cadd168d7afc9..be24de8f6f3976 100644 --- a/src/tests/functional/plugin/shared/include/behavior/infer_request/memory_states.hpp +++ b/src/tests/functional/plugin/shared/include/behavior/infer_request/memory_states.hpp @@ -28,5 +28,8 @@ class InferRequestVariableStateTest : public BehaviorTestsUtils::IEInferRequestT public: void SetUp() override; static 
std::string getTestCaseName(const testing::TestParamInfo &obj); + static InferenceEngine::CNNNetwork getNetwork(); }; + +using InferRequestQueryStateExceptionTest = InferRequestVariableStateTest; } // namespace BehaviorTestsDefinitions diff --git a/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/perf_counters.hpp b/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/perf_counters.hpp index 4458a8b8a37882..5c4c9a8c0e434c 100644 --- a/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/perf_counters.hpp +++ b/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/perf_counters.hpp @@ -14,6 +14,7 @@ struct OVInferRequestPerfCountersTest : public virtual OVInferRequestTests { void SetUp() override; ov::InferRequest req; }; +using OVInferRequestPerfCountersExceptionTest = OVInferRequestPerfCountersTest; } // namespace behavior } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/src/behavior/infer_request/memory_states.cpp b/src/tests/functional/plugin/shared/src/behavior/infer_request/memory_states.cpp index 4666142e453d07..9d5136794c58a1 100644 --- a/src/tests/functional/plugin/shared/src/behavior/infer_request/memory_states.cpp +++ b/src/tests/functional/plugin/shared/src/behavior/infer_request/memory_states.cpp @@ -31,6 +31,36 @@ void InferRequestVariableStateTest::SetUp() { IEInferRequestTestBase::SetUp(); } +InferenceEngine::CNNNetwork InferRequestVariableStateTest::getNetwork() { + ngraph::Shape shape = {1, 200}; + ngraph::element::Type type = ngraph::element::f32; + + auto input = std::make_shared(type, shape); + auto mem_i1 = std::make_shared(type, shape, 0); + auto mem_r1 = std::make_shared(mem_i1, "r_1-3"); + auto mul1 = std::make_shared(mem_r1, input); + + auto mem_i2 = std::make_shared(type, shape, 0); + auto mem_r2 = std::make_shared(mem_i2, "c_1-3"); + auto mul2 = std::make_shared(mem_r2, mul1); + auto mem_w2 = std::make_shared(mul2, "c_1-3"); + + auto 
mem_w1 = std::make_shared(mul2, "r_1-3"); + auto sigm = std::make_shared(mul2); + sigm->set_friendly_name("sigmod_state"); + mem_r1->set_friendly_name("Memory_1"); + mem_w1->add_control_dependency(mem_r1); + sigm->add_control_dependency(mem_w1); + + mem_r2->set_friendly_name("Memory_2"); + mem_w2->add_control_dependency(mem_r2); + sigm->add_control_dependency(mem_w2); + + auto function = + std::make_shared(ngraph::NodeVector{sigm}, ngraph::ParameterVector{input}, "addOutput"); + return InferenceEngine::CNNNetwork{function}; +} + InferenceEngine::ExecutableNetwork InferRequestVariableStateTest::PrepareNetwork() { net.addOutput("Memory_1"); net.addOutput("Memory_2"); @@ -249,4 +279,11 @@ TEST_P(InferRequestVariableStateTest, inferreq_smoke_VariableState_2infers) { } } } + +TEST_P(InferRequestQueryStateExceptionTest, inferreq_smoke_QueryState_ExceptionTest) { + auto executableNet = PrepareNetwork(); + auto inferReq = executableNet.CreateInferRequest(); + + EXPECT_ANY_THROW(inferReq.QueryState()); +} } // namespace BehaviorTestsDefinitions diff --git a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/perf_counters.cpp b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/perf_counters.cpp index e7b607bd63a749..1ce23479b04de9 100644 --- a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/perf_counters.cpp +++ b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/perf_counters.cpp @@ -33,6 +33,11 @@ TEST_P(OVInferRequestPerfCountersTest, NotEmptyAfterSyncInfer) { OV_ASSERT_NO_THROW(perf = req.get_profiling_info()); ASSERT_FALSE(perf.empty()); } + +TEST_P(OVInferRequestPerfCountersExceptionTest, perfCountWereNotEnabledExceptionTest) { + EXPECT_ANY_THROW(req.get_profiling_info()); +} + } // namespace behavior } // namespace test } // namespace ov diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp 
b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp index 139335249f1077..d9089bc1ad8854 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp @@ -57,6 +57,7 @@ class SubgraphBaseTest : public CommonTestUtils::TestsCommon { ov::test::utils::OpSummary& summary = ov::test::utils::OpSummary::getInstance(); bool is_report_stages = false; + double rel_influence_coef = 1.f; virtual std::vector calculate_refs(); virtual std::vector get_plugin_outputs(); diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp index 80e5910d336be6..5b19b6453425ba 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp @@ -16,7 +16,7 @@ namespace LayerTestsDefinitions { typedef std::tuple< std::vector, // Num splits - size_t, // Axis + int64_t, // Axis InferenceEngine::Precision, // Net precision InferenceEngine::Precision, // Input precision InferenceEngine::Precision, // Output precision diff --git a/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp b/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp index 2b1179cddb00fe..04071328149af6 100644 --- a/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp +++ b/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp @@ -47,7 +47,7 @@ void SubgraphBaseTest::run() { ov::test::utils::PassRate::Statuses::SKIPPED : ov::test::utils::PassRate::Statuses::CRASHED; summary.setDeviceName(targetDevice); - summary.updateOPsStats(function, status); + summary.updateOPsStats(function, status, 
rel_influence_coef); if (isCurrentTestDisabled) GTEST_SKIP() << "Disabled test due to configuration" << std::endl; @@ -92,14 +92,14 @@ void SubgraphBaseTest::run() { status = ov::test::utils::PassRate::Statuses::FAILED; errorMessage = "Unknown failure occurred."; } - summary.updateOPsStats(function, status); + summary.updateOPsStats(function, status, rel_influence_coef); if (status != ov::test::utils::PassRate::Statuses::PASSED) { GTEST_FATAL_FAILURE_(errorMessage.c_str()); } } else if (jmpRes == CommonTestUtils::JMP_STATUS::anyError) { IE_THROW() << "Crash happens"; } else if (jmpRes == CommonTestUtils::JMP_STATUS::alarmErr) { - summary.updateOPsStats(function, ov::test::utils::PassRate::Statuses::HANGED); + summary.updateOPsStats(function, ov::test::utils::PassRate::Statuses::HANGED, rel_influence_coef); IE_THROW() << "Crash happens"; } } diff --git a/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp b/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp index b04a11e72467b7..b812895cc46acd 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp @@ -7,7 +7,7 @@ namespace LayerTestsDefinitions { std::string VariadicSplitLayerTest::getTestCaseName(const testing::TestParamInfo& obj) { - size_t axis; + int64_t axis; std::vector numSplits; InferenceEngine::Precision netPrecision; InferenceEngine::Precision inPrc, outPrc; @@ -30,7 +30,7 @@ namespace LayerTestsDefinitions { } void VariadicSplitLayerTest::SetUp() { - size_t axis; + int64_t axis; std::vector inputShape, numSplits; InferenceEngine::Precision netPrecision; std::tie(numSplits, axis, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShape, targetDevice) = this->GetParam(); diff --git a/src/tests/functional/shared_test_classes/src/subgraph/memory_LSTMCell.cpp b/src/tests/functional/shared_test_classes/src/subgraph/memory_LSTMCell.cpp index 
780405ef1f1f06..50319f7f229fcb 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/memory_LSTMCell.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/memory_LSTMCell.cpp @@ -252,7 +252,7 @@ namespace SubgraphTestsDefinitions { std::vector({1, 1, 1, hiddenSize})); auto final_reshape = std::make_shared(out_unsqueeze, final_reshape_pattern, false); - function = std::make_shared(final_reshape, input_parameter, "PureTI"); + function = std::make_shared(OutputVector{final_reshape, out_hidden, out_cell}, input_parameter, "PureTI"); } void MemoryLSTMCellTest::LoadNetwork() { diff --git a/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp b/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp index b59b29970ebb76..7d05cd29501295 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp @@ -8,7 +8,7 @@ namespace SubgraphTestsDefinitions { std::string VariadicSplitPad::getTestCaseName(const testing::TestParamInfo &obj) { InferenceEngine::SizeVector inputShape; - size_t axis; + int64_t axis; std::vector numSplits, connectIndexes; std::vector padsBegin, padsEnd; ngraph::helpers::PadMode padMode; @@ -31,7 +31,7 @@ std::string VariadicSplitPad::getTestCaseName(const testing::TestParamInfo numSplits, connectIndexes; std::vector padBegin, padEnd; ngraph::helpers::PadMode padMode; diff --git a/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp b/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp index 714b7d5d767a50..8dfe4b1038c794 100644 --- a/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp +++ b/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp @@ -73,14 +73,16 @@ inline bool strDoesnotContain(const std::string & str, const std::string & subst } \ } -#define OV_EXPECT_THROW(statement, exception, exception_what_matcher) \ - try { \ - 
GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - FAIL() << "Expected exception " << OV_PP_TOSTRING(exception); \ - } catch (const exception& ex) { \ - EXPECT_THAT(ex.what(), exception_what_matcher); \ - } catch (...) { \ - FAIL() << "Unknown exception"; \ +#define OV_EXPECT_THROW(statement, exp_exception, exception_what_matcher) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + FAIL() << "Expected exception " << OV_PP_TOSTRING(exp_exception); \ + } catch (const exp_exception& ex) { \ + EXPECT_THAT(ex.what(), exception_what_matcher); \ + } catch (const std::exception& e) { \ + FAIL() << "Unexpected exception " << e.what(); \ + } catch (...) { \ + FAIL() << "Unknown exception"; \ } inline void compare_blob(InferenceEngine::Blob::Ptr lhs, InferenceEngine::Blob::Ptr rhs) { diff --git a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/api_summary.hpp b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/api_summary.hpp index 5f77c0f35c0f33..d71808c6a08027 100644 --- a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/api_summary.hpp +++ b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/api_summary.hpp @@ -48,7 +48,7 @@ class ApiSummary : public virtual Summary { static ApiSummary &getInstance(); inline void getStatisticFromReport(const std::string& filePath); std::map> getApiStats() { return apiStats; } - void updateStat(ov_entity, const std::string& device, PassRate::Statuses); + void updateStat(ov_entity, const std::string& device, PassRate::Statuses, double rel_influence_coef = 1); void saveReport() override; }; diff --git a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_info.hpp b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_info.hpp index 123a20a3571b97..3f3e18559b2101 100644 --- 
a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_info.hpp +++ b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_info.hpp @@ -5,6 +5,18 @@ #pragma once namespace LayerTestsUtils { + +struct ModelInfo { + size_t unique_op_cnt; + // model_path, op_cnt + std::map model_paths; + + + ModelInfo(size_t _op_cnt = 0, const std::map& _model_paths = {{}}) + : unique_op_cnt(_op_cnt), + model_paths(_model_paths) {} +}; + struct PortInfo { double min; double max; @@ -20,12 +32,11 @@ struct PortInfo { }; struct OPInfo { - std::string source_model; - std::map found_in_models; + std::map found_in_models; std::map ports_info; - OPInfo(const std::string &source_model) : source_model(source_model) { - found_in_models = {{source_model, 1}}; + OPInfo(const std::string& source_model, const std::string& model_path, size_t total_op_cnt = 0) { + found_in_models = {{source_model, ModelInfo(1, {{model_path, total_op_cnt}})}}; ports_info = {}; } diff --git a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_summary.hpp b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_summary.hpp index 065875b5b2e522..62700ea41f9395 100644 --- a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_summary.hpp +++ b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/op_summary.hpp @@ -49,10 +49,10 @@ class OpSummary : public virtual Summary { std::map getStatisticFromReport(); void saveReport() override; - void updateOPsStats(const std::shared_ptr &model, const PassRate::Statuses &status); + void updateOPsStats(const std::shared_ptr &model, const PassRate::Statuses &status, double rel_influence_coef = 1); void updateOPsImplStatus(const std::shared_ptr &model, const bool implStatus); - void updateOPsStats(const ov::NodeTypeInfo &op, const PassRate::Statuses &status); + void updateOPsStats(const 
ov::NodeTypeInfo &op, const PassRate::Statuses &status, double rel_influence_coef = 1); void updateOPsImplStatus(const ov::NodeTypeInfo &op, const bool implStatus); }; diff --git a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/summary.hpp b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/summary.hpp index 76eddf3ad0ca12..efe33d0df25a28 100644 --- a/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/summary.hpp +++ b/src/tests/ie_test_utils/functional_test_utils/include/functional_test_utils/summary/summary.hpp @@ -31,14 +31,19 @@ struct PassRate { unsigned long hanged = 0; bool isImplemented = false; + double rel_passed = 0; + double rel_all = 0; + PassRate() = default; - PassRate(unsigned long p, unsigned long f, unsigned long s, unsigned long c, unsigned long h) { + PassRate(unsigned long p, unsigned long f, unsigned long s, unsigned long c, unsigned long h, double rel_p = 0, double rel_a = 0) { passed = p; failed = f; skipped = s; crashed = c; hanged = h; + rel_passed = rel_p; + rel_all = rel_a; if (!isImplemented && passed > 0) { isImplemented = true; } @@ -55,6 +60,14 @@ struct PassRate { return passed * 100.f / (passed + failed + skipped + crashed + hanged); } } + + double getRelPassrate() const { + if (rel_all == 0) { + return 100.f; + } else { + return rel_passed * 100.f / rel_all; + } + } }; class Summary { diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py index e8227958b18e9f..fcc526a44a003c 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py @@ -37,7 +37,7 @@ def parse_arguments(): def update_result_node(xml_node: SubElement, aggregated_res: SubElement): for attr_name in xml_node.attrib: - if attr_name 
== "passrate": + if attr_name == "passrate" or attr_name == "relative_passrate": continue if attr_name == "implemented": xml_value = xml_node.attrib.get(attr_name) == "true" @@ -45,8 +45,8 @@ def update_result_node(xml_node: SubElement, aggregated_res: SubElement): str_value = "true" if xml_value or aggregated_value else "false" aggregated_res.set(attr_name, str_value) continue - xml_value = int(xml_node.attrib.get(attr_name)) - aggregated_value = int(aggregated_res.attrib.get(attr_name)) + xml_value = float(xml_node.attrib.get(attr_name)) if "relative_" in attr_name else int(xml_node.attrib.get(attr_name)) + aggregated_value = float(aggregated_res.attrib.get(attr_name)) if "relative_" in attr_name else int(aggregated_res.attrib.get(attr_name)) # if attr_name == "crashed" and xml_value > 0: # print("f") aggregated_res.set(attr_name, str(xml_value + aggregated_value)) diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py index cface67efbb0cd..83ca732e2871b6 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py @@ -393,19 +393,28 @@ def __save_log(logs_dir, dir, test_name): os.mkdir(os.path.join(logs_dir, test_st)) hash_map = dict() test_times = list() + fix_priority = list() for log in Path(self._working_dir).rglob("log_*.log"): log_filename = os.path.join(self._working_dir, log) with open(log_filename, "r") as log_file: test_name = None test_log = list() dir = None + ref_k = None test_cnt_expected = test_cnt_real_saved_now = test_cnt_real_saved_before = 0 - for line in log_file.readlines(): + try: + lines = log_file.readlines() + except: + lines = log.read_text(encoding='ascii', errors='ignore').split('\n') + + for line in lines: if constants.GTEST_FILTER in line: line = line[line.find(constants.GTEST_FILTER):] 
test_cnt_expected = line.count(':') if constants.RUN in line: test_name = line[line.find(constants.RUN) + len(constants.RUN) + 1:-1:] + if constants.REF_COEF in line: + ref_k = float(line[line.rfind(' ') + 1:]) if dir is None: for test_st, mes_list in constants.TEST_STATUS.items(): for mes in mes_list: @@ -427,6 +436,9 @@ def __save_log(logs_dir, dir, test_name): test_results[dir] += 1 else: test_results[dir] = 1 + if dir != "passed" and ref_k != None: + fix_priority.append((ref_k, test_name)) + ref_k = None test_cnt_real_saved_now += 1 test_name = None test_log = list() @@ -453,6 +465,15 @@ def __save_log(logs_dir, dir, test_name): dir, name = st csv_writer.writerow([dir, hash, name]) logger.info(f"Hashed test list is saved to: {hash_table_path}") + if len(fix_priority) > 0: + fix_priority_path = os.path.join(logs_dir, "fix_priority.csv") + with open(fix_priority_path, "w") as csv_file: + fix_priority.sort(reverse=True) + csv_writer = csv.writer(csv_file, dialect='excel') + csv_writer.writerow(["Test Name", "Fix Priority"]) + for priority, name in fix_priority: + csv_writer.writerow([name, priority]) + logger.info(f"Fix priorities list is saved to: {fix_priority_path}") disabled_tests_path = os.path.join(logs_dir, "disabled_tests.lst") @@ -493,4 +514,4 @@ def __save_log(logs_dir, dir, test_name): logger.error("Run is not successful") sys.exit(-1) else: - logger.info("Run is successful") \ No newline at end of file + logger.info("Run is successful") diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py index 357fabb6949870..1cf9a370467442 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py @@ -21,7 +21,7 @@ NOT_RUN = "NOT RUN" NA = "N/A" -STATUS_CSV_ORDER = ["implemented", "passed", "failed", "skipped", "crashed", "hanged", 
"passrate"] +STATUS_CSV_ORDER = ["implemented", "passed", "failed", "skipped", "crashed", "hanged", "passrate", "relative_passrate"] logger = get_logger('conformance_summary') @@ -115,9 +115,13 @@ def collect_statistic(root: Element, is_conformance_mode: bool): logger.info("Statistic collecting is started") trusted_ops = dict() pass_rate_avg = dict() + pass_rate_avg_rel = dict() general_pass_rate = dict() + general_pass_rate_rel = dict() general_test_count = dict() + general_test_count_rel = dict() general_passed_tests = dict() + general_passed_tests_rel = dict() op_res = dict() results = dict() @@ -126,8 +130,11 @@ def collect_statistic(root: Element, is_conformance_mode: bool): results[device.tag] = {op.tag: op.attrib for op in device} pass_rate_avg[device.tag] = 0 + pass_rate_avg_rel[device.tag] = 0 general_test_count[device.tag] = 0 + general_test_count_rel[device.tag] = 0 general_passed_tests[device.tag] = 0 + general_passed_tests_rel[device.tag] = 0 trusted_ops[device.tag] = 0 covered_ops[device.tag] = 0 for op in results[device.tag]: @@ -139,25 +146,34 @@ def collect_statistic(root: Element, is_conformance_mode: bool): if op_test_cnt == 0: continue covered_ops[device.tag] += 1 - pass_rate = round(float(results[device.tag][op]["passrate"]), 1) + pass_rate = float("%.2f"%float(results[device.tag][op]["passrate"])) + relative_pass_rate = float("%.2f"%float(results[device.tag][op]["relative_passrate"])) results[device.tag][op]["passrate"] = pass_rate + results[device.tag][op]["relative_passrate"] = relative_pass_rate - pass_rate_avg[device.tag] += pass_rate if pass_rate == 100.: trusted_ops[device.tag] += 1 device_general_test_count = op_test_cnt general_test_count[device.tag] += device_general_test_count + general_test_count_rel[device.tag] += float(results[device.tag][op]["relative_all"]) general_passed_tests[device.tag] += int(results[device.tag][op]["passed"]) + general_passed_tests_rel[device.tag] += float(results[device.tag][op]["relative_passed"]) + 
pass_rate_avg[device.tag] += float(results[device.tag][op]["passrate"]) + pass_rate_avg_rel[device.tag] += float(results[device.tag][op]["relative_passrate"]) if op in op_res.keys(): op_res[op].update({device.tag: device_general_test_count}) else: op_res.update({op: {device.tag: device_general_test_count}}) pass_rate_avg[device.tag] = 0 if covered_ops[device.tag] == 0 else pass_rate_avg[device.tag] / covered_ops[device.tag] - pass_rate_avg[device.tag] = round(float(pass_rate_avg[device.tag]), 1) + pass_rate_avg[device.tag] = float("%.2f"%float(pass_rate_avg[device.tag])) + pass_rate_avg_rel[device.tag] = 0 if covered_ops[device.tag] == 0 else pass_rate_avg_rel[device.tag] / covered_ops[device.tag] + pass_rate_avg_rel[device.tag] = float("%.2f"%float(pass_rate_avg_rel[device.tag])) general_pass_rate[device.tag] = 0 if general_test_count[device.tag] == 0 else (general_passed_tests[device.tag] * 100 / general_test_count[device.tag]) - general_pass_rate[device.tag] = round(float(general_pass_rate[device.tag]), 1) - trusted_ops[device.tag] = round(float(trusted_ops[device.tag] * 100) / covered_ops[device.tag], 1) if device.tag in covered_ops and covered_ops[device.tag] != 0 else 0 + general_pass_rate[device.tag] = float("%.2f"%float(general_pass_rate[device.tag])) + general_pass_rate_rel[device.tag] = 0 if general_test_count_rel[device.tag] == 0 else (general_passed_tests_rel[device.tag] * 100 / general_test_count_rel[device.tag]) + general_pass_rate_rel[device.tag] = float("%.2f"%float(general_pass_rate_rel[device.tag])) + trusted_ops[device.tag] = float("%.2f"%(float("%.2f"%(float(trusted_ops[device.tag]) * 100)) / covered_ops[device.tag])) if device.tag in covered_ops and covered_ops[device.tag] != 0 else 0 logger.info("Test number comparison between devices is started") for op in op_res: @@ -177,7 +193,7 @@ def collect_statistic(root: Element, is_conformance_mode: bool): devices = results.keys() logger.info("Statistic collecting is completed") - return devices, 
results, general_pass_rate, pass_rate_avg, general_test_count, trusted_ops, covered_ops + return devices, results, general_pass_rate, general_pass_rate_rel, pass_rate_avg, pass_rate_avg_rel, general_test_count, trusted_ops, covered_ops def format_string(input_str: str): @@ -231,7 +247,7 @@ def create_summary(summary_root: Element, output_folder: os.path, expected_devic if is_conformance_mode: stat_update_utils.update_conformance_test_counters(summary_root) stat_update_utils.update_passrates(summary_root.find("results")) - device_list, results, general_pass_rate, pass_rate_avg, general_test_count, trusted_ops, covered_ops = \ + device_list, results, general_pass_rate, general_pass_rate_rel, pass_rate_avg, pass_rate_avg_rel, general_test_count, trusted_ops, covered_ops = \ collect_statistic(summary_root, is_conformance_mode) op_list = list() @@ -264,7 +280,8 @@ def create_summary(summary_root: Element, output_folder: os.path, expected_devic template = env.get_template('report_template.html') res_summary = template.render(ordered_ops=op_list, devices=device_list, results=results, timestamp=timestamp, - general_pass_rate=general_pass_rate, pass_rate_avg=pass_rate_avg, + general_pass_rate=general_pass_rate, general_pass_rate_rel=general_pass_rate_rel, + pass_rate_avg=pass_rate_avg, pass_rate_avg_rel=pass_rate_avg_rel, trusted_ops=trusted_ops, covered_ops=covered_ops, general_test_count=general_test_count, report_tag=report_tag, report_version=report_version) @@ -279,5 +296,11 @@ def create_summary(summary_root: Element, output_folder: os.path, expected_devic if __name__ == "__main__": args = parse_arguments() summary_root = merge_xmls(args.xml) - create_summary(summary_root, args.out, [] if args.expected_devices is None else args.expected_devices, args.report_tag, args.report_version, args.conformance_mode, args.csv, args.output_filename) + create_summary(summary_root, args.out, + [] if args.expected_devices is None else args.expected_devices, + args.report_tag, + 
args.report_version, + args.conformance_mode, + args.csv, + args.output_filename) diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/filters.js b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/filters.js index 7056b37ddfa57d..e851a1c32247b7 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/filters.js +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/filters.js @@ -195,10 +195,10 @@ function calculateColumnStatistics(device) { if (!all_operations) { trusted_op = "---"; } else { - trusted_op = (count_trusted_op * 100 / all_operations).toFixed(1) + ' %'; + trusted_op = (count_trusted_op * 100 / all_operations).toFixed(2) + ' %'; } $('#statistic .table-primary.' + device + '.trusted-ops').text(trusted_op); - $('#statistic .table-primary.' + device + '.test_total').text(all_operations || 0); + // $('#statistic .table-primary.' + device + '.test_total').text(all_operations || 0); // tested op_counter tested_op_count = 0; @@ -218,7 +218,7 @@ function calculateColumnStatistics(device) { $('#statistic .table-primary.' + device + '.general_pass_rate').text('---'); } else { - general_pass_rate = (passed_tested_op_count * 100 / tested_op_count).toFixed(1) + ' %'; + general_pass_rate = (passed_tested_op_count * 100 / tested_op_count).toFixed(2) + ' %'; $('#statistic .table-primary.' + device + '.general_pass_rate').text(general_pass_rate); } $('#statistic .table-primary.' + device + '.tested-ops_count').text(tested_op_count); @@ -231,9 +231,9 @@ function calculateColumnStatistics(device) { } }); if (all_operations == 0) { - $('#statistic .table-primary.' + device + '.avg_pass_rate').text('---'); + $('#statistic .table-primary.' + device + '.rel_pass_rate').text('---'); } else { - avg_pass_rate = (sum_pass_rate / all_operations).toFixed(1) + ' %'; - $('#statistic .table-primary.' 
+ device + '.avg_pass_rate').text(avg_pass_rate); + rel_pass_rate = (sum_pass_rate / all_operations).toFixed(2) + ' %'; + $('#statistic .table-primary.' + device + '.rel_pass_rate').text(rel_pass_rate); } } diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/report_template.html b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/report_template.html index edf9c3b0a1bd11..bbaccbe0afdf9b 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/report_template.html +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/template/report_template.html @@ -111,15 +111,9 @@

Operations coverage summary: Tag: {{report_tag}} | Version: {{report_version {% endfor %} - Tested op counter: + AVG passrate (=sum_pass_rates/covered_ops_num): {% for d in devices -%} - {{general_test_count[d]}} - {% endfor %} - - - AVG passrate per op (=sum_pass_rates/covered_ops_num): - {% for d in devices -%} - {{pass_rate_avg[d]}} {% if pass_rate_avg[d] != "NOT RUN" -%}%{% endif -%} + {{pass_rate_avg_rel[d]}} {% if pass_rate_avg_rel[d] != "NOT RUN" -%}%{% endif -%} {% endfor %} @@ -143,7 +137,7 @@

Operations coverage summary: Tag: {{report_tag}} | Version: {{report_version passed="{{ results[d][op].passed }}" failed="{{ results[d][op].failed }}" skipped="{{ results[d][op].skipped }}" crashed="{{ results[d][op].crashed }}" hanged="{{ results[d][op].hanged }}" - value="{% if (results[d][op].passed != '0' or results[d][op].failed != '0' or results[d][op].crashed != '0' or results[d][op].skipped) != '0' -%}{{ results[d][op].passrate }}{% else -%}---{% endif -%}" + value="{% if (results[d][op].passed != '0' or results[d][op].failed != '0' or results[d][op].crashed != '0' or results[d][op].skipped) != '0' -%}{{ results[d][op].relative_passrate }}{% else -%}---{% endif -%}" title="{% if results[d][op].implemented == 'true' -%} {{op}} is implemented in {{d}} plugin {% else -%} @@ -151,7 +145,7 @@

Operations coverage summary: Tag: {{report_tag}} | Version: {{report_version {% endif -%}"> {% if (results[d][op].passed != '0' or results[d][op].failed != '0' or results[d][op].crashed != '0' or results[d][op].skipped != '0' or results[d][op].hanged != '0') -%} - {{ results[d][op].passrate }} %
+ {{ results[d][op].relative_passrate }} %
{% else -%} ---
{% endif -%} @@ -185,4 +179,4 @@

Operations coverage summary: Tag: {{report_tag}} | Version: {{report_version - + \ No newline at end of file diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py index 0ea13121373851..797832dff52bdc 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py @@ -13,6 +13,7 @@ RUN = "[ RUN ]" GTEST_FILTER = "Google Test filter = " DISABLED_PREFIX = "DISABLED_" +REF_COEF = "[ CONFORMANCE ] Influence coefficient: " IS_WIN = "windows" in platform or "win32" in platform diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/stat_update_utils.py b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/stat_update_utils.py index 72f5035f8e9b32..fb1013c582b22f 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/stat_update_utils.py +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/stat_update_utils.py @@ -10,19 +10,26 @@ def update_passrates(results: ET.SubElement): for op in device: passed_tests = 0 total_tests = 0 + rel_passed_tests = 0 + rel_all_tests = 0 for attrib in op.attrib: - if attrib == "passrate": + if attrib == "passrate" or attrib == "relative_passrate": continue if attrib == "implemented": continue - if attrib == "passed": + elif attrib == "passed": passed_tests = int(op.attrib.get(attrib)) + elif attrib == "relative_passed": + rel_passed_tests = float(op.attrib.get(attrib)) + continue + elif attrib == "relative_all": + rel_all_tests = float(op.attrib.get(attrib)) + continue total_tests += int(op.attrib.get(attrib)) - if total_tests == 0: - passrate = 0 - else: - passrate = float(passed_tests * 100 / total_tests) if passed_tests < total_tests else 100 - op.set("passrate", str(round(passrate, 1))) 
+ passrate = float(passed_tests * 100 / total_tests) if total_tests != 0 else 0 + rel_passrate = float(rel_passed_tests * 100 / rel_all_tests) if rel_all_tests != 0 else 0 + op.set("passrate", "%.2f"%passrate) + op.set("relative_passrate", "%.2f"%rel_passrate) def update_conformance_test_counters(results: ET.SubElement): @@ -32,9 +39,12 @@ def update_conformance_test_counters(results: ET.SubElement): for op in device: op_test_count = 0 for attr_name in op.attrib: - if attr_name == "passrate" or attr_name == "implemented": + if attr_name == "passrate" or attr_name == "implemented" or attr_name == "relative_passrate": continue - op_test_count += int(op.attrib.get(attr_name)) + elif "relative_" in attr_name: + op_test_count += float(op.attrib.get(attr_name)) + else: + op_test_count += int(op.attrib.get(attr_name)) if not op.tag in max_test_cnt.keys(): max_test_cnt.update({op.tag: op_test_count}) if op_test_count != max_test_cnt[op.tag]: @@ -46,9 +56,12 @@ def update_conformance_test_counters(results: ET.SubElement): if op.tag in incorrect_ops: test_cnt = 0 for attr_name in op.attrib: - if attr_name == "passrate" or attr_name == "implemented": + if attr_name == "passrate" or attr_name == "implemented" or attr_name == "relative_passrate": continue - test_cnt += int(op.attrib[attr_name]) + elif "relative_" in attr_name: + test_cnt += float(op.attrib[attr_name]) + else: + test_cnt += int(op.attrib[attr_name]) if test_cnt != max_test_cnt[op.tag]: diff = max_test_cnt[op.tag] - test_cnt op.set("skipped", str(int(op.attrib["skipped"]) + diff)) diff --git a/src/tests/ie_test_utils/functional_test_utils/src/summary/api_summary.cpp b/src/tests/ie_test_utils/functional_test_utils/src/summary/api_summary.cpp index 98db6a9e169246..2ef03cdab04ffc 100644 --- a/src/tests/ie_test_utils/functional_test_utils/src/summary/api_summary.cpp +++ b/src/tests/ie_test_utils/functional_test_utils/src/summary/api_summary.cpp @@ -47,7 +47,7 @@ ApiSummary &ApiSummary::getInstance() { return 
*p_instance; } -void ApiSummary::updateStat(ov_entity entity, const std::string& target_device, PassRate::Statuses status) { +void ApiSummary::updateStat(ov_entity entity, const std::string& target_device, PassRate::Statuses status, double rel_influence_coef) { if (apiStats.empty()) { std::string outputFilePath = outputFolder + std::string(CommonTestUtils::FileSeparator) + reportFilename + CommonTestUtils::REPORT_EXTENSION; const bool fileExists = CommonTestUtils::fileExists(outputFilePath); @@ -74,6 +74,7 @@ void ApiSummary::updateStat(ov_entity entity, const std::string& target_device, isHangReported = false; return; } + cur_stat[real_device].rel_all += rel_influence_coef; switch (status) { case PassRate::Statuses::SKIPPED: { cur_stat[real_device].skipped++; @@ -84,6 +85,7 @@ void ApiSummary::updateStat(ov_entity entity, const std::string& target_device, cur_stat[real_device].isImplemented = true; } cur_stat[real_device].passed++; + cur_stat[real_device].rel_passed += rel_influence_coef; break; } case PassRate::Statuses::HANGED: { @@ -129,7 +131,9 @@ void ApiSummary::getStatisticFromReport(const std::string& filePath) { auto s = std::stoi(realDeviceNode.attribute("skipped").value()); auto c = std::stoi(realDeviceNode.attribute("crashed").value()); auto h = std::stoi(realDeviceNode.attribute("hanged").value()); - PassRate entity_stat(p, f, s, c, h); + auto rel_p = std::stoi(realDeviceNode.attribute("relative_passed").value()); + auto rel_all = std::stoi(realDeviceNode.attribute("relative_all").value()); + PassRate entity_stat(p, f, s, c, h, rel_p, rel_all); if (apiStats.find(entity) == apiStats.end()) { apiStats.insert({entity, {}}); } @@ -206,6 +210,9 @@ void ApiSummary::saveReport() { entry.append_attribute("crashed").set_value(static_cast(stat_device.second.crashed)); entry.append_attribute("hanged").set_value(static_cast(stat_device.second.hanged)); entry.append_attribute("passrate").set_value(stat_device.second.getPassrate()); + 
entry.append_attribute("relative_passed").set_value(static_cast(stat_device.second.rel_passed)); + entry.append_attribute("relative_all").set_value(static_cast(stat_device.second.rel_all)); + entry.append_attribute("relative_passrate").set_value(stat_device.second.getRelPassrate()); } } diff --git a/src/tests/ie_test_utils/functional_test_utils/src/summary/op_summary.cpp b/src/tests/ie_test_utils/functional_test_utils/src/summary/op_summary.cpp index f81255be8adcec..3ea86bdff94350 100644 --- a/src/tests/ie_test_utils/functional_test_utils/src/summary/op_summary.cpp +++ b/src/tests/ie_test_utils/functional_test_utils/src/summary/op_summary.cpp @@ -40,7 +40,7 @@ OpSummary &OpSummary::getInstance() { return *p_instance; } -void OpSummary::updateOPsStats(const ov::NodeTypeInfo &op, const PassRate::Statuses &status) { +void OpSummary::updateOPsStats(const ov::NodeTypeInfo &op, const PassRate::Statuses &status, double rel_influence_coef) { auto it = opsStats.find(op); if (opsStats.find(op) == opsStats.end()) { opsStats.insert({op, PassRate()}); @@ -50,6 +50,8 @@ void OpSummary::updateOPsStats(const ov::NodeTypeInfo &op, const PassRate::Statu isCrashReported = false; if (passrate.crashed > 0) passrate.crashed--; + } else { + passrate.rel_all += rel_influence_coef; } if (isHangReported) { isHangReported = false; @@ -61,6 +63,7 @@ void OpSummary::updateOPsStats(const ov::NodeTypeInfo &op, const PassRate::Statu passrate.isImplemented = true; } passrate.passed++; + passrate.rel_passed += rel_influence_coef; break; case PassRate::FAILED: passrate.failed++; @@ -123,13 +126,15 @@ std::map OpSummary::getStatisticFromReport() { auto s = std::stoi(child.attribute("skipped").value()); auto c = std::stoi(child.attribute("crashed").value()); auto h = std::stoi(child.attribute("hanged").value()); - PassRate obj(p, f, s, c, h); + auto rel_passed = std::stoi(child.attribute("rel_passed").value()); + auto rel_all = std::stoi(child.attribute("rel_all").value()); + PassRate obj(p, f, s, c, 
h, rel_passed, rel_all); oldOpsStat.insert({entry, obj}); } return oldOpsStat; } -void OpSummary::updateOPsStats(const std::shared_ptr &model, const PassRate::Statuses &status) { +void OpSummary::updateOPsStats(const std::shared_ptr &model, const PassRate::Statuses &status, double k) { if (model->get_parameters().empty()) { return; } @@ -160,26 +165,26 @@ void OpSummary::updateOPsStats(const std::shared_ptr &model, const Pa } if (extractBody) { if (std::dynamic_pointer_cast(op)) { - updateOPsStats(op->get_type_info(), status); + updateOPsStats(op->get_type_info(), status, k); auto ti = ov::as_type_ptr(op); auto ti_body = ti->get_function(); - updateOPsStats(ti_body, status); + updateOPsStats(ti_body, status, k); } else if (std::dynamic_pointer_cast(op)) { - updateOPsStats(op->get_type_info(), status); + updateOPsStats(op->get_type_info(), status, k); auto loop = ov::as_type_ptr(op); auto loop_body = loop->get_function(); - updateOPsStats(loop_body, status); + updateOPsStats(loop_body, status, k); } else if (std::dynamic_pointer_cast(op)) { - updateOPsStats(op->get_type_info(), status); + updateOPsStats(op->get_type_info(), status, k); auto if_op = ov::as_type_ptr(op); std::vector> bodies; for (size_t i = 0; i < if_op->get_internal_subgraphs_size(); i++) { auto if_body = if_op->get_function(i); - updateOPsStats(if_body, status); + updateOPsStats(if_body, status, k); } } } - updateOPsStats(op->get_type_info(), status); + updateOPsStats(op->get_type_info(), status, k); } } @@ -313,6 +318,9 @@ void OpSummary::saveReport() { entry.append_attribute("crashed").set_value(static_cast(it.second.crashed)); entry.append_attribute("hanged").set_value(static_cast(it.second.hanged)); entry.append_attribute("passrate").set_value(it.second.getPassrate()); + entry.append_attribute("relative_passed").set_value(it.second.rel_passed); + entry.append_attribute("relative_all").set_value(it.second.rel_all); + 
entry.append_attribute("relative_passrate").set_value(it.second.getRelPassrate()); } if (extendReport && fileExists) { @@ -328,6 +336,9 @@ void OpSummary::saveReport() { entry.append_attribute("crashed").set_value(static_cast(item.second.crashed)); entry.append_attribute("hanged").set_value(static_cast(item.second.hanged)); entry.append_attribute("passrate").set_value(item.second.getPassrate()); + entry.append_attribute("relative_passed").set_value(item.second.rel_passed); + entry.append_attribute("relative_all").set_value(item.second.rel_all); + entry.append_attribute("relative_passrate").set_value(item.second.getRelPassrate()); } else { entry = currentDeviceNode.child(item.first.c_str()); auto implStatus = entry.attribute("implemented").value() == std::string("true") ? true : false; @@ -336,7 +347,9 @@ void OpSummary::saveReport() { auto s = std::stoi(entry.attribute("skipped").value()) + item.second.skipped; auto c = std::stoi(entry.attribute("crashed").value()) + item.second.crashed; auto h = std::stoi(entry.attribute("hanged").value()) + item.second.hanged; - PassRate obj(p, f, s, c, h); + auto rel_passed = std::stoi(entry.attribute("relative_passed").value()) + item.second.rel_passed; + auto rel_all = std::stoi(entry.attribute("relative_all").value()) + item.second.rel_all; + PassRate obj(p, f, s, c, h, rel_passed, rel_all); (implStatus || obj.isImplemented) ? 
entry.attribute("implemented").set_value(true) @@ -347,6 +360,9 @@ void OpSummary::saveReport() { entry.attribute("crashed").set_value(static_cast(obj.crashed)); entry.attribute("hanged").set_value(static_cast(obj.hanged)); entry.attribute("passrate").set_value(obj.getPassrate()); + entry.attribute("relative_passed").set_value(item.second.rel_passed); + entry.attribute("relative_all").set_value(item.second.rel_all); + entry.attribute("relative_passrate").set_value(item.second.getRelPassrate()); } } } diff --git a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp index 4cc6d4a31a3ce1..9a72ae31b321dc 100644 --- a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp @@ -250,7 +250,7 @@ std::shared_ptr makeSplit(const ngraph::Output &in, std::shared_ptr makeVariadicSplit(const ngraph::Output &in, const std::vector numSplits, - size_t axis); + int64_t axis); std::shared_ptr makeActivation(const ngraph::Output &in, const element::Type &type, diff --git a/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp b/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp index 49348b914d42aa..c6e7b99644e82a 100644 --- a/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp +++ b/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp @@ -11,9 +11,9 @@ namespace ngraph { namespace builder { std::shared_ptr makeVariadicSplit(const ngraph::Output &in, const std::vector numSplits, - size_t axis) { - auto splitAxisOp = std::make_shared(element::u64, ngraph::Shape{}, - std::vector{axis}); + int64_t axis) { + auto splitAxisOp = std::make_shared(element::i64, ngraph::Shape{}, + std::vector{axis}); auto numSplit = std::make_shared(element::u64, ngraph::Shape{numSplits.size()}, numSplits); auto VariadicSplitNode = 
std::make_shared(in, splitAxisOp, numSplit); diff --git a/tests/layer_tests/common/tflite_layer_test_class.py b/tests/layer_tests/common/tflite_layer_test_class.py index fc459b5028fa7a..d1d89263b3a559 100644 --- a/tests/layer_tests/common/tflite_layer_test_class.py +++ b/tests/layer_tests/common/tflite_layer_test_class.py @@ -7,6 +7,7 @@ from common.layer_test_class import CommonLayerTest from common.utils.tflite_utils import get_tflite_results, get_tensors_from_graph + class TFLiteLayerTest(CommonLayerTest): model_path = None inputs = None diff --git a/tests/layer_tests/common/utils/tflite_utils.py b/tests/layer_tests/common/utils/tflite_utils.py index 3c700c54becee8..7edd77d667cc2f 100644 --- a/tests/layer_tests/common/utils/tflite_utils.py +++ b/tests/layer_tests/common/utils/tflite_utils.py @@ -1,9 +1,60 @@ import os import tensorflow as tf +import numpy as np from common.utils.tf_utils import summarize_graph, transpose_nhwc_to_nchw +def make_positive_array(inputs_dict): + for input in inputs_dict.keys(): + inputs_dict[input] = np.random.randint(1, 10, inputs_dict[input]).astype(np.float32) + return inputs_dict + + +def short_range(inputs_dict): + for input in inputs_dict.keys(): + inputs_dict[input] = np.random.randint(-1, 1, inputs_dict[input]).astype(np.float32) + return inputs_dict + + +def make_boolean_array(inputs_dict): + for input in inputs_dict.keys(): + inputs_dict[input] = np.random.randint(0, 1, inputs_dict[input]) > 1 + return inputs_dict + + +data_generators = { + 'positive': make_positive_array, + 'short_range': short_range, + 'boolean': make_boolean_array, +} + + +def activation_helper(input_node, activation_name, name): + if activation_name is None: + return input_node + else: + return activation_name(input_node, name=name) + + +additional_test_params = [ + [ + {'axis': None}, + {'axis': -1} + ], + [ + {'activation': None}, + {'activation': tf.nn.relu}, + {'activation': tf.nn.relu6}, + # skip tanh and signbit since tflite doesn't fuse such 
activations + # https://github.com/tensorflow/tensorflow/blob/77d8c333405a080c57850c45531dbbf077b2bd0e/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td#L86:L89 + # {'activation': tf.math.tanh}, + # {'activation': lambda x, name: tf.identity(tf.experimental.numpy.signbit(x), name=name)}, + {'activation': lambda x, name: tf.math.minimum(tf.math.maximum(-1., x), 1., name=name)} + ] +] + + def save_pb_to_tflite(pb_model): graph_summary = summarize_graph(pb_model) inputs = [k for k in graph_summary['inputs'].keys()] @@ -67,3 +118,4 @@ def get_tensors_from_graph(graph, ops: list): tensors.append(op_out_tensor) return tensors + diff --git a/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py b/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py index aef0fae1086cc0..55825f35dbe75c 100644 --- a/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py +++ b/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py @@ -8,11 +8,23 @@ import openvino.runtime as ov import pytest import torch +import unittest from openvino.runtime import PartialShape, Dimension, Model, Type from common.mo_convert_test_class import CommonMOConvertTest +class MyTorchOp(torch.autograd.Function): + @staticmethod + def symbolic(g, in_positions): + return g.op("MyTorchOp", in_positions) + + @staticmethod + def forward(self, in_positions): + out_pos = in_positions.reshape(-1) + return out_pos + 0.5 + + def make_pt_model_one_input(): from torch import nn class NeuralNetwork(nn.Module): @@ -735,3 +747,30 @@ def test_mo_import_from_memory(self, create_model, ie_device, precision, ir_vers if mo_params is not None: test_params.update(mo_params) self._test_by_ref_graph(temp_dir, test_params, graph_ref, compare_tensor_names=False) + + +def create_pt_model_with_custom_op(): + # + # Create PyTorch model with custom operation + # + import torch.nn as nn + + class MyModel(nn.Module): + def __init__(self): + super(MyModel, self).__init__() + self.my_op = 
MyTorchOp() + + def forward(self, x): + return self.my_op.apply(x) + + return MyModel() + + +class ConvertONNXFallthroughTest(unittest.TestCase): + def test_onnx_fallthrough(self): + from openvino.tools.mo import convert_model + pytorch_model = create_pt_model_with_custom_op() + + # Check that ONNX conversion passed, so ONNX frontend raises error message of unsupported op. + with self.assertRaisesRegex(RuntimeError, ".*OpenVINO does not support the following ONNX operations: MyTorchOp.*"): + convert_model(pytorch_model, input_shape=[1, 2, 3], use_legacy_frontend=True) diff --git a/tests/layer_tests/pytorch_tests/test_einsum.py b/tests/layer_tests/pytorch_tests/test_einsum.py new file mode 100644 index 00000000000000..37a52540d6852e --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_einsum.py @@ -0,0 +1,103 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from pytorch_layer_test_class import PytorchLayerTest + + +class TestEinsumBatchMatMul(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(5, 2, 3).astype(np.float32), np.random.randn(5, 3, 4).astype(np.float32),) + + def create_model(self): + import torch + + class EinsumModelBatchMatmul(torch.nn.Module): + def forward(self, x, y): + eqn = "bij, bjk -> bik" + return torch.einsum(eqn, x, y) + + ref_net = None + + return EinsumModelBatchMatmul(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + def test_einsum_batch_matmul(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version) + + +class TestEinsumBatchDiagonal(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(3, 5, 5).astype(np.float32),) + + def create_model(self): + import torch + + class EinsumModelBatchDiagonal(torch.nn.Module): + def forward(self, x): + eqn = "kii -> ki" + return torch.einsum(eqn, x) + + ref_net = None + + 
return EinsumModelBatchDiagonal(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.xfail(reason='OpenVINO CPU plugin does not support einsum diagonal') + def test_einsum_batch_diagonal(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version, dynamic_shapes=False) + + +class TestEinsumInnerProd(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(5).astype(np.float32), np.random.randn(5).astype(np.float32)) + + def create_model(self): + import torch + + class EinsumModelInnerProd(torch.nn.Module): + def forward(self, x, y): + eqn = "i,i" + return torch.einsum(eqn, x, y) + + ref_net = None + + return EinsumModelInnerProd(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + def test_einsum_inner_prod(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version) + + +class TestEinsumTranspose(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(3, 5).astype(np.float32),) + + def create_model(self): + import torch + + class EinsumModelTranspose(torch.nn.Module): + def forward(self, x): + eqn = "ij->ji" + return torch.einsum(eqn, x) + + ref_net = None + + return EinsumModelTranspose(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + def test_einsum_transpose(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version) \ No newline at end of file diff --git a/tests/layer_tests/pytorch_tests/test_index.py b/tests/layer_tests/pytorch_tests/test_index.py new file mode 100644 index 00000000000000..967ef4c98afb6e --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_index.py @@ -0,0 +1,73 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import numpy as np + +from pytorch_layer_test_class import 
PytorchLayerTest + + +class TestIndex(PytorchLayerTest): + def _prepare_input(self, input_shape, idx): + import numpy as np + return (np.random.randn(*input_shape).astype(np.float32), idx) + + def create_model(self, model="list"): + import torch + + class aten_index_list(torch.nn.Module): + + def forward(self, x, idx): + return x[idx] + + class aten_index_getitem(torch.nn.Module): + + def forward(self, x, idx): + return x.__getitem__(idx) + + + class aten_index_list_bool(torch.nn.Module): + + def forward(self, x, idx): + return x[idx.to(torch.bool)] + + class aten_index_getitem_bool(torch.nn.Module): + + def forward(self, x, idx): + return x.__getitem__(idx.to(torch.bool)) + cases = { + "list": aten_index_list, + "getitem": aten_index_getitem, + "list_with_bool": aten_index_list_bool, + "getitem_with_bool": aten_index_getitem_bool + } + + aten_index = cases[model] + + ref_net = None + + return aten_index(), ref_net, "aten::index" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("case", ["list", "getitem"]) + @pytest.mark.parametrize(("input_shape", "idx"), [ + ((1,), np.array(0).astype(int)), + ([2, 3], np.array(-1).astype(int)), + ([4, 5, 6], np.array((1, 2)).astype(int)), + ([7, 8, 9], np.array((-1, 2, -3)).astype(int)), + ([2, 2, 3, 4], np.array((1,)).astype(int))]) + def test_index(self, input_shape, idx, case, ie_device, precision, ir_version): + self._test(*self.create_model(case), ie_device, precision, ir_version, kwargs_to_prepare_input={"input_shape": input_shape, "idx": idx}) + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("case", ["getitem_with_bool", "list_with_bool"]) + @pytest.mark.parametrize(("input_shape", "idx"), [ + ((1, 2), np.array([[1, 0]]).astype(bool)), + ((2, 2, 5), np.zeros([2, 2, 5]).astype(bool)), + ((2, 2, 5), np.ones([2, 2, 5]).astype(bool)), + ((2, 2, 5), np.random.rand(2, 2, 5) > 0) + ]) + def test_index_bool(self, input_shape, idx, case, ie_device, precision, ir_version): + 
self._test(*self.create_model(case), ie_device, precision, ir_version, kwargs_to_prepare_input={"input_shape": input_shape, "idx": idx}) \ No newline at end of file diff --git a/tests/layer_tests/pytorch_tests/test_roi_align.py b/tests/layer_tests/pytorch_tests/test_roi_align.py new file mode 100644 index 00000000000000..fb03c51b0914e0 --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_roi_align.py @@ -0,0 +1,58 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import numpy as np +import pytest +import torch + +from pytorch_layer_test_class import PytorchLayerTest +from torchvision.ops import roi_align + + +class TestROIAlign(PytorchLayerTest): + def _prepare_input(self): + return (self.input_tensor, self.boxes) + + def create_model(self, output_size, spatial_scale, sampling_ratio, aligned): + + class torchvision_roi_align(torch.nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio, aligned): + super().__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + def forward(self, input_tensor, rois): + return roi_align( + input_tensor, + rois.to(dtype=input_tensor.dtype), + self.output_size, + self.spatial_scale, + self.sampling_ratio, + self.aligned, + ) + + ref_net = None + + return (torchvision_roi_align(output_size, spatial_scale, sampling_ratio, aligned), + ref_net, "torchvision::roi_align") + + @pytest.mark.parametrize('input_tensor', (np.random.randn(4, 5, 6, 7).astype(np.float32),)) + @pytest.mark.parametrize('boxes', (np.array([[1, 2, 2, 3, 3]]).astype(np.float32), + np.array([[0, 1, 2, 5, 4], + [2, 1, 2, 5, 4], + [3, 1, 2, 5, 4]]).astype(np.float32))) + @pytest.mark.parametrize('output_size', ((4, 5), (3, 2), 3)) + @pytest.mark.parametrize('spatial_scale', (0.5, 1.0)) + @pytest.mark.parametrize('sampling_ratio', (0, 1)) + @pytest.mark.parametrize('aligned', (True, False)) + @pytest.mark.nightly + 
@pytest.mark.precommit + def test_roi_align(self, ie_device, precision, ir_version, input_tensor, boxes, output_size, + spatial_scale, sampling_ratio, aligned): + self.input_tensor = input_tensor + self.boxes = boxes + self._test(*self.create_model(output_size, spatial_scale, sampling_ratio, aligned), + ie_device, precision, ir_version, trace_model=True) diff --git a/tests/layer_tests/pytorch_tests/test_upsample.py b/tests/layer_tests/pytorch_tests/test_upsample.py index a5ea7df4157cd1..d1874f6c0d07b5 100644 --- a/tests/layer_tests/pytorch_tests/test_upsample.py +++ b/tests/layer_tests/pytorch_tests/test_upsample.py @@ -6,10 +6,50 @@ from pytorch_layer_test_class import PytorchLayerTest +class TestUpsample1D(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3, 224).astype(np.float32),) + + def create_model(self, size, scale, mode): + import torch + import torch.nn.functional as F + + class aten_upsample(torch.nn.Module): + def __init__(self, size, scale, mode): + super().__init__() + self.size = size + self.scale = scale + self.mode = mode + + def forward(self, x): + return F.interpolate(x, self.size, scale_factor=self.scale, mode=self.mode) + + ref_net = None + + return aten_upsample(size, scale, mode), ref_net, F"aten::upsample_{mode}1d" + + @pytest.mark.parametrize("mode,size,scale", [ + ('nearest', 300, None), + ('nearest', 200, None), + ('nearest', None, 2.5), + ('nearest', None, 0.75), + ('linear', 300, None), + ('linear', 200, None), + ('linear', None, 2.5,), + ('linear', None, 0.75), + ]) + @pytest.mark.nightly + @pytest.mark.precommit + def test_upsample1d(self, mode, size, scale, ie_device, precision, ir_version): + self._test(*self.create_model(size, scale, mode), ie_device, + precision, ir_version, trace_model=True) + + class TestUpsample2D(PytorchLayerTest): def _prepare_input(self): import numpy as np - return (np.zeros((1, 3, 224, 224)).astype(np.float32),) + return (np.random.randn(1, 3, 200, 
200).astype(np.float32),) def create_model(self, size, scale, mode): import torch @@ -31,25 +71,70 @@ def forward(self, x): @pytest.mark.parametrize("mode,size,scale", [ ('nearest', 300, None), - ('nearest', 200, None), - ('nearest', (128, 480), None), - ('nearest', None, 2.5,), + ('nearest', 150, None), + ('nearest', (300, 400), None), + ('nearest', None, 2.5), ('nearest', None, 0.75), - ('nearest', None, (1.2, 0.8)), + ('nearest', None, (1.5, 2)), ('bilinear', 300, None), - ('bilinear', 200, None), - ('bilinear', (128, 480), None), + ('bilinear', 150, None), + ('bilinear', (400, 480), None), ('bilinear', None, 2.5,), ('bilinear', None, 0.75), - ('bilinear', None, (1.2, 0.8)), + ('bilinear', None, (1.2, 1.3)), ('bicubic', 300, None), - ('bicubic', 200, None), - ('bicubic', (128, 480), None), + ('bicubic', 150, None), + ('bicubic', (400, 480), None), ('bicubic', None, 2.5,), ('bicubic', None, 0.75), - ('bicubic', None, (1.2, 0.8))] - ) + ('bicubic', None, (1.2, 1.3)) + ]) + @pytest.mark.nightly + @pytest.mark.precommit + def test_upsample2d(self, mode, size, scale, ie_device, precision, ir_version): + self._test(*self.create_model(size, scale, mode), ie_device, + precision, ir_version, trace_model=True, **{"custom_eps": 1e-3}) + + +class TestUpsample3D(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3, 100, 100, 100).astype(np.float32),) + + def create_model(self, size, scale, mode): + import torch + import torch.nn.functional as F + + class aten_upsample(torch.nn.Module): + def __init__(self, size, scale, mode): + super().__init__() + self.size = size + self.scale = scale + self.mode = mode + + def forward(self, x): + return F.interpolate(x, self.size, scale_factor=self.scale, mode=self.mode) + + ref_net = None + + return aten_upsample(size, scale, mode), ref_net, F"aten::upsample_{mode}3d" + + @pytest.mark.parametrize("mode,size,scale", [ + ('nearest', 200, None), + ('nearest', 150, None), + ('nearest', (150, 200, 
250), None), + ('nearest', None, 2.5), + ('nearest', None, 0.75), + ('nearest', None, (1.5, 2, 2.5)), + ('trilinear', 200, None), + ('trilinear', 150, None), + ('trilinear', (200, 240, 210), None), + ('trilinear', None, 2.5,), + ('trilinear', None, 0.75), + ('trilinear', None, (1.2, 1.1, 1.5)), + ]) @pytest.mark.nightly @pytest.mark.precommit - def test_upsample(self, mode, size, scale, ie_device, precision, ir_version): - self._test(*self.create_model(size, scale, mode), ie_device, precision, ir_version, trace_model=True) + def test_upsample3d(self, mode, size, scale, ie_device, precision, ir_version): + self._test(*self.create_model(size, scale, mode), ie_device, + precision, ir_version, trace_model=True, **{"custom_eps": 1e-3}) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Binary.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Binary.py new file mode 100644 index 00000000000000..22b9be44017af0 --- /dev/null +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Binary.py @@ -0,0 +1,62 @@ +import itertools + +import pytest +import tensorflow as tf + +from common.tflite_layer_test_class import TFLiteLayerTest +from tensorflow_lite_tests.test_tfl_Unary import data_generators + +test_ops = [ + {'op_name': 'EQUAL', 'op_func': tf.math.equal}, + {'op_name': 'FLOOR_MOD', 'op_func': tf.math.floormod}, + {'op_name': 'GREATER', 'op_func': tf.math.greater}, + {'op_name': 'GREATER_EQUAL', 'op_func': tf.math.greater_equal}, + {'op_name': 'LESS', 'op_func': tf.math.less}, + {'op_name': 'LESS_EQUAL', 'op_func': tf.math.less_equal}, + {'op_name': 'LOGICAL_AND', 'op_func': tf.math.logical_and, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'LOGICAL_OR', 'op_func': tf.math.logical_or, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'MAXIMUM', 'op_func': tf.math.maximum}, + {'op_name': 'MINIMUM', 'op_func': tf.math.minimum}, + {'op_name': 'NOT_EQUAL', 'op_func': tf.math.not_equal}, + {'op_name': 'POW', 'op_func': 
tf.math.pow, 'kwargs_to_prepare_input': 'positive'}, + {'op_name': 'SQUARED_DIFFERENCE', 'op_func': tf.math.squared_difference}, +] + +test_params = [ + {'shape': [2, 10, 10, 3]}, + {'shape': [2, 10]} +] + +test_data = list(itertools.product(test_ops, test_params)) +for i, (parameters, shapes) in enumerate(test_data): + parameters.update(shapes) + test_data[i] = parameters.copy() + + +class TestTFLiteBinaryLayerTest(TFLiteLayerTest): + inputs = ["Input_0", "Input_1"] + outputs = ["BinaryOperation"] + + def _prepare_input(self, inputs_dict, generator=None): + if generator is None: + return super()._prepare_input(inputs_dict) + return data_generators[generator](inputs_dict) + + def make_model(self, params): + assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape'})) == 3, \ + 'Unexpected parameters for test: ' + ','.join(params.keys()) + self.allowed_ops = [params['op_name']] + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + place_holder0 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryLayerTest.inputs[0]) + place_holder1 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryLayerTest.inputs[1]) + params['op_func'](place_holder0, place_holder1, name=TestTFLiteBinaryLayerTest.outputs[0]) + net = sess.graph_def + return net + + @pytest.mark.parametrize("params", test_data) + @pytest.mark.nightly + def test_binary(self, params, ie_device, precision, temp_dir): + self._test(ie_device, precision, temp_dir, params) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_BinaryWithActivation.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_BinaryWithActivation.py new file mode 100644 index 00000000000000..e48fd1501910db --- /dev/null +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_BinaryWithActivation.py @@ -0,0 +1,63 @@ +import itertools + +import tensorflow as tf +import pytest + +from 
common.tflite_layer_test_class import TFLiteLayerTest +from common.utils.tflite_utils import data_generators, additional_test_params, activation_helper + +test_ops = [ + {'op_name': 'ADD', 'op_func': tf.math.add}, + {'op_name': 'DIV', 'op_func': tf.math.divide, 'kwargs_to_prepare_input': 'positive'}, + {'op_name': 'MUL', 'op_func': tf.math.multiply}, + {'op_name': 'SUB', 'op_func': tf.math.subtract}, +] + +test_params = [ + {'shape': [2, 10, 10, 3]}, + {'shape': [2, 10]} +] + + +test_data = list(itertools.product(test_ops, test_params)) +for i, (parameters, shapes) in enumerate(test_data): + parameters.update(shapes) + test_data[i] = parameters.copy() + +test_data = list(itertools.product(test_data, additional_test_params[1])) +for i, (parameters, additional_test_params[1]) in enumerate(test_data): + parameters.update(additional_test_params[1]) + test_data[i] = parameters.copy() + + +class TestTFLiteBinaryWithActivationLayerTest(TFLiteLayerTest): + inputs = ["Input_0", "Input_1"] + outputs = ["BinaryOperation"] + + def _prepare_input(self, inputs_dict, generator=None): + if generator is None: + return super()._prepare_input(inputs_dict) + return data_generators[generator](inputs_dict) + + def make_model(self, params): + assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape', 'activation'})) == 4, \ + 'Unexpected parameters for test: ' + ','.join(params.keys()) + self.allowed_ops = [params['op_name']] + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + in0 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryWithActivationLayerTest.inputs[0]) + in1 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryWithActivationLayerTest.inputs[1]) + bin_op_name = TestTFLiteBinaryWithActivationLayerTest.outputs[0] if not params['activation'] else \ + TestTFLiteBinaryWithActivationLayerTest.outputs[0] + "/op" + op = params['op_func'](in0, in1, 
name=bin_op_name) + op = activation_helper(op, params['activation'], TestTFLiteBinaryWithActivationLayerTest.outputs[0]) + + net = sess.graph_def + return net + + @pytest.mark.parametrize("params", test_data) + @pytest.mark.nightly + def test_binary(self, params, ie_device, precision, temp_dir): + self._test(ie_device, precision, temp_dir, params) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Reduce.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Reduce.py new file mode 100644 index 00000000000000..71488370596907 --- /dev/null +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Reduce.py @@ -0,0 +1,61 @@ +import itertools + +import pytest +import tensorflow as tf + +from common.tflite_layer_test_class import TFLiteLayerTest +from common.utils.tflite_utils import data_generators, additional_test_params + +test_ops = [ + {'op_name': 'MEAN', 'op_func': tf.math.reduce_mean}, + {'op_name': 'REDUCE_ALL', 'op_func': tf.math.reduce_all, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'REDUCE_ANY', 'op_func': tf.math.reduce_any, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'REDUCE_MAX', 'op_func': tf.math.reduce_max}, + {'op_name': 'REDUCE_MIN', 'op_func': tf.math.reduce_min}, + {'op_name': 'REDUCE_PROD', 'op_func': tf.math.reduce_prod, 'kwargs_to_prepare_input': 'short_range'}, + {'op_name': 'SUM', 'op_func': tf.math.reduce_sum}, +] + +test_params = [ + {'shape': [2, 10, 10, 3]}, + {'shape': [2, 10]} +] + + +test_data = list(itertools.product(test_ops, test_params)) +for i, (parameters, shapes) in enumerate(test_data): + parameters.update(shapes) + test_data[i] = parameters.copy() + + +test_data = list(itertools.product(test_data, additional_test_params[0])) +for i, (parameters, additional_test_params[0]) in enumerate(test_data): + parameters.update(additional_test_params[0]) + test_data[i] = parameters.copy() + + +class TestTFLiteReduceLayerTest(TFLiteLayerTest): + inputs = ["Input"] + outputs = 
["ReduceOperation"] + + def _prepare_input(self, inputs_dict, generator=None): + if generator is None: + return super()._prepare_input(inputs_dict) + return data_generators[generator](inputs_dict) + + def make_model(self, params): + assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape', 'axis'})) == 4, \ + 'Unexpected parameters for test: ' + ','.join(params.keys()) + self.allowed_ops = [params['op_name']] + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + place_holder = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteReduceLayerTest.inputs[0]) + params['op_func'](place_holder, axis=params['axis'], name=TestTFLiteReduceLayerTest.outputs[0]) + net = sess.graph_def + return net + + @pytest.mark.parametrize("params", test_data) + @pytest.mark.nightly + def test_reduce(self, params, ie_device, precision, temp_dir): + self._test(ie_device, precision, temp_dir, params) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py index 0c7e9f849e198b..b782202d112b79 100644 --- a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py @@ -7,27 +7,11 @@ import pytest import tensorflow as tf from common.tflite_layer_test_class import TFLiteLayerTest +from common.utils.tflite_utils import data_generators np.random.seed(42) -def make_positive_array(inputs_dict): - for input in inputs_dict.keys(): - inputs_dict[input] = np.random.randint(1, 10, inputs_dict[input]).astype(np.float32) - return inputs_dict - - -def make_boolean_array(inputs_dict): - for input in inputs_dict.keys(): - inputs_dict[input] = np.random.randint(0, 1, inputs_dict[input]) > 1 - return inputs_dict - - -data_generators = { - 'positive': make_positive_array, - 'boolean': make_boolean_array, -} - test_ops = [ {'op_name': 'ABS', 'op_func': tf.math.abs}, {'op_name': 'CAST', 'op_func': 
partial(tf.cast, dtype=tf.int32)}, @@ -82,14 +66,14 @@ def _prepare_input(self, inputs_dict, generator=None): return super()._prepare_input(inputs_dict) return data_generators[generator](inputs_dict) - def make_model(self, params): assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape'})) == 3, \ 'Unexpected parameters for test: ' + ','.join(params.keys()) self.allowed_ops = [params['op_name']] tf.compat.v1.reset_default_graph() with tf.compat.v1.Session() as sess: - place_holder = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], name=TestTFLiteUnaryLayerTest.inputs[0]) + place_holder = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteUnaryLayerTest.inputs[0]) params['op_func'](place_holder, name=TestTFLiteUnaryLayerTest.outputs[0]) net = sess.graph_def return net diff --git a/tests/layer_tests/tensorflow_tests/test_tf_CTCGreedyDecoder.py b/tests/layer_tests/tensorflow_tests/test_tf_CTCGreedyDecoder.py index ff8af22cca18ce..580202eefd862b 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_CTCGreedyDecoder.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_CTCGreedyDecoder.py @@ -62,6 +62,8 @@ def create_ctcgreedydecoder_placeholder_const_net(self, input_shape, merge_repea @pytest.mark.nightly def test_ctcgreedydecoder_placeholder_const(self, params, merge_repeated, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104860') self._test(*self.create_ctcgreedydecoder_placeholder_const_net(**params, ir_version=ir_version, use_new_frontend=use_new_frontend, merge_repeated=merge_repeated), ie_device, precision, ir_version, temp_dir=temp_dir, diff --git a/tests/layer_tests/tensorflow_tests/test_tf_Conv2D.py b/tests/layer_tests/tensorflow_tests/test_tf_Conv2D.py index ec3308d2b22a07..9e2dfdb585c697 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_Conv2D.py +++ 
b/tests/layer_tests/tensorflow_tests/test_tf_Conv2D.py @@ -83,6 +83,8 @@ def create_conv2d_placeholder_const_net(self, input_shape, input_filter, input_s @pytest.mark.nightly def test_conv2d_placeholder_const(self, params, padding, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104862') self._test(*self.create_conv2d_placeholder_const_net(**params, input_padding=padding, ir_version=ir_version, use_new_frontend=use_new_frontend), ie_device, precision, ir_version, input_padding=padding, temp_dir=temp_dir, diff --git a/tests/layer_tests/tensorflow_tests/test_tf_DynamicPartition.py b/tests/layer_tests/tensorflow_tests/test_tf_DynamicPartition.py index c47eafbcb7c1fb..06f85d3948185a 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_DynamicPartition.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_DynamicPartition.py @@ -46,6 +46,8 @@ def create_dynamic_partition_net(self, data_shape, partitions_shape, num_partiti @pytest.mark.nightly def test_dynamic_partition_basic(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') if not use_new_frontend: pytest.skip("DynamicPartition operation is not supported via legacy frontend.") self._test(*self.create_dynamic_partition_net(**params), @@ -61,6 +63,8 @@ def test_dynamic_partition_basic(self, params, ie_device, precision, ir_version, @pytest.mark.nightly def test_dynamic_partition_other_types(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') if not use_new_frontend: pytest.skip("DynamicPartition operation is not supported via legacy frontend.") self._test(*self.create_dynamic_partition_net(**params), diff --git a/tests/layer_tests/tensorflow_tests/test_tf_FakeQuantWithMinMaxVars.py b/tests/layer_tests/tensorflow_tests/test_tf_FakeQuantWithMinMaxVars.py new file mode 100644 index 
00000000000000..73a5dcb832ae1d --- /dev/null +++ b/tests/layer_tests/tensorflow_tests/test_tf_FakeQuantWithMinMaxVars.py @@ -0,0 +1,60 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import tensorflow as tf +from common.tf_layer_test_class import CommonTFLayerTest + + +class TestFakeQuantWithMinMaxVars(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + # generate elements so that the input tensor may contain repeating elements + assert 'inputs' in inputs_info, "Test error: inputs_info must contain `input`" + inputs_shape = inputs_info['inputs'] + inputs_data = {} + inputs_data['inputs'] = np.random.randint(-10, 10, inputs_shape).astype(np.float32) + return inputs_data + + def create_fake_quant_with_min_max_vars_net(self, inputs_shape, min_value, max_value, num_bits, narrow_range, + fake_quant_op): + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + inputs = tf.compat.v1.placeholder(tf.float32, inputs_shape, 'inputs') + min = tf.constant(min_value, dtype=tf.float32) + max = tf.constant(max_value, dtype=tf.float32) + fake_quant_op(inputs=inputs, min=min, max=max, num_bits=num_bits, + narrow_range=narrow_range) + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + return tf_net, None + + test_basic = [ + # test FakeQuantWithMinMaxVars + dict(inputs_shape=[2, 6, 4], min_value=-3, max_value=4, num_bits=None, narrow_range=None, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + dict(inputs_shape=[3, 2, 1, 5], min_value=-4, max_value=5, num_bits=14, narrow_range=True, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + dict(inputs_shape=[3, 2, 4], min_value=2, max_value=4, num_bits=10, narrow_range=False, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + dict(inputs_shape=[1, 2, 3], min_value=-6, max_value=-3, num_bits=8, narrow_range=True, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + + # test 
FakeQuantWithMinMaxVarsPerChannel + pytest.param(dict(inputs_shape=[2, 6, 4], min_value=[-4, -3, -5, -8], max_value=[4, 7, 9, 5], num_bits=None, + narrow_range=None, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVarsPerChannel), + marks=pytest.mark.xfail(reason="104822")) + + ] + + @pytest.mark.parametrize("params", test_basic) + @pytest.mark.precommit_tf_fe + @pytest.mark.nightly + def test_fake_quant_with_min_max_vars_basic(self, params, ie_device, precision, ir_version, temp_dir, + use_new_frontend, + use_old_api): + self._test(*self.create_fake_quant_with_min_max_vars_net(**params), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_new_frontend=use_new_frontend, use_old_api=use_old_api) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_Identity.py b/tests/layer_tests/tensorflow_tests/test_tf_Identity.py index 24fd85c30dd6a1..2e18d134d22fb4 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_Identity.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_Identity.py @@ -2,86 +2,37 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from common.layer_test_class import check_ir_version +import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest -from common.utils.tf_utils import permute_nchw_to_nhwc - -from unit_tests.utils.graph import build_graph class TestIdentity(CommonTFLayerTest): - def create_identity_net(self, shape, ir_version, use_new_frontend): - """ - Tensorflow net IR net - - Input->Identity->ReLU => Input->ReLU - - """ - - import tensorflow as tf - + def create_identity_net(self, input_shape, identity_op): tf.compat.v1.reset_default_graph() # Create the graph and model with tf.compat.v1.Session() as sess: - tf_x_shape = shape.copy() - - tf_x_shape = permute_nchw_to_nhwc(tf_x_shape, use_new_frontend) - - x = tf.compat.v1.placeholder(tf.float32, tf_x_shape, 'Input') - id = tf.identity(x, name="Operation") - tf.nn.relu(id, name='Operation') + input = tf.compat.v1.placeholder(tf.float32, input_shape, 'input') + 
relu = tf.raw_ops.Relu(features=input) + identity_op(input=relu, name="identity") tf.compat.v1.global_variables_initializer() tf_net = sess.graph_def - # - # Create reference IR net - # Please, specify 'type': 'Input' for input node - # Moreover, do not forget to validate ALL layer attributes!!! - # - - ref_net = None - - if check_ir_version(10, None, ir_version) and not use_new_frontend: - nodes_attributes = { - 'inputX': {'kind': 'op', 'type': 'Parameter'}, - 'inputX_data': {'shape': shape, 'kind': 'data'}, - 'ReLU': {'kind': 'op', 'type': 'ReLU'}, - 'ReLU_data': {'shape': shape, 'kind': 'data'}, - 'result': {'kind': 'op', 'type': 'Result'} - } - ref_net = build_graph(nodes_attributes, - [('inputX', 'inputX_data'), - ('inputX_data', 'ReLU'), - ('ReLU', 'ReLU_data'), - ('ReLU_data', 'result') - ]) - - return tf_net, ref_net - - test_data_precommit = [dict(shape=[1, 3, 50, 100, 224])] - - @pytest.mark.parametrize("params", test_data_precommit) - @pytest.mark.precommit - def test_identity_precommit(self, params, ie_device, precision, ir_version, temp_dir, - use_new_frontend, use_old_api): - self._test(*self.create_identity_net(**params, ir_version=ir_version, - use_new_frontend=use_new_frontend), - ie_device, precision, ir_version, temp_dir=temp_dir, - use_new_frontend=use_new_frontend, use_old_api=use_old_api) + return tf_net, None - test_data = [dict(shape=[1]), - pytest.param(dict(shape=[1, 224]), marks=pytest.mark.precommit_tf_fe), - dict(shape=[1, 3, 224]), - dict(shape=[1, 3, 100, 224]), - dict(shape=[1, 3, 50, 100, 224])] + test_data_basic = [ + dict(input_shape=[2], identity_op=tf.raw_ops.Identity), + dict(input_shape=[2, 3], identity_op=tf.raw_ops.PreventGradient), + dict(input_shape=[], identity_op=tf.raw_ops.Snapshot), + dict(input_shape=[1, 2, 3], identity_op=tf.raw_ops.StopGradient) + ] - @pytest.mark.parametrize("params", test_data) + @pytest.mark.parametrize("params", test_data_basic) + @pytest.mark.precommit_tf_fe @pytest.mark.nightly - def 
test_identity(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, - use_old_api): - self._test(*self.create_identity_net(**params, ir_version=ir_version, - use_new_frontend=use_new_frontend), + def test_identity_basic(self, params, ie_device, precision, ir_version, temp_dir, + use_new_frontend, use_old_api): + self._test(*self.create_identity_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_new_frontend=use_new_frontend, use_old_api=use_old_api) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_If.py b/tests/layer_tests/tensorflow_tests/test_tf_If.py index 8832d735697f38..6884e19b9a8b82 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_If.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_If.py @@ -62,6 +62,8 @@ def else_branch(): @pytest.mark.nightly def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_new_frontend=use_new_frontend, use_old_api=use_old_api) @@ -123,6 +125,8 @@ def else_branch(): @pytest.mark.nightly def test_if_basic(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') self._test(*self.create_if_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_new_frontend=use_new_frontend, use_old_api=use_old_api) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_IsFinite.py b/tests/layer_tests/tensorflow_tests/test_tf_IsFinite.py index 7b32f502305f90..802200b0e3d9c4 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_IsFinite.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_IsFinite.py @@ -41,6 +41,8 @@ def create_is_finite_net(self, x_shape, x_type): @pytest.mark.nightly def test_is_finite_basic(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, 
use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') if not use_new_frontend: pytest.skip("IsFinite operation is not supported via legacy frontend.") self._test(*self.create_is_finite_net(**params), diff --git a/tests/layer_tests/tensorflow_tests/test_tf_IsInf.py b/tests/layer_tests/tensorflow_tests/test_tf_IsInf.py index 1d1f1e531b09b6..f2814a5d7aceba 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_IsInf.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_IsInf.py @@ -39,6 +39,8 @@ def create_is_inf_net(self, x_shape, x_type): @pytest.mark.nightly def test_is_inf_basic(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') if not use_new_frontend: pytest.skip("IsInf operation is not supported via legacy frontend.") self._test(*self.create_is_inf_net(**params), diff --git a/tests/layer_tests/tensorflow_tests/test_tf_IsNan.py b/tests/layer_tests/tensorflow_tests/test_tf_IsNan.py index 4eb337d147c85d..b0a44b781d7d73 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_IsNan.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_IsNan.py @@ -41,6 +41,8 @@ def create_is_nan_net(self, x_shape, x_type): @pytest.mark.nightly def test_is_nan_basic(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104855') if not use_new_frontend: pytest.skip("IsNan operation is not supported via legacy frontend.") self._test(*self.create_is_nan_net(**params), diff --git a/tests/layer_tests/tensorflow_tests/test_tf_L2Loss.py b/tests/layer_tests/tensorflow_tests/test_tf_L2Loss.py index 29261d4dfd978c..9011f2ee9c3e6d 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_L2Loss.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_L2Loss.py @@ -29,6 +29,8 @@ def create_l2_loss_net(self, input_shape): @pytest.mark.nightly def test_l2_loss_basic(self, params, ie_device, precision, ir_version, temp_dir, 
use_new_frontend, use_old_api): + if ie_device == 'GPU': + pytest.xfail('104863') if not use_new_frontend: pytest.skip("L2Loss is not supported by legacy FE.") self._test(*self.create_l2_loss_net(**params), diff --git a/tests/layer_tests/tensorflow_tests/test_tf_NestedWhile.py b/tests/layer_tests/tensorflow_tests/test_tf_NestedWhile.py new file mode 100644 index 00000000000000..07da61a4681e7b --- /dev/null +++ b/tests/layer_tests/tensorflow_tests/test_tf_NestedWhile.py @@ -0,0 +1,83 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from common.tf_layer_test_class import CommonTFLayerTest + + +class TestNestedWhile(CommonTFLayerTest): + def create_simple_while(self): + import tensorflow as tf + + g = tf.Graph() + with g.as_default(): + x = tf.compat.v1.placeholder(tf.float32, shape=(3, 2)) + v = tf.constant([1, 2, 3], dtype=tf.int32, shape=[3]) + i = tf.constant([0], dtype=tf.int32, shape=[1]) + a_combined = tf.zeros([1, 2], dtype=tf.float32) + b_combined = tf.zeros([1, 2], dtype=tf.float32) + + def body(x_arg, v_arg, i_arg, a_combined_arg, b_combined_arg): + x_slice = tf.slice(x_arg, [0, 0], [1, x_arg.shape[1]]) + i_arg = tf.add(i_arg, 1) + a_combined_arg = tf.add(a_combined_arg, x_slice) + return x_arg, v_arg, i_arg, a_combined_arg, b_combined_arg + + while_condition = lambda x, v, i, a_combined, b_combined: i < v.shape[0] + + tf.while_loop(while_condition, body, [x, v, i, a_combined, b_combined], + name="while_node") + + return g, None + + def create_nested_while(self): + import tensorflow as tf + + g = tf.Graph() + with g.as_default(): + x = tf.compat.v1.placeholder(tf.float32, shape=(3, 2)) + v = tf.constant([1, 2, 3], dtype=tf.int32, shape=[3]) + i = tf.constant([0], dtype=tf.int32, shape=[1]) + a_combined = tf.zeros([1, 2], dtype=tf.float32) + b_combined = tf.zeros([1, 2], dtype=tf.float32) + + def body(x_arg, v_arg, i_arg, a_combined_arg, b_combined_arg): + x_slice = tf.slice(x_arg, [0, 0], [1, 
x_arg.shape[1]]) + v_slice = tf.slice(v_arg, [0], [1]) + j = tf.constant([0], dtype=tf.int32, shape=[1]) + + def body_supp(x_slice_arg, v_slice_arg, j_arg, b_combined_arg_arg): + j_arg = tf.add(j_arg, 1) + b_combined_arg_arg = tf.add(b_combined_arg_arg, x_slice_arg) + return x_slice_arg, v_slice_arg, j_arg, b_combined_arg_arg + + while_condition_supp = lambda x_slice, v_slice, j, b_combined: tf.less(j, v_slice) + + x_slice, v_slice, j, b_combined_arg = tf.while_loop(while_condition_supp, body_supp, + [x_slice, v_slice, j, b_combined_arg]) + + i_arg = tf.add(i_arg, 1) + + a_combined_arg = tf.add(a_combined_arg, x_slice) + return x_arg, v_arg, i_arg, a_combined_arg, b_combined_arg + + while_condition = lambda x, v, i, a_combined, b_combined: i < v.shape[0] + + tf.while_loop(while_condition, body, [x, v, i, a_combined, b_combined], + name="while_node") + + return g, None + + @pytest.mark.nightly + def test_simple_while(self, ie_device, precision, ir_version, temp_dir, use_new_frontend, + use_old_api): + self._test(*self.create_simple_while(), ie_device, precision, ir_version, temp_dir=temp_dir, + use_new_frontend=use_new_frontend, use_old_api=use_old_api) + + @pytest.mark.precommit_tf_fe + @pytest.mark.nightly + def test_nested_while(self, ie_device, precision, ir_version, temp_dir, use_new_frontend, + use_old_api): + self._test(*self.create_nested_while(), ie_device, precision, ir_version, temp_dir=temp_dir, + use_new_frontend=use_new_frontend, use_old_api=use_old_api) diff --git a/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py b/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py index a86409b9a48204..29939d7d988bed 100644 --- a/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py +++ b/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py @@ -131,6 +131,7 @@ def convert_pytorch_to_onnx(model, input_shape, opset_version, example_inputs, o torch.onnx.export(model, inputs, model_onnx, + 
operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH, **additional_params) return model_onnx diff --git a/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py b/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py index 08effa43a4c5c6..d1c0115bf0f711 100644 --- a/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py +++ b/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py @@ -309,3 +309,41 @@ def test_conversion_model_oneshot_iterator_use_legacy_frontend(self): def test_conversion_model_oneshot_iterator_default(self): self.basic("model_oneshot_iterator.pbtxt", None, None, None, None, None, None, True, True, False, False) + + @generate( + *[ + ( + "in2{f32}->[0.0 0.0 0.0 0.0]", + {"in1": np.array([[1.0, 2.0], [3.0, 4.0]])}, + np.array([[1.0, 2.0], [3.0, 4.0]]), + np.float32, + ), + ( + "in2->[1.0 15.0 15.5 1.0]", + {"in1": np.array([[2.0, 4.0], [12.0, 8.0]])}, + np.array([[3.0, 19.0], [27.5, 9.0]]), + np.float32, + ), + ], + ) + def test_conversion_model_with_non_standard_extension(self, input_freezing_value, inputs, expected, + dtype): + self.basic("model_fp32.frozen", input_freezing_value, inputs, dtype, expected, only_conversion=False, + input_model_is_text=False, use_new_frontend=True, + use_legacy_frontend=False) + + def test_conversion_fake_model(self): + with self.assertRaisesRegex(Exception, + "Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."): + self.basic("fake.pb", None, None, None, None, + only_conversion=True, input_model_is_text=False, use_new_frontend=True, + use_legacy_frontend=False) + + def test_conversion_dir_model(self): + with self.assertRaisesRegex(Exception, + "Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."): + self.basic(".", None, None, None, None, + only_conversion=True, input_model_is_text=False, use_new_frontend=True, + use_legacy_frontend=False) diff --git 
a/tools/mo/unit_tests/moc_tf_fe/test_models/fake.pb b/tools/mo/unit_tests/moc_tf_fe/test_models/fake.pb new file mode 100644 index 00000000000000..ae05864994afaf --- /dev/null +++ b/tools/mo/unit_tests/moc_tf_fe/test_models/fake.pb @@ -0,0 +1,2 @@ +dcfsdcdsdcs +cscscsc \ No newline at end of file diff --git a/tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen b/tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen new file mode 100644 index 00000000000000..3343e4106f837c --- /dev/null +++ b/tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a33c91148b5e72ca03608c7d2ee18229ee4b610344dadd6896efeb6ac7b93e0 +size 141