diff --git a/cmake/developer_package/api_validator/api_validator.cmake b/cmake/developer_package/api_validator/api_validator.cmake index 778b78dfc5ec52..8c0057dcfbdf1d 100644 --- a/cmake/developer_package/api_validator/api_validator.cmake +++ b/cmake/developer_package/api_validator/api_validator.cmake @@ -21,34 +21,34 @@ endif() function(_ie_add_api_validator_post_build_step_recursive) cmake_parse_arguments(API_VALIDATOR "" "TARGET" "" ${ARGN}) - list(APPEND API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGET}) - set(API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGETS} PARENT_SCOPE) - - get_target_property(IS_IMPORTED ${API_VALIDATOR_TARGET} IMPORTED) - if(IS_IMPORTED) - return() + get_target_property(LIBRARY_TYPE ${API_VALIDATOR_TARGET} TYPE) + if(LIBRARY_TYPE MATCHES "^(SHARED_LIBRARY|MODULE_LIBRARY|EXECUTABLE)$" AND + NOT ${API_VALIDATOR_TARGET} IN_LIST API_VALIDATOR_TARGETS) + list(APPEND API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGET}) endif() - get_target_property(LIBRARY_TYPE ${API_VALIDATOR_TARGET} TYPE) - if(LIBRARY_TYPE STREQUAL "EXECUTABLE" OR LIBRARY_TYPE STREQUAL "SHARED_LIBRARY") + if(NOT LIBRARY_TYPE STREQUAL "INTERFACE_LIBRARY") get_target_property(LINKED_LIBRARIES ${API_VALIDATOR_TARGET} LINK_LIBRARIES) - if(LINKED_LIBRARIES) - foreach(ITEM IN LISTS LINKED_LIBRARIES) - if(NOT TARGET ${ITEM}) - continue() - endif() - get_target_property(LIBRARY_TYPE_DEPENDENCY ${ITEM} TYPE) - if(LIBRARY_TYPE_DEPENDENCY STREQUAL "SHARED_LIBRARY") - _ie_add_api_validator_post_build_step_recursive(TARGET ${ITEM}) - endif() - endforeach() - endif() + else() + set(LINKED_LIBRARIES) endif() + get_target_property(INTERFACE_LINKED_LIBRARIES ${API_VALIDATOR_TARGET} INTERFACE_LINK_LIBRARIES) + + foreach(library IN LISTS LINKED_LIBRARIES INTERFACE_LINKED_LIBRARIES) + if(TARGET "${library}") + get_target_property(orig_library ${library} ALIASED_TARGET) + if(TARGET "${orig_library}") + _ie_add_api_validator_post_build_step_recursive(TARGET ${orig_library}) + else() + _ie_add_api_validator_post_build_step_recursive(TARGET ${library}) + endif() + endif() + endforeach() set(API_VALIDATOR_TARGETS ${API_VALIDATOR_TARGETS} PARENT_SCOPE) endfunction() -set(VALIDATED_LIBRARIES "" CACHE INTERNAL "") +set(VALIDATED_TARGETS "" CACHE INTERNAL "") function(_ov_add_api_validator_post_build_step) set(UWP_API_VALIDATOR_APIS "${PROGRAMFILES}/Windows Kits/10/build/universalDDIs/x64/UniversalDDIs.xml") @@ -58,7 +58,7 @@ function(_ov_add_api_validator_post_build_step) return() endif() - cmake_parse_arguments(API_VALIDATOR "" "TARGET" "EXTRA" "" ${ARGN}) + cmake_parse_arguments(API_VALIDATOR "" "TARGET" "EXTRA" ${ARGN}) if(NOT API_VALIDATOR_TARGET) message(FATAL_ERROR "RunApiValidator requires TARGET to validate!") @@ -71,13 +71,16 @@ function(_ov_add_api_validator_post_build_step) # collect targets _ie_add_api_validator_post_build_step_recursive(TARGET ${API_VALIDATOR_TARGET}) - if (API_VALIDATOR_EXTRA) + if(API_VALIDATOR_EXTRA) foreach(target IN LISTS API_VALIDATOR_EXTRA) _ie_add_api_validator_post_build_step_recursive(TARGET ${target}) endforeach() endif() - list(REMOVE_DUPLICATES API_VALIDATOR_TARGETS) + # remove targets which were tested before + foreach(item IN LISTS VALIDATED_TARGETS) + list(REMOVE_ITEM API_VALIDATOR_TARGETS ${item}) + endforeach() if(NOT API_VALIDATOR_TARGETS) return() @@ -93,8 +96,10 @@ function(_ov_add_api_validator_post_build_step) get_filename_component(target_name "${target_location}" NAME_WE) elseif(TARGET "${orig_target}") set(target_name ${orig_target}) + set(target_location $) else() set(target_name 
${target}) + set(target_location $) endif() endmacro() @@ -109,7 +114,7 @@ function(_ov_add_api_validator_post_build_step) add_custom_command(TARGET ${API_VALIDATOR_TARGET} POST_BUILD COMMAND ${CMAKE_COMMAND} --config $ -D UWP_API_VALIDATOR=${UWP_API_VALIDATOR} - -D UWP_API_VALIDATOR_TARGET=$ + -D UWP_API_VALIDATOR_TARGET=${target_location} -D UWP_API_VALIDATOR_APIS=${UWP_API_VALIDATOR_APIS} -D UWP_API_VALIDATOR_EXCLUSION=${UWP_API_VALIDATOR_EXCLUSION} -D UWP_API_VALIDATOR_OUTPUT=${output_file} @@ -122,8 +127,8 @@ function(_ov_add_api_validator_post_build_step) # update list of validated libraries - list(APPEND VALIDATED_LIBRARIES ${API_VALIDATOR_TARGETS}) - set(VALIDATED_LIBRARIES "${VALIDATED_LIBRARIES}" CACHE INTERNAL "" FORCE) + list(APPEND VALIDATED_TARGETS ${API_VALIDATOR_TARGETS}) + set(VALIDATED_TARGETS "${VALIDATED_TARGETS}" CACHE INTERNAL "" FORCE) endfunction() # diff --git a/cmake/developer_package/frontends/frontends.cmake b/cmake/developer_package/frontends/frontends.cmake index ad78058f6a0747..84a9eea0735988 100644 --- a/cmake/developer_package/frontends/frontends.cmake +++ b/cmake/developer_package/frontends/frontends.cmake @@ -182,7 +182,7 @@ macro(ov_add_frontend) add_library(openvino::frontend::${OV_FRONTEND_NAME} ALIAS ${TARGET_NAME}) endif() - # Shutdown protobuf when unloading the front dynamic library + # Shutdown protobuf when unloading the frontend dynamic library if(proto_files AND BUILD_SHARED_LIBS) target_link_libraries(${TARGET_NAME} PRIVATE ov_protobuf_shutdown) endif() @@ -217,8 +217,6 @@ macro(ov_add_frontend) ie_add_vs_version_file(NAME ${TARGET_NAME} FILEDESCRIPTION ${OV_FRONTEND_FILEDESCRIPTION}) - ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) - target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime) target_link_libraries(${TARGET_NAME} PRIVATE ${OV_FRONTEND_LINK_LIBRARIES}) ov_add_library_version(${TARGET_NAME}) @@ -259,6 +257,11 @@ macro(ov_add_frontend) add_dependencies(ov_frontends ${TARGET_NAME}) + # must be called after all target_link_libraries + ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + + # installation + if(NOT OV_FRONTEND_SKIP_INSTALL) if(BUILD_SHARED_LIBS) # Note: diff --git a/docs/OV_Runtime_UG/Operations_specifications.md b/docs/OV_Runtime_UG/Operations_specifications.md index 107cfa9a4060df..15331ac28160d9 100644 --- a/docs/OV_Runtime_UG/Operations_specifications.md +++ b/docs/OV_Runtime_UG/Operations_specifications.md @@ -202,6 +202,7 @@ Tile-1 TopK-1 TopK-3 + TopK-11 Transpose-1 Unique-10 Unsqueeze-1 diff --git a/docs/OV_Runtime_UG/auto_device_selection.md b/docs/OV_Runtime_UG/auto_device_selection.md index a2eb7aa0758e8b..4567100e9d50f2 100644 --- a/docs/OV_Runtime_UG/auto_device_selection.md +++ b/docs/OV_Runtime_UG/auto_device_selection.md @@ -8,11 +8,15 @@ Debugging Auto-Device Plugin -@endsphinxdirective This article introduces how Automatic Device Selection works and how to use it for inference. -## How AUTO Works + +.. _how-auto-works: + + +How AUTO Works +#################### The Automatic Device Selection mode, or AUTO for short, uses a "virtual" or a "proxy" device, which does not bind to a specific type of hardware, but rather selects the processing unit for inference automatically. @@ -21,13 +25,14 @@ This way, you can write the application once and deploy it anywhere. The selection also depends on your performance requirements, defined by the “hints” configuration API, as well as device priority list limitations, if you choose to exclude some hardware from the process. 
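For orientation only, a minimal C++ sketch of this flow might look like the following (the model path, the device list, and the latency hint are illustrative assumptions, not part of this patch):

```cpp
#include <openvino/openvino.hpp>

#include <memory>

int main() {
    ov::Core core;

    // Read a model from disk (the path is illustrative).
    std::shared_ptr<ov::Model> model = core.read_model("model.xml");

    // Hand device selection to AUTO: GPU is preferred over CPU here,
    // and the performance hint steers the choice described below.
    ov::CompiledModel compiled = core.compile_model(
        model,
        "AUTO:GPU,CPU",
        ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));

    ov::InferRequest request = compiled.create_infer_request();
    return 0;
}
```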
-The logic behind the choice is as follows: -1. Check what supported devices are available. -2. Check precisions of the input model (for detailed information on precisions read more on the `ov::device::capabilities`) -3. Select the highest-priority device capable of supporting the given model, as listed in the table below. -4. If model’s precision is FP32 but there is no device capable of supporting it, offload the model to a device supporting FP16. +The logic behind the choice is as follows: + +1. Check what supported devices are available. +2. Check precisions of the input model (for detailed information on precisions read more on the ``ov::device::capabilities``). +3. Select the highest-priority device capable of supporting the given model, as listed in the table below. +4. If model’s precision is FP32 but there is no device capable of supporting it, offload the model to a device supporting FP16. + -@sphinxdirective +----------+------------------------------------------------------+-------------------------------------+ | Device || Supported || Supported | | Priority || Device || model precision | @@ -41,135 +46,140 @@ The logic behind the choice is as follows: | 3 || Intel® CPU | FP32, FP16, INT8, BIN | | || (e.g. Intel® Core™ i7-1165G7) | | +----------+------------------------------------------------------+-------------------------------------+ -@endsphinxdirective -To put it simply, when loading the model to the first device on the list fails, AUTO will try to load it to the next device in line, until one of them succeeds. -What is important, **AUTO starts inference with the CPU of the system by default**, as it provides very low latency and can start inference with no additional delays. + +To put it simply, when loading the model to the first device on the list fails, AUTO will try to load it to the next device in line, until one of them succeeds. +What is important, **AUTO starts inference with the CPU of the system by default**, as it provides very low latency and can start inference with no additional delays. While the CPU is performing inference, AUTO continues to load the model to the device best suited for the purpose and transfers the task to it when ready. This way, the devices which are much slower in compiling models, GPU being the best example, do not impede inference at its initial stages. For example, if you use a CPU and a GPU, the first-inference latency of AUTO will be better than that of using GPU alone. -Note that if you choose to exclude CPU from the priority list or disable the initial CPU acceleration feature via `ov::intel_auto::enable_startup_fallback`, it will be unable to support the initial model compilation stage. - -![](../img/autoplugin_accelerate.svg) +Note that if you choose to exclude CPU from the priority list or disable the initial CPU acceleration feature via ``ov::intel_auto::enable_startup_fallback``, it will be unable to support the initial model compilation stage. -This mechanism can be easily observed in the [Using AUTO with Benchmark app sample](#using-auto-with-openvino-samples-and-benchmark-app) section, showing how the first-inference latency (the time it takes to compile the model and perform the first inference) is reduced when using AUTO. For example: -```sh -benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d GPU -niter 128 -``` +.. 
image:: _static/images/autoplugin_accelerate.svg -```sh -benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d AUTO -niter 128 -``` +This mechanism can be easily observed in the :ref:`Using AUTO with Benchmark app sample ` section, showing how the first-inference latency (the time it takes to compile the model and perform the first inference) is reduced when using AUTO. For example: -@sphinxdirective -.. note:: - The longer the process runs, the closer realtime performance will be to that of the best-suited device. -@endsphinxdirective +.. code-block: sh -## Using AUTO + benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d GPU -niter 128 -Following the OpenVINO™ naming convention, the Automatic Device Selection mode is assigned the label of “AUTO.” It may be defined with no additional parameters, resulting in defaults being used, or configured further with the following setup options: -@sphinxdirective +.. code-block: sh -+---------------------------------------------+----------------------------------------------------------------------+ -| | Property | | Values and Description | -+=============================================+======================================================================+ -| | | | **Values**: | -| | | | empty | -| | | | `AUTO` | -| | | | `AUTO: ` (comma-separated, no spaces) | -| | | | | -| | | | Lists the devices available for selection. | -| | | | The device sequence will be taken as priority from high to low. | -| | | | If not specified, `AUTO` will be used as default, | -| | | | and all devices will be "viewed" as candidates. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::device::priorities` | | **Values**: | -| | | | `` (comma-separated, no spaces) | -| | | | | -| | | | Specifies the devices for AUTO to select. | -| | | | The device sequence will be taken as priority from high to low. | -| | | | This configuration is optional. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::hint::performance_mode` | | **Values**: | -| | | | `ov::hint::PerformanceMode::LATENCY` | -| | | | `ov::hint::PerformanceMode::THROUGHPUT` | -| | | | `ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT` | -| | | | | -| | | | Specifies the performance option preferred by the application. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::hint::model_priority` | | **Values**: | -| | | | `ov::hint::Priority::HIGH` | -| | | | `ov::hint::Priority::MEDIUM` | -| | | | `ov::hint::Priority::LOW` | -| | | | | -| | | | Indicates the priority for a model. | -| | | | IMPORTANT: This property is not fully supported yet. | -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::execution_devices` | | Lists the runtime target devices on which the inferences are being | -| | | | executed. | -| | | | Examples of returning results could be `(CPU)`(`(CPU)` is a | -| | | | temporary device, indicating that CPU is used for acceleration at | -| | | | the model compilation stage), `CPU`, `GPU`, `CPU GPU`, `GPU.0`, | -| | | | etc. 
| -+---------------------------------------------+----------------------------------------------------------------------+ -| | `ov::intel_auto::enable_startup_fallback` | | **Values**: | -| | | | `true` | -| | | | `false` | -| | | | | -| | | | Enables/disables CPU as acceleration (or the helper device) in the | -| | | | beginning. The default value is `true`, indicating that CPU is used| -| | | | as acceleration by default. | -+---------------------------------------------+----------------------------------------------------------------------+ + benchmark_app -m ../public/alexnet/FP32/alexnet.xml -d AUTO -niter 128 -@endsphinxdirective + + +.. note:: + + The longer the process runs, the closer realtime performance will be to that of the best-suited device. + + +Using AUTO +#################### + +Following the OpenVINO™ naming convention, the Automatic Device Selection mode is assigned the label of "AUTO". It may be defined with no additional parameters, resulting in defaults being used, or configured further with the following setup options: + + ++-----------------------------------------------+----------------------------------------------------------------------+ +| | Property | | Values and Description | ++===============================================+======================================================================+ +| | | | **Values**: | +| | | | empty | +| | | | ``AUTO`` | +| | | | ``AUTO: `` (comma-separated, no spaces) | +| | | | | +| | | | Lists the devices available for selection. | +| | | | The device sequence will be taken as priority from high to low. | +| | | | If not specified, ``AUTO`` will be used as default, | +| | | | and all devices will be "viewed" as candidates. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::device::priorities`` | | **Values**: | +| | | | ```` (comma-separated, no spaces) | +| | | | | +| | | | Specifies the devices for AUTO to select. | +| | | | The device sequence will be taken as priority from high to low. | +| | | | This configuration is optional. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::hint::performance_mode`` | | **Values**: | +| | | | ``ov::hint::PerformanceMode::LATENCY`` | +| | | | ``ov::hint::PerformanceMode::THROUGHPUT`` | +| | | | ``ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT`` | +| | | | | +| | | | Specifies the performance option preferred by the application. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::hint::model_priority`` | | **Values**: | +| | | | ``ov::hint::Priority::HIGH`` | +| | | | ``ov::hint::Priority::MEDIUM`` | +| | | | ``ov::hint::Priority::LOW`` | +| | | | | +| | | | Indicates the priority for a model. | +| | | | IMPORTANT: This property is not fully supported yet. | ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::execution_devices`` | | Lists the runtime target devices on which the inferences are being | +| | | | executed. | +| | | | Examples of returning results could be ``(CPU)``(``CPU`` is a | +| | | | temporary device, indicating that CPU is used for acceleration at | +| | | | the model compilation stage), ``CPU``, ``GPU``, ``CPU GPU``, | +| | | | ``GPU.0``, etc. 
| ++-----------------------------------------------+----------------------------------------------------------------------+ +| | ``ov::intel_auto::enable_startup_fallback`` | | **Values**: | +| | | | ``true`` | +| | | | ``false`` | +| | | | | +| | | | Enables/disables CPU as acceleration (or the helper device) in the | +| | | | beginning. The default value is ``true``, indicating that CPU is | +| | | | used as acceleration by default. | ++-----------------------------------------------+----------------------------------------------------------------------+ Inference with AUTO is configured similarly to when device plugins are used: you compile the model on the plugin with configuration and execute inference. -### Device Candidates and Priority -The device candidate list enables you to customize the priority and limit the choice of devices available to AUTO. -- If is not specified, AUTO assumes all the devices present in the system can be used. -- If `AUTO` without any device names is specified, AUTO assumes all the devices present in the system can be used, and will load the network to all devices and run inference based on their default priorities, from high to low. -To specify the priority of devices, enter the device names in the priority order (from high to low) in `AUTO: `, or use the `ov::device::priorities` property. +Device Candidates and Priority +++++++++++++++++++++++++++++++ -See the following code for using AUTO and specifying devices: -@sphinxdirective +The device candidate list enables you to customize the priority and limit the choice of devices available to AUTO. + +* If is not specified, AUTO assumes all the devices present in the system can be used. +* If ``AUTO`` without any device names is specified, AUTO assumes all the devices present in the system can be used, and will load the network to all devices and run inference based on their default priorities, from high to low. + +To specify the priority of devices, enter the device names in the priority order (from high to low) in ``AUTO: ``, or use the ``ov::device::priorities`` property. + +See the following code for using AUTO and specifying devices: + .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO0.cpp - :language: cpp - :fragment: [part0] + .. doxygensnippet:: docs/snippets/AUTO0.cpp + :language: cpp + :fragment: [part0] .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part0] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part0] -@endsphinxdirective -Note that OpenVINO Runtime lets you use “GPU” as an alias for “GPU.0” in function calls. More details on enumerating devices can be found in [Working with devices](supported_plugins/Device_Plugins.md). +Note that OpenVINO Runtime lets you use "GPU" as an alias for "GPU.0" in function calls. More details on enumerating devices can be found in :doc:`Working with devices `. -#### Checking Available Devices +Checking Available Devices +-------------------------- -To check what devices are present in the system, you can use Device API, as listed below. For information on how to use it, see [Query device properties and configuration](supported_plugins/config_properties.md). +To check what devices are present in the system, you can use Device API, as listed below. For information on how to use it, see :doc:`Query device properties and configuration `. -@sphinxdirective -.. tab:: C++ +.. tab:: C++ .. 
code-block:: sh - ov::runtime::Core::get_available_devices() + ov::runtime::Core::get_available_devices() See the Hello Query Device C++ Sample for reference. @@ -181,19 +191,18 @@ To check what devices are present in the system, you can use Device API, as list See the Hello Query Device Python Sample for reference. -@endsphinxdirective -#### Excluding Devices from Device Candidate List +Excluding Devices from Device Candidate List +-------------------------------------------- -You can also exclude hardware devices from AUTO, for example, to reserve CPU for other jobs. AUTO will not use the device for inference then. To do that, add a minus sign (-) before CPU in `AUTO: `, as in the following example: +You can also exclude hardware devices from AUTO, for example, to reserve CPU for other jobs. AUTO will not use the device for inference then. To do that, add a minus sign ``(-)`` before CPU in ``AUTO: ``, as in the following example: -@sphinxdirective .. tab:: C++ .. code-block:: sh - ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:-CPU"); + ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:-CPU"); .. tab:: Python @@ -201,144 +210,156 @@ You can also exclude hardware devices from AUTO, for example, to reserve CPU for compiled_model = core.compile_model(model=model, device_name="AUTO:-CPU") -@endsphinxdirective -AUTO will then query all available devices and remove CPU from the candidate list. +AUTO will then query all available devices and remove CPU from the candidate list. -Note that if you choose to exclude CPU from device candidate list, CPU will not be able to support the initial model compilation stage. See more information in [How AUTO Works](#how-auto-works). +Note that if you choose to exclude CPU from device candidate list, CPU will not be able to support the initial model compilation stage. See more information in :ref:`How AUTO Works `. -### Checking Target Runtime Devices -To query the runtime target devices on which the inferences are being executed using AUTO, you can use the `ov::execution_devices` property. It must be used with `get_property`, for example: +Performance Hints for AUTO +++++++++++++++++++++++++++ -@sphinxdirective +The ``ov::hint::performance_mode`` property enables you to specify a performance option for AUTO to be more efficient for particular use cases. The default hint for AUTO is ``LATENCY``. -.. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO7.cpp - :language: cpp - :fragment: [part7] +LATENCY +-------------------- -.. tab:: Python +This option prioritizes low latency, providing short response time for each inference job. It performs best for tasks where inference is required for a single input image, e.g. a medical analysis of an ultrasound scan image. It also fits the tasks of real-time or nearly real-time applications, such as an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part7] +.. note:: -@endsphinxdirective + If no performance hint is set explicitly, AUTO will set LATENCY for devices that have not set ``ov::device::properties``, for example, ``ov::device::properties(, ov::hint::performance_mode(ov::hint::LATENCY))``. -### Performance Hints for AUTO -The `ov::hint::performance_mode` property enables you to specify a performance option for AUTO to be more efficient for particular use cases. The default hint for AUTO is `LATENCY`. 
-#### LATENCY -This option prioritizes low latency, providing short response time for each inference job. It performs best for tasks where inference is required for a single input image, e.g. a medical analysis of an ultrasound scan image. It also fits the tasks of real-time or nearly real-time applications, such as an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. +.. _cumulative throughput: -> **NOTE**: If no performance hint is set explicitly, AUTO will set LATENCY for devices that have not set `ov::device::properties`, for example, `ov::device::properties(, ov::hint::performance_mode(ov::hint::LATENCY))`. -@sphinxdirective +THROUGHPUT +-------------------- -.. _cumulative throughput: +This option prioritizes high throughput, balancing between latency and power. It is best suited for tasks involving multiple jobs, such as inference of video feeds or large numbers of images. -@endsphinxdirective -#### THROUGHPUT -This option prioritizes high throughput, balancing between latency and power. It is best suited for tasks involving multiple jobs, such as inference of video feeds or large numbers of images. +CUMULATIVE_THROUGHPUT +--------------------- -#### CUMULATIVE_THROUGHPUT -While `LATENCY` and `THROUGHPUT` can select one target device with your preferred performance option, the `CUMULATIVE_THROUGHPUT` option enables running inference on multiple devices for higher throughput. With `CUMULATIVE_THROUGHPUT`, AUTO loads the network model to all available devices in the candidate list, and then runs inference on them based on the default or specified priority. +While ``LATENCY`` and ``THROUGHPUT`` can select one target device with your preferred performance option, the ``CUMULATIVE_THROUGHPUT`` option enables running inference on multiple devices for higher throughput. With ``CUMULATIVE_THROUGHPUT``, AUTO loads the network model to all available devices in the candidate list, and then runs inference on them based on the default or specified priority. -CUMULATIVE_THROUGHPUT has similar behavior as [the Multi-Device execution mode (MULTI)](./multi_device.md). The only difference is that CUMULATIVE_THROUGHPUT uses the devices specified by AUTO, which means that it's not mandatory to add devices manually, while with MULTI, you need to specify the devices before inference. +CUMULATIVE_THROUGHPUT has similar behavior as :doc:`the Multi-Device execution mode (MULTI) `. The only difference is that CUMULATIVE_THROUGHPUT uses the devices specified by AUTO, which means that it's not mandatory to add devices manually, while with MULTI, you need to specify the devices before inference. With the CUMULATIVE_THROUGHPUT option: -- If `AUTO` without any device names is specified, and the system has more than two GPU devices, AUTO will remove CPU from the device candidate list to keep GPU running at full capacity. -- If device priority is specified, AUTO will run inference requests on devices based on the priority. In the following example, AUTO will always try to use GPU first, and then use CPU if GPU is busy: - ```sh - ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:GPU,CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT)); - ``` -#### Code Examples +* If ``AUTO`` without any device names is specified, and the system has more than two GPU devices, AUTO will remove CPU from the device candidate list to keep GPU running at full capacity. 
+* If device priority is specified, AUTO will run inference requests on devices based on the priority. In the following example, AUTO will always try to use GPU first, and then use CPU if GPU is busy: + + .. code-block: sh + + ov::CompiledModel compiled_model = core.compile_model(model, "AUTO:GPU,CPU", ov::hint::performance_mode(ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT)); + + +Code Examples +-------------------- + +To enable performance hints for your application, use the following code: -To enable performance hints for your application, use the following code: -@sphinxdirective .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO3.cpp - :language: cpp - :fragment: [part3] - + .. doxygensnippet:: docs/snippets/AUTO3.cpp + :language: cpp + :fragment: [part3] + .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part3] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part3] -@endsphinxdirective -#### Disabling Auto-Batching for THROUGHPUT and CUMULATIVE_THROUGHPUT +Disabling Auto-Batching for THROUGHPUT and CUMULATIVE_THROUGHPUT +---------------------------------------------------------------- -The `ov::hint::PerformanceMode::THROUGHPUT` mode and the `ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT` mode will trigger Auto-Batching (for example, for the GPU device) by default. You can disable it by setting `ov::hint::allow_auto_batching(false)`, or change the default timeout value to a large number, e.g. `ov::auto_batch_timeout(1000)`. See [Automatic Batching](./automatic_batching.md) for more details. +The ``ov::hint::PerformanceMode::THROUGHPUT`` mode and the ``ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT`` mode will trigger Auto-Batching (for example, for the GPU device) by default. You can disable it by setting ``ov::hint::allow_auto_batching(false)``, or change the default timeout value to a large number, e.g. ``ov::auto_batch_timeout(1000)``. See :doc:`Automatic Batching ` for more details. -### Configuring Model Priority -The `ov::hint::model_priority` property enables you to control the priorities of models in the Auto-Device plugin. A high-priority model will be loaded to a supported high-priority device. A lower-priority model will not be loaded to a device that is occupied by a higher-priority model. +Configuring Model Priority +++++++++++++++++++++++++++ + +The ``ov::hint::model_priority`` property enables you to control the priorities of models in the Auto-Device plugin. A high-priority model will be loaded to a supported high-priority device. A lower-priority model will not be loaded to a device that is occupied by a higher-priority model. -@sphinxdirective .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO4.cpp - :language: cpp - :fragment: [part4] - + .. doxygensnippet:: docs/snippets/AUTO4.cpp + :language: cpp + :fragment: [part4] + .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part4] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part4] -@endsphinxdirective -## Configuring Individual Devices and Creating the Auto-Device plugin on Top +Checking Target Runtime Devices ++++++++++++++++++++++++++++++++ -Although the methods described above are currently the preferred way to execute inference with AUTO, the following steps can be also used as an alternative. It is currently available as a legacy feature and used if AUTO is uncapable of utilizing the Performance Hints option. 
+To query the runtime target devices on which the inferences are being executed using AUTO, you can use the ``ov::execution_devices`` property. It must be used with ``get_property``, for example: -@sphinxdirective +.. tab:: C++ + + .. doxygensnippet:: docs/snippets/AUTO7.cpp + :language: cpp + :fragment: [part7] + +.. tab:: Python + + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part7] + + +Configuring Individual Devices and Creating the Auto-Device plugin on Top +######################################################################### + +Although the methods described above are currently the preferred way to execute inference with AUTO, the following steps can be also used as an alternative. It is currently available as a legacy feature and used if AUTO is incapable of utilizing the Performance Hints option. .. tab:: C++ - .. doxygensnippet:: docs/snippets/AUTO5.cpp - :language: cpp - :fragment: [part5] - + .. doxygensnippet:: docs/snippets/AUTO5.cpp + :language: cpp + :fragment: [part5] + .. tab:: Python - .. doxygensnippet:: docs/snippets/ov_auto.py - :language: python - :fragment: [part5] + .. doxygensnippet:: docs/snippets/ov_auto.py + :language: python + :fragment: [part5] -@endsphinxdirective -## Using AUTO with OpenVINO Samples and Benchmark app +.. _using-auto-with-openvino-samples-and-benchmark-app: + +Using AUTO with OpenVINO Samples and Benchmark app +################################################## To see how the Auto-Device plugin is used in practice and test its performance, take a look at OpenVINO™ samples. All samples supporting the "-d" command-line option (which stands for "device") will accept the plugin out-of-the-box. The Benchmark Application will be a perfect place to start – it presents the optimal performance of the plugin without the need for additional settings, like the number of requests or CPU threads. To evaluate the AUTO performance, you can use the following commands: For unlimited device choice: -```sh -benchmark_app –d AUTO –m -i -niter 1000 -``` +.. code-block:sh + + benchmark_app –d AUTO –m -i -niter 1000 For limited device choice: -```sh -benchmark_app –d AUTO:CPU,GPU,GNA –m -i -niter 1000 -``` +.. code-block:sh -For more information, refer to the [C++](../../samples/cpp/benchmark_app/README.md) or [Python](../../tools/benchmark_tool/README.md) version instructions. + benchmark_app –d AUTO:CPU,GPU,GNA –m -i -niter 1000 + +For more information, refer to the :doc:`C++ ` or :doc:`Python ` version instructions. -@sphinxdirective .. note:: The default CPU stream is 1 if using “-d AUTO”. @@ -346,11 +367,13 @@ For more information, refer to the [C++](../../samples/cpp/benchmark_app/README. You can use the FP16 IR to work with auto-device. No demos are yet fully optimized for AUTO, by means of selecting the most suitable device, using the GPU streams/throttling, and so on. 
-@endsphinxdirective -## Additional Resources -- [Debugging AUTO](AutoPlugin_Debugging.md) -- [Running on Multiple Devices Simultaneously](./multi_device.md) -- [Supported Devices](supported_plugins/Supported_Devices.md) +Additional Resources +#################### +- :doc:`Debugging AUTO ` +- :doc:`Running on Multiple Devices Simultaneously ` +- :doc:`Supported Devices ` + +@endsphinxdirective diff --git a/docs/_static/download/OV_2023_models_supported.pdf b/docs/_static/download/OV_2023_models_supported.pdf new file mode 100644 index 00000000000000..a226075e08e4c0 Binary files /dev/null and b/docs/_static/download/OV_2023_models_supported.pdf differ diff --git a/docs/img/autoplugin_accelerate.svg b/docs/_static/images/autoplugin_accelerate.svg similarity index 100% rename from docs/img/autoplugin_accelerate.svg rename to docs/_static/images/autoplugin_accelerate.svg diff --git a/docs/_static/images/sample-graph-image.png b/docs/_static/images/sample-graph-image.png deleted file mode 100644 index 97477897623537..00000000000000 --- a/docs/_static/images/sample-graph-image.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64e64059e7416353cfd2ad836a36c12071804addf4fb165f0cf5150aa7658fa4 -size 123996 diff --git a/docs/_static/js/graphs.js b/docs/_static/js/graphs.js index e09dc8072c803d..9680031d99566d 100644 --- a/docs/_static/js/graphs.js +++ b/docs/_static/js/graphs.js @@ -310,7 +310,7 @@ class Graph { $(document).ready(function () { - $('#build-graphs-btn').on('click', showModal); + $('.ov-toolkit-benchmark-results').on('click', showModal); function clickBuildGraphs(graph, networkModels, ietype, platforms, kpis, precisions) { renderData(graph, networkModels, ietype, platforms, kpis, precisions); diff --git a/docs/benchmarks/performance_benchmarks.md b/docs/benchmarks/performance_benchmarks.md index 4fe5b4bf2e84ad..6c00b7df02f1ee 100644 --- a/docs/benchmarks/performance_benchmarks.md +++ b/docs/benchmarks/performance_benchmarks.md @@ -21,7 +21,6 @@ Benchmarks are available for: * [Intel® Distribution of OpenVINO™ toolkit](performance_benchmarks_openvino.md). - You can also test performance for your system yourself, following the guide on [getting performance numbers](../MO_DG/prepare_model/Getting_performance_numbers.md). Performance of a particular application can also be evaluated virtually using [Intel® DevCloud for the Edge](https://devcloud.intel.com/edge/). It is a remote development environment with access to Intel® hardware and the latest versions of the Intel® Distribution of the OpenVINO™ Toolkit. To learn more about it, visit [the website](https://www.intel.com/content/www/us/en/developer/tools/devcloud/edge/overview.html) or [create an account](https://www.intel.com/content/www/us/en/forms/idz/devcloud-registration.html?tgt=https://www.intel.com/content/www/us/en/secure/forms/devcloud-enrollment/account-provisioning.html). diff --git a/docs/benchmarks/performance_benchmarks_openvino.md b/docs/benchmarks/performance_benchmarks_openvino.md index 8fc6f80ffb6890..639f1c38a8dd64 100644 --- a/docs/benchmarks/performance_benchmarks_openvino.md +++ b/docs/benchmarks/performance_benchmarks_openvino.md @@ -9,89 +9,76 @@ openvino_docs_performance_int8_vs_fp32 Performance Data Spreadsheet (download xlsx) -@endsphinxdirective - Click the "Benchmark Graphs" button to see the OpenVINO™ benchmark graphs. Select the models, the hardware platforms (CPU SKUs), precision and performance index from the lists and click the “Build Graphs” button. 
-@sphinxdirective +.. button-link:: # + :class: ov-toolkit-benchmark-results + :color: primary + :outline: + + :material-regular:`bar_chart;1.4em` Benchmark Graphs -.. raw:: html +Measuring inference performance involves many variables and is extremely use-case and application dependent. +Below are four parameters for measurements, which are key elements to consider for a successful deep learning inference application: -
-   [removed raw HTML block: modal dialog "Build benchmark graphs to your specifications"]
-@endsphinxdirective +.. tab:: :material-regular:`keyboard_double_arrow_right;1.4em` Throughput -Measuring inference performance involves many variables and is extremely use-case and application dependent. -Below are four parameters for measurements, which are key elements to consider for a successful deep learning inference application: + Measures the number of inferences delivered within a latency threshold (for example, number of Frames Per Second - FPS). When deploying a system with deep learning inference, select the throughput that delivers the best trade-off between latency and power for the price and performance that meets your requirements. -@sphinxdirective +.. tab:: :material-regular:`attach_money;1.4em` Value + + While throughput is important, what is more critical in edge AI deployments is the performance efficiency or performance-per-cost. Application performance in throughput per dollar of system cost is the best measure of value. The value KPI is calculated as “Throughput measured as inferences per second / price of inference engine”. This means for a 2 socket system 2x the price of a CPU is used. Prices are as per date of benchmarking and sources can be found as links in the Hardware Platforms (PDF) description below. + +.. tab:: :material-regular:`flash_on;1.4em` Efficiency + + System power is a key consideration from the edge to the data center. When selecting deep learning solutions, power efficiency (throughput/watt) is a critical factor to consider. Intel designs provide excellent power efficiency for running deep learning workloads. The efficiency KPI is calculated as “Throughput measured as inferences per second / TDP of inference engine”. This means for a 2 socket system 2x the power dissipation (TDP) of a CPU is used. TDP-values are as per date of benchmarking and sources can be found as links in the Hardware Platforms (PDF) description below. + +.. tab:: :material-regular:`hourglass_empty;1.4em` Latency + + This measures the synchronous execution of inference requests and is reported in milliseconds. Each inference request (for example: preprocess, infer, postprocess) is allowed to complete before the next is started. This performance metric is relevant in usage scenarios where a single image input needs to be acted upon as soon as possible. An example would be the healthcare sector where medical personnel only request analysis of a single ultra sound scanning image or in real-time or near real-time applications for example an industrial robot's response to actions in its environment or obstacle avoidance for autonomous vehicles. + + + +Platform & Configurations +#################################### + +For a listing of all platforms and configurations used for testing, refer to the following: + +.. button-link:: _static/benchmarks_files/platform_list_22.3.pdf + :color: primary + :outline: + + :material-regular:`download;1.5em` Click for Hardware Platforms [PDF] + +.. button-link:: _static/benchmarks_files/OV-2022.3-system-info-detailed.xlsx + :color: primary + :outline: + + :material-regular:`download;1.5em` Click for Configuration Details [XLSX] -.. raw:: html - -
-   [removed raw HTML block: Throughput / Value / Efficiency / Latency tab panels and the "Platform & Configurations" section with download links; the same content is re-added above as sphinx tabs and button-links]
- - - - - -@endsphinxdirective This benchmark setup includes a single machine on which both the benchmark application and the OpenVINO™ installation reside. The presented performance benchmark numbers are based on the release 2022.3 of the Intel® Distribution of OpenVINO™ toolkit. The benchmark application loads the OpenVINO™ Runtime and executes inferences on the specified hardware (CPU, GPU or GNA). -It measures the time spent on actual inferencing (excluding any pre or post processing) and then reports on the inferences per second (or Frames Per Second). +It measures the time spent on actual inference (excluding any pre or post processing) and then reports on the inferences per second (or Frames Per Second). -## Disclaimers +Disclaimers +#################################### Intel® Distribution of OpenVINO™ toolkit performance benchmark numbers are based on release 2022.3. Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Learn more at intel.com, or from the OEM or retailer. Performance results are based on testing as of December 13, 2022 and may not reflect all publicly available updates. See configuration disclosure for details. No product can be absolutely secure. -Performance varies by use, configuration and other factors. Learn more at [www.intel.com/PerformanceIndex](https://www.intel.com/PerformanceIndex). +Performance varies by use, configuration and other factors. Learn more at :ref:`www.intel.com/PerformanceIndex`. Your costs and results may vary. Intel optimizations, for Intel compilers or other products, may not optimize to the same degree for non-Intel products. -© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. \ No newline at end of file +© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. + +@endsphinxdirective \ No newline at end of file diff --git a/docs/benchmarks/performance_int8_vs_fp32.md b/docs/benchmarks/performance_int8_vs_fp32.md index 6e163daa310093..7faed00e38e8e1 100644 --- a/docs/benchmarks/performance_int8_vs_fp32.md +++ b/docs/benchmarks/performance_int8_vs_fp32.md @@ -1,4 +1,4 @@ -# Model Accuracy and Performance for INT8 and FP32 {#openvino_docs_performance_int8_vs_fp32} +# Model Accuracy {#openvino_docs_performance_int8_vs_fp32} The following table presents the absolute accuracy drop calculated as the accuracy difference between FP32 and INT8 representations of a model on two platforms diff --git a/docs/install_guides/installing-openvino-from-archive-linux.md b/docs/install_guides/installing-openvino-from-archive-linux.md index 3450584e01ff29..abb899a116d227 100644 --- a/docs/install_guides/installing-openvino-from-archive-linux.md +++ b/docs/install_guides/installing-openvino-from-archive-linux.md @@ -10,23 +10,54 @@ See the [Release Notes](https://software.intel.com/en-us/articles/OpenVINO-RelNo @sphinxdirective + .. tab:: System Requirements - | Full requirement listing is available in: - | `System Requirements Page `_ + | Full requirement listing is available in: + | `System Requirements Page `_ .. tab:: Processor Notes Processor graphics are not included in all processors. See `Product Specifications`_ for information about your processor. - + .. 
_Product Specifications: https://ark.intel.com/ .. tab:: Software * `CMake 3.13 or higher, 64-bit `_ - * GCC 7.5.0 (for Ubuntu 18.04) or GCC 9.3.0 (for Ubuntu 20.04) * `Python 3.7 - 3.10, 64-bit `_ + * GCC: + + .. tab:: Ubuntu 18.04 + + * GCC 7.5.0 + + .. tab:: Ubuntu 20.04 + + * GCC 9.3.0 + + .. tab:: RHEL 8 + + * GCC 8.4.1 + + .. tab:: CENTOS 7 + + * GCC 8.3.1 + Use folloving instructions to install it: + Install GCC 8.3.1 via devtoolset-8 + + .. code-block:: sh + + sudo yum update -y && sudo yum install -y centos-release-scl epel-release + sudo yum install -y devtoolset-8 git patchelf + + Enable devtoolset-8 and check current gcc version + + .. code-block:: sh + + source /opt/rh/devtoolset-8/enable + gcc -v @endsphinxdirective diff --git a/docs/install_guides/uninstalling-openvino.md b/docs/install_guides/uninstalling-openvino.md index 504708d3b5007c..c6a6bceac1775b 100644 --- a/docs/install_guides/uninstalling-openvino.md +++ b/docs/install_guides/uninstalling-openvino.md @@ -1,12 +1,17 @@ # Uninstalling the Intel® Distribution of OpenVINO™ Toolkit {#openvino_docs_install_guides_uninstalling_openvino} -> **NOTE**: Uninstallation procedures remove all Intel® Distribution of OpenVINO™ Toolkit component files but don't affect user files in the installation directory. +@sphinxdirective + +.. note:: -## Uninstall Using the Original Installation Package + Uninstallation procedures remove all Intel® Distribution of OpenVINO™ Toolkit component files but don't affect user files in the installation directory. + +Uninstall Using the Original Installation Package +################################################# If you have installed OpenVINO Runtime from archive files, you can uninstall it by deleting the archive files and the extracted folders. -@sphinxdirective + .. tab:: Windows If you have created the symbolic link, remove the link first. @@ -15,25 +20,27 @@ If you have installed OpenVINO Runtime from archive files, you can uninstall it * Use Windows Explorer to remove the files. * Open a Command Prompt and run: - + .. code-block:: sh - + rmdir /s del - + .. tab:: Linux & macOS - + If you have created the symbolic link, remove the link first: .. code-block:: sh - - rm /home//intel/openvino_2022 + + rm /opt/intel/openvino_2022 To delete the files: .. code-block:: sh - + rm -r && rm + @endsphinxdirective + diff --git a/docs/ops/opset.md b/docs/ops/opset.md index 27a24a0ffef3dc..5e68cfd343bdc4 100644 --- a/docs/ops/opset.md +++ b/docs/ops/opset.md @@ -6,6 +6,7 @@ :maxdepth: 1 :hidden: + openvino_docs_ops_opset11 openvino_docs_ops_opset10 openvino_docs_ops_opset9 openvino_docs_ops_opset8 @@ -25,6 +26,7 @@ This topic provides a complete list of available sets of operations supported in | OpenVINO™ Version | Actual Operations Set | | :---------------- | :------------------------------- | +| 2023.0 | [opset11](opset11.md) | | 2022.3 | [opset10](opset10.md) | | 2022.2 | [opset9](opset9.md) | | 2022.1 | [opset8](opset8.md) | diff --git a/docs/ops/opset11.md b/docs/ops/opset11.md new file mode 100644 index 00000000000000..c8d2b3fae56377 --- /dev/null +++ b/docs/ops/opset11.md @@ -0,0 +1,187 @@ +# opset11 {#openvino_docs_ops_opset11} + +This specification document describes the `opset11` operation set supported in OpenVINO™. +Support for each particular operation from the list below depends on the capabilities of an inference plugin +and may vary among different hardware platforms and devices. Examples of operation instances are provided as IR V10 xml +snippets. 
Such IR is generated by the Model Optimizer. The semantics match corresponding nGraph operation classes +declared in `namespace opset11`. + + +## Table of Contents + +* [Abs](arithmetic/Abs_1.md) +* [Acos](arithmetic/Acos_1.md) +* [Acosh](arithmetic/Acosh_3.md) +* [AdaptiveAvgPool](pooling/AdaptiveAvgPool_8.md) +* [AdaptiveMaxPool](pooling/AdaptiveMaxPool_8.md) +* [Add](arithmetic/Add_1.md) +* [Asin](arithmetic/Asin_1.md) +* [Asinh](arithmetic/Asinh_3.md) +* [Assign](infrastructure/Assign_3.md) +* [Atan](arithmetic/Atan_1.md) +* [Atanh](arithmetic/Atanh_3.md) +* [AvgPool](pooling/AvgPool_1.md) +* [BatchNormInference](normalization/BatchNormInference_5.md) +* [BatchToSpace](movement/BatchToSpace_2.md) +* [BinaryConvolution](convolution/BinaryConvolution_1.md) +* [Broadcast](movement/Broadcast_3.md) +* [Bucketize](condition/Bucketize_3.md) +* [CTCGreedyDecoder](sequence/CTCGreedyDecoder_1.md) +* [CTCGreedyDecoderSeqLen](sequence/CTCGreedyDecoderSeqLen_6.md) +* [CTCLoss](sequence/CTCLoss_4.md) +* [Ceiling](arithmetic/Ceiling_1.md) +* [Clamp](activation/Clamp_1.md) +* [Concat](movement/Concat_1.md) +* [Constant](infrastructure/Constant_1.md) +* [Convert](type/Convert_1.md) +* [ConvertLike](type/ConvertLike_1.md) +* [Convolution](convolution/Convolution_1.md) +* [ConvolutionBackpropData](convolution/ConvolutionBackpropData_1.md) +* [Cos](arithmetic/Cos_1.md) +* [Cosh](arithmetic/Cosh_1.md) +* [CumSum](arithmetic/CumSum_3.md) +* [DeformableConvolution](convolution/DeformableConvolution_8.md) +* [DeformablePSROIPooling](detection/DeformablePSROIPooling_1.md) +* [DepthToSpace](movement/DepthToSpace_1.md) +* [DetectionOutput](detection/DetectionOutput_8.md) +* [DFT](signals/DFT_7.md) +* [Divide](arithmetic/Divide_1.md) +* [Einsum](matrix/Einsum_7.md) +* [Elu](activation/Elu_1.md) +* [EmbeddingBagOffsetsSum](sparse/EmbeddingBagOffsetsSum_3.md) +* [EmbeddingBagPackedSum](sparse/EmbeddingBagPackedSum_3.md) +* [EmbeddingSegmentsSum](sparse/EmbeddingSegmentsSum_3.md) +* [Equal](comparison/Equal_1.md) +* [Erf](arithmetic/Erf_1.md) +* [Exp](activation/Exp_1.md) +* [ExperimentalDetectronDetectionOutput_6](detection/ExperimentalDetectronDetectionOutput_6.md) +* [ExperimentalDetectronGenerateProposalsSingleImage_6](detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md) +* [ExperimentalDetectronPriorGridGenerator_6](detection/ExperimentalDetectronPriorGridGenerator_6.md) +* [ExperimentalDetectronROIFeatureExtractor_6](detection/ExperimentalDetectronROIFeatureExtractor_6.md) +* [ExperimentalDetectronTopKROIs_6](sort/ExperimentalDetectronTopKROIs_6.md) +* [ExtractImagePatches](movement/ExtractImagePatches_3.md) +* [Eye](generation/Eye_9.md) +* [FakeQuantize](quantization/FakeQuantize_1.md) +* [Floor](arithmetic/Floor_1.md) +* [FloorMod](arithmetic/FloorMod_1.md) +* [Gather](movement/Gather_8.md) +* [GatherElements](movement/GatherElements_6.md) +* [GatherND](movement/GatherND_8.md) +* [GatherTree](movement/GatherTree_1.md) +* [Gelu](activation/GELU_7.md) +* [GenerateProposals](detection/GenerateProposals_9.md) +* [Greater](comparison/Greater_1.md) +* [GreaterEqual](comparison/GreaterEqual_1.md) +* [GridSample](image/GridSample_9.md) +* [GRN](normalization/GRN_1.md) +* [GroupConvolution](convolution/GroupConvolution_1.md) +* [GroupConvolutionBackpropData](convolution/GroupConvolutionBackpropData_1.md) +* [GRUCell](sequence/GRUCell_3.md) +* [GRUSequence](sequence/GRUSequence_5.md) +* [HardSigmoid](activation/HardSigmoid_1.md) +* [HSigmoid](activation/HSigmoid_5.md) +* [HSwish](activation/HSwish_4.md) 
+* [IDFT](signals/IDFT_7.md) +* [I420toBGR](image/I420toBGR_8.md) +* [I420toRGB](image/I420toRGB_8.md) +* [If](condition/If_8.md) +* [Interpolate](image/Interpolate_4.md) +* [IRDFT](signals/IRDFT_9.md) +* [IsInf](comparison/IsInf_10.md) +* [IsNaN](comparison/IsNaN_10.md) +* [Less](comparison/Less_1.md) +* [LessEqual](comparison/LessEqual_1.md) +* [Log](arithmetic/Log_1.md) +* [LogicalAnd](logical/LogicalAnd_1.md) +* [LogicalNot](logical/LogicalNot_1.md) +* [LogicalOr](logical/LogicalOr_1.md) +* [LogicalXor](logical/LogicalXor_1.md) +* [LogSoftmax](activation/LogSoftmax_5.md) +* [Loop](infrastructure/Loop_5.md) +* [LRN](normalization/LRN_1.md) +* [LSTMCell](sequence/LSTMCell_1.md) +* [LSTMSequence](sequence/LSTMSequence_1.md) +* [MatMul](matrix/MatMul_1.md) +* [MatrixNMS](sort/MatrixNMS_8.md) +* [MaxPool](pooling/MaxPool_8.md) +* [Maximum](arithmetic/Maximum_1.md) +* [Minimum](arithmetic/Minimum_1.md) +* [Mish](activation/Mish_4.md) +* [Mod](arithmetic/Mod_1.md) +* [MVN](normalization/MVN_6.md) +* [MulticlassNMS](sort/MulticlassNonMaxSuppression_9.md) +* [Multiply](arithmetic/Multiply_1.md) +* [Negative](arithmetic/Negative_1.md) +* [NonMaxSuppression](sort/NonMaxSuppression_5.md) +* [NonZero](condition/NonZero_3.md) +* [NormalizeL2](normalization/NormalizeL2_1.md) +* [NotEqual](comparison/NotEqual_1.md) +* [NV12toBGR](image/NV12toBGR_8.md) +* [NV12toRGB](image/NV12toRGB_8.md) +* [OneHot](sequence/OneHot_1.md) +* [Pad](movement/Pad_1.md) +* [Parameter](infrastructure/Parameter_1.md) +* [Power](arithmetic/Power_1.md) +* [PReLU](activation/PReLU_1.md) +* [PriorBoxClustered](detection/PriorBoxClustered_1.md) +* [PriorBox](detection/PriorBox_8.md) +* [Proposal](detection/Proposal_4.md) +* [PSROIPooling](detection/PSROIPooling_1.md) +* [RandomUniform](generation/RandomUniform_8.md) +* [Range](generation/Range_4.md) +* [RDFT](signals/RDFT_9.md) +* [ReLU](activation/ReLU_1.md) +* [ReadValue](infrastructure/ReadValue_3.md) +* [ReduceL1](reduction/ReduceL1_4.md) +* [ReduceL2](reduction/ReduceL2_4.md) +* [ReduceLogicalAnd](reduction/ReduceLogicalAnd_1.md) +* [ReduceLogicalOr](reduction/ReduceLogicalOr_1.md) +* [ReduceMax](reduction/ReduceMax_1.md) +* [ReduceMean](reduction/ReduceMean_1.md) +* [ReduceMin](reduction/ReduceMin_1.md) +* [ReduceProd](reduction/ReduceProd_1.md) +* [ReduceSum](reduction/ReduceSum_1.md) +* [RegionYolo](detection/RegionYolo_1.md) +* [ReorgYolo](detection/ReorgYolo_1.md) +* [Reshape](shape/Reshape_1.md) +* [Result](infrastructure/Result_1.md) +* [ReverseSequence](movement/ReverseSequence_1.md) +* [RNNCell](sequence/RNNCell_3.md) +* [RNNSequence](sequence/RNNSequence_5.md) +* [ROIAlign](detection/ROIAlign_9.md) +* [ROIPooling](detection/ROIPooling_1.md) +* [Roll](movement/Roll_7.md) +* [Round](arithmetic/Round_5.md) +* [ScatterElementsUpdate](movement/ScatterElementsUpdate_3.md) +* [ScatterNDUpdate](movement/ScatterNDUpdate_3.md) +* [ScatterUpdate](movement/ScatterUpdate_3.md) +* [Select](condition/Select_1.md) +* [Selu](activation/Selu_1.md) +* [ShapeOf](shape/ShapeOf_3.md) +* [ShuffleChannels](movement/ShuffleChannels_1.md) +* [Sigmoid](activation/Sigmoid_1.md) +* [Sign](arithmetic/Sign_1.md) +* [Sin](arithmetic/Sin_1.md) +* [Sinh](arithmetic/Sinh_1.md) +* [Slice](movement/Slice_8.md) +* [SoftMax](activation/SoftMax_8.md) +* [SoftPlus](activation/SoftPlus_4.md) +* [SoftSign](activation/SoftSign_9.md) +* [SpaceToBatch](movement/SpaceToBatch_2.md) +* [SpaceToDepth](movement/SpaceToDepth_1.md) +* [Split](movement/Split_1.md) +* [Sqrt](arithmetic/Sqrt_1.md) +* 
[SquaredDifference](arithmetic/SquaredDifference_1.md) +* [Squeeze](shape/Squeeze_1.md) +* [StridedSlice](movement/StridedSlice_1.md) +* [Subtract](arithmetic/Subtract_1.md) +* [Swish](activation/Swish_4.md) +* [Tan](arithmetic/Tan_1.md) +* [Tanh](arithmetic/Tanh_1.md) +* [TensorIterator](infrastructure/TensorIterator_1.md) +* [Tile](movement/Tile_1.md) +* [TopK](sort/TopK_11.md) +* [Transpose](movement/Transpose_1.md) +* [Unique](movement/Unique_10.md) +* [Unsqueeze](shape/Unsqueeze_1.md) +* [VariadicSplit](movement/VariadicSplit_1.md) diff --git a/docs/ops/sort/TopK_1.md b/docs/ops/sort/TopK_1.md index 824ae65fa2876c..b1ad91b4b791f7 100644 --- a/docs/ops/sort/TopK_1.md +++ b/docs/ops/sort/TopK_1.md @@ -51,7 +51,7 @@ **Detailed Description** -Output tensor is populated by values computes in the following way: +The output tensor is populated by values computed in the following way: output[i1, ..., i(axis-1), j, i(axis+1) ..., iN] = top_k(input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]), k, sort, mode) @@ -59,7 +59,7 @@ So for each slice `input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]` which repr Sorting and minimum/maximum are controlled by `sort` and `mode` attributes: * *mode*=`max`, *sort*=`value` - descending by value - * *mode*=`max`, *sort*=`index` - ascending by index + * *mode*=`max`, *sort*=`index` - descending by index * *mode*=`max`, *sort*=`none` - undefined * *mode*=`min`, *sort*=`value` - ascending by value * *mode*=`min`, *sort*=`index` - ascending by index diff --git a/docs/ops/sort/TopK_11.md b/docs/ops/sort/TopK_11.md new file mode 100644 index 00000000000000..f96007704da53e --- /dev/null +++ b/docs/ops/sort/TopK_11.md @@ -0,0 +1,118 @@ +# TopK {#openvino_docs_ops_sort_TopK_11} + +**Versioned name**: *TopK-11* + +**Category**: *Sorting and maximization* + +**Short description**: *TopK* computes indices and values of the *k* maximum/minimum values for each slice along a specified axis. + +**Attributes** + +* *axis* + + * **Description**: Specifies the axis along which the values are retrieved. + * **Range of values**: An integer. Negative values means counting dimension from the back. + * **Type**: `int` + * **Required**: *yes* + +* *mode* + + * **Description**: Specifies whether *TopK* selects the largest or the smallest elements from each slice. + * **Range of values**: "min", "max" + * **Type**: `string` + * **Required**: *yes* + +* *sort* + + * **Description**: Specifies the order of corresponding elements of the output tensor. + * **Range of values**: `value`, `index`, `none` + * **Type**: `string` + * **Required**: *yes* + +* *stable* + + * **Description**: Specifies whether the equivalent elements should maintain their relative order from the input tensor. Takes effect only if the `sort` attribute is set to `value`. + * **Range of values**: `true` of `false` + * **Type**: `boolean` + * **Default value**: `false` + * **Required**: *no* + +* *index_element_type* + + * **Description**: the type of output tensor with indices + * **Range of values**: "i64" or "i32" + * **Type**: string + * **Default value**: "i32" + * **Required**: *no* + + +**Inputs**: + +* **1**: tensor with arbitrary rank and type *T*. **Required.** + +* **2**: The value of *K* - a scalar of any integer type that specifies how many elements from the input tensor should be selected. The accepted range of values of *K* is `<1;input1.shape[axis]>`. The behavior of this operator is undefined if the value of *K* does not meet those requirements. 
**Required.**
+
+**Outputs**:
+
+* **1**: Output tensor of type *T* with *k* values from the input tensor along a specified *axis*. The shape of the tensor is `[input1.shape[0], ..., input1.shape[axis-1], 1..k, input1.shape[axis+1], ..., input1.shape[input1.rank - 1]]`.
+
+* **2**: Output tensor containing indices of the corresponding elements (values) from the first output tensor. The indices point to the location of selected values in the original input tensor. The shape of this output tensor is the same as the shape of the first output, that is `[input1.shape[0], ..., input1.shape[axis-1], 1..k, input1.shape[axis+1], ..., input1.shape[input1.rank - 1]]`. The type of this tensor *T_IND* is controlled by the `index_element_type` attribute.
+
+**Types**
+
+* *T*: any numeric type.
+
+* *T_IND*: `int64` or `int32`.
+
+**Detailed Description**
+
+The output tensor is populated by values computed in the following way:
+
+    output[i1, ..., i(axis-1), j, i(axis+1) ..., iN] = top_k(input[i1, ...., i(axis-1), :, i(axis+1), ..., iN], k, sort, mode)
+
+meaning that for each slice `input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]` the *TopK* values are computed individually.
+
+Sorting and minimum/maximum are controlled by `sort` and `mode` attributes with additional configurability provided by `stable`:
+ * *sort*=`value`, *mode*=`max`, *stable*=`false` - descending by value, relative order of equal elements not guaranteed to be maintained
+ * *sort*=`value`, *mode*=`max`, *stable*=`true` - descending by value, relative order of equal elements guaranteed to be maintained
+ * *sort*=`value`, *mode*=`min`, *stable*=`false` - ascending by value, relative order of equal elements not guaranteed to be maintained
+ * *sort*=`value`, *mode*=`min`, *stable*=`true` - ascending by value, relative order of equal elements guaranteed to be maintained
+ * *sort*=`index`, *mode*=`max` - descending by index
+ * *sort*=`index`, *mode*=`min` - ascending by index
+ * *sort*=`none` , *mode*=`max` - undefined
+ * *sort*=`none` , *mode*=`min` - undefined
+
+The relative order of equivalent elements is only preserved if the *stable* attribute is set to `true`, which makes the implementation use a stable sorting algorithm during the computation of TopK elements. Otherwise, the output order is undefined.
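To illustrate the semantics described above (this illustration is not part of the specification), here is a small NumPy sketch that mimics *TopK* with `mode=max`, `sort=value` along the last axis; the helper name and the use of `np.argsort(kind="stable")` are assumptions made only for this example:

```python
import numpy as np

def topk_last_axis(data, k, mode="max", stable=True):
    # A stable sort keeps the relative order of equal elements,
    # mirroring the effect of the 'stable' attribute of TopK-11.
    kind = "stable" if stable else "quicksort"
    order = np.argsort(-data if mode == "max" else data, axis=-1, kind=kind)
    top_idx = order[..., :k]                               # second output (T_IND)
    top_val = np.take_along_axis(data, top_idx, axis=-1)   # first output (T)
    return top_val, top_idx

x = np.array([[5, 3, 5, 1, 5, 2]])
values, indices = topk_last_axis(x, k=3)
print(values)   # [[5 5 5]]
print(indices)  # [[0 2 4]] - equal values keep their original relative order
```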
+
+**Example**
+
+This example assumes that `K` is equal to 10:
+
+```xml
+<layer ... type="TopK" ... >
+    <data axis="3" mode="max" sort="value" stable="true" index_element_type="i32"/>
+    <input>
+        <port id="0">
+            <dim>1</dim>
+            <dim>3</dim>
+            <dim>224</dim>
+            <dim>224</dim>
+        </port>
+        <port id="1">  <!-- scalar input containing the value of K (10 in this example) -->
+        </port>
+    </input>
+    <output>
+        <port id="2">
+            <dim>1</dim>
+            <dim>3</dim>
+            <dim>224</dim>
+            <dim>10</dim>
+        </port>
+        <port id="3">
+            <dim>1</dim>
+            <dim>3</dim>
+            <dim>224</dim>
+            <dim>10</dim>
+        </port>
+    </output>
+</layer>
+```
diff --git a/docs/ops/sort/TopK_3.md b/docs/ops/sort/TopK_3.md
index d5d4d3a4085b36..2ad37b24cfbb7d 100644
--- a/docs/ops/sort/TopK_3.md
+++ b/docs/ops/sort/TopK_3.md
@@ -58,7 +58,7 @@
 
 **Detailed Description**
 
-Output tensor is populated by values computes in the following way:
+The output tensor is populated by values computed in the following way:
 
     output[i1, ..., i(axis-1), j, i(axis+1) ..., iN] = top_k(input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]), k, sort, mode)
 
@@ -66,7 +66,7 @@ So for each slice `input[i1, ...., i(axis-1), :, i(axis+1), ..., iN]` which repr
 Sorting and minimum/maximum are controlled by `sort` and `mode` attributes:
 
     * *mode*=`max`, *sort*=`value` - descending by value
-    * *mode*=`max`, *sort*=`index` - ascending by index
+    * *mode*=`max`, *sort*=`index` - descending by index
     * *mode*=`max`, *sort*=`none` - undefined
     * *mode*=`min`, *sort*=`value` - ascending by value
    * *mode*=`min`, *sort*=`index` - ascending by index
diff --git a/docs/optimization_guide/nncf/filter_pruning.md b/docs/optimization_guide/nncf/filter_pruning.md
index 726482a311ee1c..7633d2e2400751 100644
--- a/docs/optimization_guide/nncf/filter_pruning.md
+++ b/docs/optimization_guide/nncf/filter_pruning.md
@@ -1,183 +1,227 @@
 # Filter Pruning of Convolutional Models {#filter_pruning}
 
-## Introduction
-Filter pruning is an advanced optimization method which allows reducing computational complexity of the model by removing redundant or unimportant filters from convolutional operations of the model. This removal is done in two steps:
+@sphinxdirective
+
+Introduction
+####################
+
+Filter pruning is an advanced optimization method which allows reducing computational complexity of the model by removing
+redundant or unimportant filters from convolutional operations of the model. This removal is done in two steps:
+
 1. Unimportant filters are zeroed out by the NNCF optimization with fine-tuning.
-2. Zero filters are removed from the model during the export to OpenVINO™ Intermediate Representation (IR).
-Filter Pruning method from the NNCF can be used stand-alone but we usually recommend to stack it with 8-bit quantization for two reasons. First, 8-bit quantization is the best method in terms of achieving the highest accuracy-performance trade-offs so stacking it with filter pruning can give even better performance results. Second, applying quantization along with filter pruning does not hurt accuracy a lot since filter pruning removes noisy filters from the model which narrows down values ranges of weights and activations and helps to reduce overall quantization error.
+2. Zero filters are removed from the model during the export to OpenVINO Intermediate Representation (IR).
+
+
+Filter Pruning method from the NNCF can be used stand-alone but we usually recommend to stack it with 8-bit quantization for
+two reasons. First, 8-bit quantization is the best method in terms of achieving the highest accuracy-performance trade-offs so
+stacking it with filter pruning can give even better performance results. Second, applying quantization along with filter
+pruning does not hurt accuracy a lot since filter pruning removes noisy filters from the model which narrows down values
+ranges of weights and activations and helps to reduce overall quantization error.
+
+.. note::
+   Filter Pruning usually requires a long fine-tuning or retraining of the model which can be comparable to training the
+   model from scratch. Otherwise, a large accuracy degradation can be caused. Therefore, the training schedule should be
+   adjusted accordingly when applying this method.
+
-> **NOTE**: Filter Pruning usually requires a long fine-tuning or retraining of the model which can be comparable to training the model from scratch. Otherwise, a large accuracy degradation can be caused. Therefore, the training schedule should be adjusted accordingly when applying this method.
 
 Below, we provide the steps that are required to apply Filter Pruning + QAT to the model:
 
-## Applying Filter Pruning with fine-tuning
-Here, we show the basic steps to modify the training script for the model and use it to zero out unimportant filters:
-### 1. Import NNCF API
-In this step, NNCF-related imports are added in the beginning of the training script:
-
-@sphinxtabset
-
-@sphinxtab{PyTorch}
-
-@snippet docs/optimization_guide/nncf/code/pruning_torch.py imports
-
-@endsphinxtab
-
-@sphinxtab{TensorFlow 2}
-
-@snippet docs/optimization_guide/nncf/code/pruning_tf.py imports
-
-@endsphinxtab
-
-@endsphinxtabset
+Applying Filter Pruning with fine-tuning
+########################################
+
+Here, we show the basic steps to modify the training script for the model and use it to zero out unimportant filters:
+
+1. Import NNCF API
+++++++++++++++++++
+
+In this step, NNCF-related imports are added in the beginning of the training script:
+
+.. tab:: PyTorch
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py
+      :language: python
+      :fragment: [imports]
+
+.. tab:: TensorFlow 2
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py
+      :language: python
+      :fragment: [imports]
 
-### 2. Create NNCF configuration
-Here, you should define NNCF configuration which consists of model-related parameters (`"input_info"` section) and parameters of optimization methods (`"compression"` section).
-
-@sphinxtabset
-
-@sphinxtab{PyTorch}
-
-@snippet docs/optimization_guide/nncf/code/pruning_torch.py nncf_congig
-
-@endsphinxtab
-
-@sphinxtab{TensorFlow 2}
-
-@snippet docs/optimization_guide/nncf/code/pruning_tf.py nncf_congig
-
-@endsphinxtab
-
-@endsphinxtabset
+2. Create NNCF configuration
+++++++++++++++++++++++++++++
+
+Here, you should define NNCF configuration which consists of model-related parameters (``"input_info"`` section) and parameters
+of optimization methods (``"compression"`` section).
+
+.. tab:: PyTorch
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py
+      :language: python
+      :fragment: [nncf_congig]
+
+.. tab:: TensorFlow 2
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py
+      :language: python
+      :fragment: [nncf_congig]
+
+Here is a brief description of the required parameters of the Filter Pruning method. For a full description, refer to the
+`GitHub <https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Pruning.md>`__ page.
+
+* ``pruning_init`` - initial pruning rate target. For example, value ``0.1`` means that at the beginning of training, convolutions that can be pruned will have 10% of their filters set to zero.
+
+* ``pruning_target`` - pruning rate target at the end of the schedule. For example, the value ``0.5`` means that at the epoch with the number of ``num_init_steps + pruning_steps``, convolutions that can be pruned will have 50% of their filters set to zero.
+
+* ``pruning_steps`` - the number of epochs during which the pruning rate target is increased from ``pruning_init`` to ``pruning_target`` value.
We recommend to keep the highest learning rate during this period. -Here is a brief description of the required parameters of the Filter Pruning method. For full description refer to the [GitHub](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Pruning.md) page. -- `pruning_init` - initial pruning rate target. For example, value `0.1` means that at the begging of training, convolutions that can be pruned will have 10% of their filters set to zero. -- `pruning_target` - pruning rate target at the end of the schedule. For example, the value `0.5` means that at the epoch with the number of `num_init_steps + pruning_steps`, convolutions that can be pruned will have 50% of their filters set to zero. -- `pruning_steps` - the number of epochs during which the pruning rate target is increased from `pruning_init` to `pruning_target` value. We recommend to keep the highest learning rate during this period. -### 3. Apply optimization methods -In the next step, the original model is wrapped by the NNCF object using the `create_compressed_model()` API using the configuration defined in the previous step. This method returns a so-called compression controller and the wrapped model that can be used the same way as the original model. It is worth noting that optimization methods are applied at this step so that the model undergoes a set of corresponding transformations and can contain additional operations required for the optimization. -@sphinxtabset +3. Apply optimization methods ++++++++++++++++++++++++++++++ -@sphinxtab{PyTorch} +In the next step, the original model is wrapped by the NNCF object using the ``create_compressed_model()`` API using the +configuration defined in the previous step. This method returns a so-called compression controller and the wrapped model +that can be used the same way as the original model. It is worth noting that optimization methods are applied at this step +so that the model undergoes a set of corresponding transformations and can contain additional operations required for the +optimization. -@snippet docs/optimization_guide/nncf/code/pruning_torch.py wrap_model -@endsphinxtab +.. tab:: PyTorch -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [wrap_model] -@snippet docs/optimization_guide/nncf/code/pruning_tf.py wrap_model +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [wrap_model] -@endsphinxtabset -### 4. Fine-tune the model -This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the case of Filter Pruning method we recommend using the training schedule and learning rate similar to what was used for the training of original model. +4. Fine-tune the model +++++++++++++++++++++++ -@sphinxtabset +This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the case +of Filter Pruning method we recommend using the training schedule and learning rate similar to what was used for the training +of original model. -@sphinxtab{PyTorch} -@snippet docs/optimization_guide/nncf/code/pruning_torch.py tune_model +.. tab:: PyTorch -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [tune_model] -@sphinxtab{TensorFlow 2} +.. 
tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/pruning_tf.py tune_model + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [tune_model] -@endsphinxtab -@endsphinxtabset +5. Multi-GPU distributed training ++++++++++++++++++++++++++++++++++ -### 5. Multi-GPU distributed training -In the case of distributed multi-GPU training (not DataParallel), you should call `compression_ctrl.distributed()` before the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@sphinxtabset +In the case of distributed multi-GPU training (not DataParallel), you should call ``compression_ctrl.distributed()`` before the +fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@sphinxtab{PyTorch} -@snippet docs/optimization_guide/nncf/code/qat_torch.py distributed +.. tab:: PyTorch -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [distributed] -@sphinxtab{TensorFlow 2} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/qat_tf.py distributed + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [distributed] -@endsphinxtab -@endsphinxtabset +6. Export quantized model ++++++++++++++++++++++++++ -### 6. Export quantized model -When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in the case of PyTorch and frozen graph - for TensorFlow 2. +When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in +the case of PyTorch and frozen graph - for TensorFlow 2. -@sphinxtabset -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py export + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [export] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [export] -@snippet docs/optimization_guide/nncf/code/qat_tf.py export -@endsphinxtab +These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model +checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. -@endsphinxtabset -These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. +7. (Optional) Save checkpoint ++++++++++++++++++++++++++++++ -### 7. (Optional) Save checkpoint To save model checkpoint use the following API: -@sphinxtabset -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py save_checkpoint + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py + :language: python + :fragment: [save_checkpoint] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py + :language: python + :fragment: [save_checkpoint] -@snippet docs/optimization_guide/nncf/code/qat_tf.py save_checkpoint -@endsphinxtab +8. 
(Optional) Restore from checkpoint
++++++++++++++++++++++++++++++++++
 
-@endsphinxtabset
-
-### 8. (Optional) Restore from checkpoint
 To restore the model from checkpoint you should use the following API:
 
-@sphinxtabset
-
-@sphinxtab{PyTorch}
-
-@snippet docs/optimization_guide/nncf/code/qat_torch.py load_checkpoint
-
-@endsphinxtab
-
-@sphinxtab{TensorFlow 2}
-
-@snippet docs/optimization_guide/nncf/code/qat_tf.py load_checkpoint
-
-@endsphinxtab
-
-@endsphinxtabset
+.. tab:: PyTorch
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_torch.py
+      :language: python
+      :fragment: [load_checkpoint]
+
+.. tab:: TensorFlow 2
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/code/pruning_tf.py
+      :language: python
+      :fragment: [load_checkpoint]
 
-For more details on saving/loading checkpoints in the NNCF, see the following [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models).
+For more details on saving/loading checkpoints in the NNCF, see the following
+`documentation <https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models>`__.
 
-## Deploying pruned model
-The pruned model requres an extra step that should be done to get performance improvement. This step involves removal of the zero filters from the model. This is done at the model convertion step using [Model Optimizer](@ref openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide) tool when model is converted from the framework representation (ONNX, TensorFlow, etc.) to OpenVINO Intermediate Representation.
-- To remove zero filters from the pruned model add the following parameter to the model convertion command: `--transform=Pruning`
+Deploying pruned model
+######################
+
+The pruned model requires an extra step that should be done to get a performance improvement. This step involves removal of the
+zero filters from the model. This is done at the model conversion step using the :doc:`Model Optimizer <openvino_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide>` tool when the model is converted from the framework representation (ONNX, TensorFlow, etc.) to OpenVINO Intermediate Representation.
+
+* To remove zero filters from the pruned model, add the following parameter to the model conversion command: ``--transform=Pruning``
 
 After that the model can be deployed with OpenVINO in the same way as the baseline model.
-For more details about model deployment with OpenVINO, see the corresponding [documentation](../../OV_Runtime_UG/openvino_intro.md).
+For more details about model deployment with OpenVINO, see the corresponding :doc:`documentation `.
+
+
+Examples
+####################
+
+* `PyTorch Image Classification example <https://github.com/openvinotoolkit/nncf/blob/develop/examples/torch/classification>`__
+
+* `TensorFlow Image Classification example <https://github.com/openvinotoolkit/nncf/tree/develop/examples/tensorflow/classification>`__
 
-## Examples
-- [PyTorch Image Classiication example](https://github.com/openvinotoolkit/nncf/blob/develop/examples/torch/classification)
-- [TensorFlow Image Classification example](https://github.com/openvinotoolkit/nncf/tree/develop/examples/tensorflow/classification)
\ No newline at end of file
+@endsphinxdirective
diff --git a/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md b/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md
index 38831daac02f04..2f315c04705fbd 100644
--- a/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md
+++ b/docs/optimization_guide/nncf/ptq/basic_quantization_flow.md
@@ -1,135 +1,161 @@
 # Basic Quantization Flow {#basic_qauntization_flow}
 
-## Introduction
+@sphinxdirective
+
+Introduction
+####################
 
 The basic quantization flow is the simplest way to apply 8-bit quantization to the model.
It is available for models in the following frameworks: PyTorch, TensorFlow 2.x, ONNX, and OpenVINO. The basic quantization flow is based on the following steps: + * Set up an environment and install dependencies. * Prepare the **calibration dataset** that is used to estimate quantization parameters of the activations within the model. * Call the quantization API to apply 8-bit quantization to the model. -## Set up an Environment +Set up an Environment +##################### It is recommended to set up a separate Python environment for quantization with NNCF. To do this, run the following command: -```bash -python3 -m venv nncf_ptq_env -``` -Install all the packages required to instantiate the model object, for example, DL framework. After that, install NNCF on top of the environment: -```bash -pip install nncf -``` -## Prepare a Calibration Dataset +.. code-block:: sh -At this step, create an instance of the `nncf.Dataset` class that represents the calibration dataset. The `nncf.Dataset` class can be a wrapper over the framework dataset object that is used for model training or validation. The class constructor receives the dataset object and the transformation function. For example, if you use PyTorch, you can pass an instance of the `torch.utils.data.DataLoader` object. + python3 -m venv nncf_ptq_env -The transformation function is a function that takes a sample from the dataset and returns data that can be passed to the model for inference. For example, this function can take a tuple of a data tensor and labels tensor, and return the former while ignoring the latter. The transformation function is used to avoid modifying the dataset code to make it compatible with the quantization API. The function is applied to each sample from the dataset before passing it to the model for inference. The following code snippet shows how to create an instance of the `nncf.Dataset` class: +Install all the packages required to instantiate the model object, for example, DL framework. After that, install NNCF on top of the environment: -@sphinxtabset +.. code-block:: sh -@sphinxtab{PyTorch} + pip install nncf -@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py dataset +Prepare a Calibration Dataset +############################# -@endsphinxtab +At this step, create an instance of the ``nncf.Dataset`` class that represents the calibration dataset. The ``nncf.Dataset`` class can be a wrapper over the framework dataset object that is used for model training or validation. The class constructor receives the dataset object and the transformation function. For example, if you use PyTorch, you can pass an instance of the ``torch.utils.data.DataLoader`` object. -@sphinxtab{ONNX} +The transformation function is a function that takes a sample from the dataset and returns data that can be passed to the model for inference. For example, this function can take a tuple of a data tensor and labels tensor, and return the former while ignoring the latter. The transformation function is used to avoid modifying the dataset code to make it compatible with the quantization API. The function is applied to each sample from the dataset before passing it to the model for inference. The following code snippet shows how to create an instance of the ``nncf.Dataset`` class: -@snippet docs/optimization_guide/nncf/ptq/code/ptq_onnx.py dataset +.. tab:: PyTorch -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_torch.py + :language: python + :fragment: [dataset] -@sphinxtab{OpenVINO} +.. 
tab:: ONNX
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_onnx.py
+      :language: python
+      :fragment: [dataset]
+
-@endsphinxtab
-
-@sphinxtab{OpenVINO}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_openvino.py dataset
-
-@endsphinxtab
-
-@sphinxtab{TensorFlow}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py dataset
-
-@endsphinxtab
-
-@endsphinxtabset
+.. tab:: OpenVINO
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_openvino.py
+      :language: python
+      :fragment: [dataset]
+
+.. tab:: TensorFlow
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py
+      :language: python
+      :fragment: [dataset]
 
-If there is no framework dataset object, you can create your own entity that implements the `Iterable` interface in Python and returns data samples feasible for inference. In this case, a transformation function is not required.
+If there is no framework dataset object, you can create your own entity that implements the ``Iterable`` interface in Python and returns data samples feasible for inference. In this case, a transformation function is not required.
 
-## Run a Quantized Model
+Run a Quantized Model
+#####################
 
 Once the dataset is ready and the model object is instantiated, you can apply 8-bit quantization to it:
 
-@sphinxtabset
-
-@sphinxtab{PyTorch}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py quantization
-
-@endsphinxtab
-
-@sphinxtab{ONNX}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py quantization
-
-@endsphinxtab
-
-@sphinxtab{OpenVINO}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_torch.py quantization
-
-@endsphinxtab
-
-@sphinxtab{TensorFlow}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py quantization
-
-@endsphinxtab
-
-@endsphinxtabset
+.. tab:: PyTorch
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_torch.py
+      :language: python
+      :fragment: [quantization]
+
+.. tab:: ONNX
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_onnx.py
+      :language: python
+      :fragment: [quantization]
+
+.. tab:: OpenVINO
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_openvino.py
+      :language: python
+      :fragment: [quantization]
+
+.. tab:: TensorFlow
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_tensorflow.py
+      :language: python
+      :fragment: [quantization]
 
-> **NOTE**: The `model` is an instance of the `torch.nn.Module` class for PyTorch, `onnx.ModelProto` for ONNX, and `openvino.runtime.Model` for OpenVINO.
+.. note:: The ``model`` is an instance of the ``torch.nn.Module`` class for PyTorch, ``onnx.ModelProto`` for ONNX, and ``openvino.runtime.Model`` for OpenVINO.
 
-After that the model can be exported into th OpenVINO Intermediate Representation if needed and run faster with OpenVINO.
+After that, the model can be exported into the OpenVINO Intermediate Representation if needed and run faster with OpenVINO.
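For the OpenVINO backend, for instance, the quantized model object can be written to IR right away. The following minimal sketch is only an illustration; the file names are placeholders and `quantized_model` is assumed to be the object returned by `nncf.quantize()`:

```python
from openvino.runtime import serialize

# Write the quantized model to an OpenVINO IR pair (.xml + .bin); paths are illustrative.
serialize(quantized_model, "quantized_model.xml", "quantized_model.bin")
```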
-## Tune quantization parameters
-
-`nncf.quantize()` function has several parameters that allow to tune quantization process to get more accurate model. Below is the list of parameters and their description:
-* `model_type` - used to specify quantization scheme required for specific type of the model. For example, **Transformer** models (BERT, distillBERT, etc.) require a special quantization scheme to preserve accuracy after quantization.
-  ```python
-  nncf.quantize(model, dataset, model_type=nncf.ModelType.Transformer)
-  ```
-* `preset` - defines quantization scheme for the model. Two types of presets are available:
-  * `PERFORMANCE` (default) - defines symmetric quantization of weigths and activations
-  * `MIXED` - weights are quantized with symmetric quantization and the activations are quantized with asymmetric quantization. This preset is recommended for models with non-ReLU and asymmetric activation funstions, e.g. ELU, PReLU, GELU, etc.
-  ```python
-  nncf.quantize(model, dataset, preset=nncf.Preset.MIXED)
-  ```
-* `fast_bias_correction` - enables more accurate bias (error) correction algorithm that can be used to improve accuracy of the model. This parameter is available only for OpenVINO representation. `True` is used by default.
-  ```python
-  nncf.quantize(model, dataset, fast_bias_correction=False)
-  ```
-* `subset_size` - defines the number of samples from the calibration dataset that will be used to estimate quantization parameters of activations. The default value is 300.
-  ```python
-  nncf.quantize(model, dataset, subset_size=1000)
-  ```
-* `ignored_scope` - this parameter can be used to exclude some layers from quantization process. For example, if you want to exclude the last layer of the model from quantization. Below are some examples of how to use this parameter:
-  * Exclude by layer name:
-  ```python
-  names = ['layer_1', 'layer_2', 'layer_3']
-  nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(names=names))
-  ```
+Tune quantization parameters
+############################
+
+The ``nncf.quantize()`` function has several parameters that allow you to tune the quantization process to get a more accurate model. Below is the list of parameters and their description:
+
+* ``model_type`` - used to specify quantization scheme required for specific type of the model. For example, **Transformer** models (BERT, distillBERT, etc.) require a special quantization scheme to preserve accuracy after quantization.
+
+  .. code-block:: python
+
+     nncf.quantize(model, dataset, model_type=nncf.ModelType.Transformer)
+
+* ``preset`` - defines quantization scheme for the model. Two types of presets are available:
+
+  * ``PERFORMANCE`` (default) - defines symmetric quantization of weights and activations
+  * ``MIXED`` - weights are quantized with symmetric quantization and the activations are quantized with asymmetric quantization. This preset is recommended for models with non-ReLU and asymmetric activation functions, e.g. ELU, PReLU, GELU, etc.
+
+  .. code-block:: python
+
+     nncf.quantize(model, dataset, preset=nncf.Preset.MIXED)
+
+* ``fast_bias_correction`` - setting this parameter to ``False`` enables a more accurate bias (error) correction algorithm that can be used to improve the accuracy of the model. This parameter is available only for OpenVINO representation. ``True`` is used by default.
+
+  .. code-block:: python
+
+     nncf.quantize(model, dataset, fast_bias_correction=False)
+
+* ``subset_size`` - defines the number of samples from the calibration dataset that will be used to estimate quantization parameters of activations. The default value is 300.
+
+  .. code-block:: python
+
+     nncf.quantize(model, dataset, subset_size=1000)
+
+* ``ignored_scope`` - this parameter can be used to exclude some layers from the quantization process. For example, if you want to exclude the last layer of the model from quantization. Below are some examples of how to use this parameter:
+
+  * Exclude by layer name:
+
+    .. code-block:: python
+
+       names = ['layer_1', 'layer_2', 'layer_3']
+       nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(names=names))
+
+  * Exclude by layer type:
+
+    .. code-block:: python
+
+       types = ['Conv2d', 'Linear']
+       nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(types=types))
+
-  * Exclude by regular expression:
-  ```python
-  regex = '.*layer_.*'
-  nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(patterns=regex))
-  ```
+  * Exclude by regular expression:
+
+    .. code-block:: python
+
+       regex = '.*layer_.*'
+       nncf.quantize(model, dataset, ignored_scope=nncf.IgnoredScope(patterns=regex))
+
-If the accuracy of the quantized model is not satisfactory, you can try to use the [Quantization with accuracy control](@ref quantization_w_accuracy_control) flow.
+If the accuracy of the quantized model is not satisfactory, you can try to use the :doc:`Quantization with accuracy control <quantization_w_accuracy_control>` flow.
 
-## See also
+See also
+####################
 
-* [Example of basic quantization flow in PyTorch](https://github.com/openvinotoolkit/nncf/tree/develop/examples/post_training_quantization/torch/mobilenet_v2)
\ No newline at end of file
+* `Example of basic quantization flow in PyTorch <https://github.com/openvinotoolkit/nncf/tree/develop/examples/post_training_quantization/torch/mobilenet_v2>`__
+
+@endsphinxdirective
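As a compact illustration of the flow documented above, here is a hedged end-to-end sketch for the ONNX backend; the model path, the `calibration_loader` object, and the input tensor name are placeholders and not taken from the official snippets:

```python
import onnx
import nncf

model = onnx.load("model.onnx")                 # placeholder path to an ONNX model

def transform_fn(data_item):
    images, _labels = data_item                 # keep the input tensor, drop the labels
    # The ONNX backend consumes a dict keyed by the model input name;
    # "input" here is only an example and is model-specific.
    return {"input": images.numpy()}

calibration_dataset = nncf.Dataset(calibration_loader, transform_fn)
quantized_model = nncf.quantize(model, calibration_dataset)

onnx.save(quantized_model, "quantized_model.onnx")
```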
diff --git a/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md b/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md
index 03ddd9f99ac3a5..65d5ede50e4d8e 100644
--- a/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md
+++ b/docs/optimization_guide/nncf/ptq/quantization_w_accuracy_control.md
@@ -1,66 +1,64 @@
 # Quantizing with accuracy control {#quantization_w_accuracy_control}
 
-## Introduction
-
-This is the advanced quantization flow that allows to apply 8-bit quantization to the model with control of accuracy metric. This is achieved by keeping the most impactful operations within the model in the original precision. The flow is based on the [Basic 8-bit quantization](@ref basic_qauntization_flow) and has the following differences:
-* Besided the calibration dataset, a **validation dataset** is required to compute accuracy metric. They can refer to the same data in the simplest case.
-* **Validation function**, used to compute accuracy metric is required. It can be a function that is already available in the source framework or a custom function.
-* Since accuracy validation is run several times during the quantization process, quantization with accuracy control can take more time than the [Basic 8-bit quantization](@ref basic_qauntization_flow) flow.
-* The resulted model can provide smaller performance improvement than the [Basic 8-bit quantization](@ref basic_qauntization_flow) flow because some of the operations are kept in the original precision.
-
-> **NOTE**: Currently, this flow is available only for models in OpenVINO representation.
-
-The steps for the quantizatation with accuracy control are described below.
+@sphinxdirective
+
+Introduction
+####################
+
+This is the advanced quantization flow that allows you to apply 8-bit quantization to the model with control of the accuracy metric. This is achieved by keeping the most impactful operations within the model in the original precision. The flow is based on the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` and has the following differences:
+
+* Besides the calibration dataset, a **validation dataset** is required to compute the accuracy metric. They can refer to the same data in the simplest case.
+* A **validation function**, used to compute the accuracy metric, is required. It can be a function that is already available in the source framework or a custom function.
+* Since accuracy validation is run several times during the quantization process, quantization with accuracy control can take more time than the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` flow.
+* The resulting model can provide a smaller performance improvement than the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` flow because some of the operations are kept in the original precision.
+
+.. note:: Currently, this flow is available only for models in OpenVINO representation.
+
+The steps for the quantization with accuracy control are described below.
 
-## Prepare datasets
-
-This step is similar to the [Basic 8-bit quantization](@ref basic_qauntization_flow) flow. The only difference is that two datasets, calibration and validation, are required.
-
-@sphinxtabset
-
-@sphinxtab{OpenVINO}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py dataset
-
-@endsphinxtab
-
-@endsphinxtabset
+Prepare datasets
+####################
+
+This step is similar to the :doc:`Basic 8-bit quantization <basic_qauntization_flow>` flow. The only difference is that two datasets, calibration and validation, are required.
+
+.. tab:: OpenVINO
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py
+      :language: python
+      :fragment: [dataset]
 
-## Prepare validation function
-
-Validation funtion receives `openvino.runtime.CompiledModel` object and
-validation dataset and returns accuracy metric value. The following code snippet shows an example of validation function for OpenVINO model:
-
-@sphinxtabset
-
-@sphinxtab{OpenVINO}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py validation
-
-@endsphinxtab
-
-@endsphinxtabset
+Prepare validation function
+###########################
+
+The validation function receives an ``openvino.runtime.CompiledModel`` object and a validation dataset and returns the accuracy metric value. The following code snippet shows an example of a validation function for an OpenVINO model:
+
+.. tab:: OpenVINO
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py
+      :language: python
+      :fragment: [validation]
 
-## Run quantization with accuracy control
-
-Now, you can run quantization with accuracy control. The following code snippet shows an example of quantization with accuracy control for OpenVINO model:
-
-@sphinxtabset
-
-@sphinxtab{OpenVINO}
-
-@snippet docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py quantization
-
-@endsphinxtab
-
-@endsphinxtabset
+Run quantization with accuracy control
+######################################
+
+Now, you can run quantization with accuracy control. The following code snippet shows an example of quantization with accuracy control for an OpenVINO model:
+
+.. tab:: OpenVINO
+
+   .. doxygensnippet:: docs/optimization_guide/nncf/ptq/code/ptq_aa_openvino.py
+      :language: python
+      :fragment: [quantization]
 
-`max_drop` defines the accuracy drop threshold. The quantization process stops when the degradation of accuracy metric on the validation dataset is less than the `max_drop`.
+``max_drop`` defines the accuracy drop threshold. The quantization process stops when the degradation of accuracy metric on the validation dataset is less than the ``max_drop``.
 
-`nncf.quantize_with_accuracy_control()` API supports all the parameters of `nncf.quantize()` API. For example, you can use `nncf.quantize_with_accuracy_control()` to quantize a model with a custom configuration.
+``nncf.quantize_with_accuracy_control()`` API supports all the parameters of ``nncf.quantize()`` API. For example, you can use ``nncf.quantize_with_accuracy_control()`` to quantize a model with a custom configuration.
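As an illustration of how these pieces fit together, here is a hedged sketch of a call to the accuracy-aware API; the dataset objects, the validation function, and the `max_drop` value are placeholders rather than content from the official snippets, and the keyword names follow the NNCF API as understood at the time of writing:

```python
import nncf

# calibration_dataset and validation_dataset are nncf.Dataset objects;
# validate_fn(compiled_model, validation_dataset) returns the accuracy metric value.
quantized_model = nncf.quantize_with_accuracy_control(
    model,
    calibration_dataset=calibration_dataset,
    validation_dataset=validation_dataset,
    validation_fn=validate_fn,
    max_drop=0.01,   # stop when the accuracy degradation stays within 1%
)
```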
-## See also +See also +#################### -* [Optimizing Models at Training Time](@ref tmo_introduction) +* :doc:`Optimizing Models at Training Time ` +@endsphinxdirective diff --git a/docs/optimization_guide/nncf/qat.md b/docs/optimization_guide/nncf/qat.md index 88c4cfa57730a2..0ddf086921002c 100644 --- a/docs/optimization_guide/nncf/qat.md +++ b/docs/optimization_guide/nncf/qat.md @@ -1,172 +1,201 @@ # Quantization-aware Training (QAT) {#qat_introduction} -## Introduction -Quantization-aware Training is a popular method that allows quantizing a model and applying fine-tuning to restore accuracy degradation caused by quantization. In fact, this is the most accurate quantization method. This document describes how to apply QAT from the Neural Network Compression Framework (NNCF) to get 8-bit quantized models. This assumes that you are knowledgeable in Python* programming and familiar with the training code for the model in the source DL framework. +@sphinxdirective -## Using NNCF QAT -Here, we provide the steps that are required to integrate QAT from NNCF into the training script written with PyTorch or TensorFlow 2: +Introduction +#################### -> **NOTE**: Currently, NNCF for TensorFlow 2 supports optimization of the models created using Keras [Sequesntial API](https://www.tensorflow.org/guide/keras/sequential_model) or [Functional API](https://www.tensorflow.org/guide/keras/functional). +Quantization-aware Training is a popular method that allows quantizing a model and applying fine-tuning to restore accuracy +degradation caused by quantization. In fact, this is the most accurate quantization method. This document describes how to +apply QAT from the Neural Network Compression Framework (NNCF) to get 8-bit quantized models. This assumes that you are +knowledgeable in Python programming and familiar with the training code for the model in the source DL framework. -### 1. Import NNCF API -In this step, you add NNCF-related imports in the beginning of the training script: - -@sphinxtabset - -@sphinxtab{PyTorch} - -@snippet docs/optimization_guide/nncf/code/qat_torch.py imports - -@endsphinxtab - -@sphinxtab{TensorFlow 2} +Using NNCF QAT +#################### -@snippet docs/optimization_guide/nncf/code/qat_tf.py imports +Here, we provide the steps that are required to integrate QAT from NNCF into the training script written with +PyTorch or TensorFlow 2: -@endsphinxtab +.. note:: + Currently, NNCF for TensorFlow 2 supports optimization of the models created using Keras + `Sequential API `__ or + `Functional API `__. -@endsphinxtabset +1. Import NNCF API +++++++++++++++++++++ -### 2. Create NNCF configuration -Here, you should define NNCF configuration which consists of model-related parameters (`"input_info"` section) and parameters of optimization methods (`"compression"` section). For faster convergence, it is also recommended to register a dataset object specific to the DL framework. It will be used at the model creation step to initialize quantization parameters. - -@sphinxtabset +In this step, you add NNCF-related imports in the beginning of the training script: -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py nncf_congig + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [imports] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. 
doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [imports] -@snippet docs/optimization_guide/nncf/code/qat_tf.py nncf_congig -@endsphinxtab +2. Create NNCF configuration +++++++++++++++++++++++++++++ -@endsphinxtabset +Here, you should define NNCF configuration which consists of model-related parameters (``"input_info"`` section) and parameters +of optimization methods (``"compression"`` section). For faster convergence, it is also recommended to register a dataset object +specific to the DL framework. It will be used at the model creation step to initialize quantization parameters. -### 3. Apply optimization methods -In the next step, you need to wrap the original model object with the `create_compressed_model()` API using the configuration defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the same way as the original model. It is worth noting that optimization methods are applied at this step so that the model undergoes a set of corresponding transformations and can contain additional operations required for the optimization. In the case of QAT, the compression controller object is used for model export and, optionally, in distributed training as it will be shown below. +.. tab:: PyTorch -@sphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [nncf_congig] -@sphinxtab{PyTorch} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/qat_torch.py wrap_model + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [nncf_congig] -@endsphinxtab -@sphinxtab{TensorFlow 2} +3. Apply optimization methods ++++++++++++++++++++++++++++++ -@snippet docs/optimization_guide/nncf/code/qat_tf.py wrap_model +In the next step, you need to wrap the original model object with the ``create_compressed_model()`` API using the configuration +defined in the previous step. This method returns a so-called compression controller and a wrapped model that can be used the +same way as the original model. It is worth noting that optimization methods are applied at this step so that the model +undergoes a set of corresponding transformations and can contain additional operations required for the optimization. In +the case of QAT, the compression controller object is used for model export and, optionally, in distributed training as it +will be shown below. -@endsphinxtab +.. tab:: PyTorch -@endsphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [wrap_model] -### 4. Fine-tune the model -This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. In the case of QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, you can skip this step which means that the post-training optimization will be applied to the model. +.. tab:: TensorFlow 2 -@sphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [wrap_model] -@sphinxtab{PyTorch} -@snippet docs/optimization_guide/nncf/code/qat_torch.py tune_model +4. Fine-tune the model +++++++++++++++++++++++ -@endsphinxtab +This step assumes that you will apply fine-tuning to the model the same way as it is done for the baseline model. 
In the +case of QAT, it is required to train the model for a few epochs with a small learning rate, for example, 10e-5. In principle, +you can skip this step which means that the post-training optimization will be applied to the model. -@sphinxtab{TensorFlow 2} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_tf.py tune_model + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [tune_model] -@endsphinxtab +.. tab:: TensorFlow 2 -@endsphinxtabset + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [tune_model] -### 5. Multi-GPU distributed training -In the case of distributed multi-GPU training (not DataParallel), you should call `compression_ctrl.distributed()` before the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@sphinxtabset -@sphinxtab{PyTorch} +5. Multi-GPU distributed training ++++++++++++++++++++++++++++++++++ -@snippet docs/optimization_guide/nncf/code/qat_torch.py distributed +In the case of distributed multi-GPU training (not DataParallel), you should call ``compression_ctrl.distributed()`` before +the fine-tuning that will inform optimization methods to do some adjustments to function in the distributed mode. -@endsphinxtab +.. tab:: PyTorch -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [distributed] -@snippet docs/optimization_guide/nncf/code/qat_tf.py distributed +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [distributed] -@endsphinxtabset +6. Export quantized model ++++++++++++++++++++++++++ -### 6. Export quantized model -When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in the case of PyTorch and frozen graph - for TensorFlow 2. +When fine-tuning finishes, the quantized model can be exported to the corresponding format for further inference: ONNX in +the case of PyTorch and frozen graph - for TensorFlow 2. -@sphinxtabset +.. tab:: PyTorch -@sphinxtab{PyTorch} + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [export] -@snippet docs/optimization_guide/nncf/code/qat_torch.py export +.. tab:: TensorFlow 2 -@endsphinxtab + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [export] -@sphinxtab{TensorFlow 2} -@snippet docs/optimization_guide/nncf/code/qat_tf.py export +.. note:: + The precision of weigths gets INT8 only after the step of model conversion to OpenVINO Intermediate Representation. + You can expect the model footprint reduction only for that format. -@endsphinxtab -@endsphinxtabset +These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model +checkpoints during the training. Since NNCF wraps the original model with its own object it provides an API for these needs. -> **NOTE**: The precision of weigths gets INT8 only after the step of model conversion to OpenVINO Intermediate Representation. You can expect the model footprint reduction only for that format. -These were the basic steps to applying the QAT method from the NNCF. However, it is required in some cases to save/load model checkpoints during the training. 
Since NNCF wraps the original model with its own object it provides an API for these needs. +7. (Optional) Save checkpoint ++++++++++++++++++++++++++++++ -### 7. (Optional) Save checkpoint To save model checkpoint use the following API: -@sphinxtabset - -@sphinxtab{PyTorch} +.. tab:: PyTorch -@snippet docs/optimization_guide/nncf/code/qat_torch.py save_checkpoint + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [save_checkpoint] -@endsphinxtab +.. tab:: TensorFlow 2 -@sphinxtab{TensorFlow 2} + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [save_checkpoint] -@snippet docs/optimization_guide/nncf/code/qat_tf.py save_checkpoint -@endsphinxtab +8. (Optional) Restore from checkpoint ++++++++++++++++++++++++++++++++++++++ -@endsphinxtabset - -### 8. (Optional) Restore from checkpoint To restore the model from checkpoint you should use the following API: -@sphinxtabset +.. tab:: PyTorch + + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_torch.py + :language: python + :fragment: [load_checkpoint] -@sphinxtab{PyTorch} +.. tab:: TensorFlow 2 -@snippet docs/optimization_guide/nncf/code/qat_torch.py load_checkpoint + .. doxygensnippet:: docs/optimization_guide/nncf/code/qat_tf.py + :language: python + :fragment: [load_checkpoint] -@endsphinxtab -@sphinxtab{TensorFlow 2} +For more details on saving/loading checkpoints in the NNCF, see the following `documentation `__. -@snippet docs/optimization_guide/nncf/code/qat_tf.py load_checkpoint +Deploying quantized model +######################### -@endsphinxtab +The quantized model can be deployed with OpenVINO in the same way as the baseline model. No extra steps or options are +required in this case. For more details, see the corresponding :doc:`documentation `. -@endsphinxtabset +Examples +#################### -For more details on saving/loading checkpoints in the NNCF, see the following [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/Usage.md#saving-and-loading-compressed-models). +* `Quantizing PyTorch model with NNCF `__ -## Deploying quantized model -The quantized model can be deployed with OpenVINO in the same way as the baseline model. No extra steps or options are required in this case. For more details, see the corresponding [documentation](../../OV_Runtime_UG/openvino_intro.md). +* `Quantizing TensorFlow model with NNCF `__ -## Examples -- [Quantizing PyTorch model with NNCF](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/302-pytorch-quantization-aware-training) -- [Quantizing TensorFlow model with NNCF](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/305-tensorflow-quantization-aware-training) \ No newline at end of file +@endsphinxdirective diff --git a/docs/resources/prerelease_information.md b/docs/resources/prerelease_information.md new file mode 100644 index 00000000000000..7d260e086b7b25 --- /dev/null +++ b/docs/resources/prerelease_information.md @@ -0,0 +1,32 @@ +# Prerelease Information {#prerelease_information} + +@sphinxdirective + +OpenVINO follows a four-month release cycle, which means three major releases a year, +the last one being an LTS version. To ensure you do not have to wait long to test its new features, +OpenVINO developers continue to roll out prerelease versions. In this page you can find +a general changelog and the schedule for all versions for the current year. + +.. 
note:: + These versions are pre-release software and have not undergone full validation or qualification. OpenVINO™ toolkit pre-release is: + + * NOT to be incorporated into production software/solutions. + * NOT subject to official support. + * Subject to change in the future. + * Introduced to allow early testing and get early feedback from the community. + + +.. dropdown:: OpenVINO Toolkit 2023.0.0.dev20230217 + :open: + :animate: fade-in-slide-down + :color: primary + + OpenVINO™ repository tag: `2023.0.0.dev20230217 `__ + + * Enabled PaddlePaddle Framework 2.4 + * Preview of TensorFlow Lite Front End – Load models directly via “read_model” into OpenVINO Runtime and export OpenVINO IR format using Model Optimizer or “convert_model” + * Introduced new option ov::auto::enable_startup_fallback / ENABLE_STARTUP_FALLBACK to control whether to use CPU to accelerate first inference latency for accelerator HW devices like GPU. + * New FrontEndManager register_front_end(name, lib_path) interface added, to remove “OV_FRONTEND_PATH” env var (a way to load non-default frontends). + + +@endsphinxdirective \ No newline at end of file diff --git a/docs/resources/resources.md b/docs/resources/resources.md index ad5d806c036296..2f6056a9b78908 100644 --- a/docs/resources/resources.md +++ b/docs/resources/resources.md @@ -9,6 +9,7 @@ openvino_docs_performance_benchmarks openvino_ir + prerelease_information .. toctree:: :maxdepth: 1 diff --git a/docs/resources/supported_models.md b/docs/resources/supported_models.md index 907f220e6650d0..455a4d543d56f6 100644 --- a/docs/resources/supported_models.md +++ b/docs/resources/supported_models.md @@ -1,13 +1,12 @@ # Supported Models {#openvino_supported_models} +@sphinxdirective The OpenVINO team continues the effort to support as many models out-of-the-box as possible. Based on our research and user feedback, we prioritize the most common models and test them before every release. These models are considered officially supported. -@sphinxdirective - -.. button-link:: _static/download/OV_2022_models_supported.pdf +.. button-link:: _static/download/OV_2023_models_supported.pdf :color: primary :outline: @@ -18,36 +17,33 @@ before every release. These models are considered officially supported. | If your model is not included but is similar to those that are, it is still very likely to work. If your model fails to execute properly there are a few options available: -@endsphinxdirective - -* If the model originates from a framework like TensorFlow or PyTorch, OpenVINO™ offers a hybrid solution. The original model can be run without explicit conversion into the OpenVINO format. For more information, see [OpenVINO TensorFlow Integration](https://docs.openvino.ai/latest/ovtf_integration.html). +* If the model originates from a framework like TensorFlow or PyTorch, OpenVINO™ offers a hybrid solution. The original model can be run without explicit conversion into the OpenVINO format. For more information, see :ref:`OpenVINO TensorFlow Integration `. * You can create a GitHub request for the operation(s) that are missing. These requests are reviewed regularly. You will be informed if and how the request will be accommodated. Additionally, your request may trigger a reply from someone in the community who can help. -* As OpenVINO™ is open source you can enhance it with your own contribution to the GitHub repository. To learn more, see the articles on [OpenVINO Extensibility](https://docs.openvino.ai/latest/openvino_docs_Extensibility_UG_Intro.html). 
+* As OpenVINO™ is open source you can enhance it with your own contribution to the GitHub repository. To learn more, see the articles on :ref:`OpenVINO Extensibility`. The following table summarizes the number of models supported by OpenVINO™ in different categories: -@sphinxdirective +--------------------------------------------+-------------------+ | Model Categories: | Number of Models: | +============================================+===================+ -| Object Detection | 149 | +| Object Detection | 149 | +--------------------------------------------+-------------------+ | Instance Segmentation | 3 | +--------------------------------------------+-------------------+ | Semantic Segmentation | 19 | +--------------------------------------------+-------------------+ -| Image Processing, Enhancement | 16 | +| Image Processing, Enhancement | 16 | +--------------------------------------------+-------------------+ -| Monodepth | 2 | +| Monodepth | 2 | +--------------------------------------------+-------------------+ -| Colorization | 2 | +| Colorization | 2 | +--------------------------------------------+-------------------+ -| Behavior / Decision Prediction | 1 | +| Behavior / Decision Prediction | 1 | +--------------------------------------------+-------------------+ -| Action Recognition | 2 | +| Action Recognition | 2 | +--------------------------------------------+-------------------+ -| Time Series Forecasting | 1 | +| Time Series Forecasting | 1 | +--------------------------------------------+-------------------+ | Image Classification | 68 | +--------------------------------------------+-------------------+ @@ -55,14 +51,15 @@ The following table summarizes the number of models supported by OpenVINO™ in +--------------------------------------------+-------------------+ | Image Classification, Emotion | 1 | +--------------------------------------------+-------------------+ -| Image Translation | 1 | +| Image Translation | 1 | +--------------------------------------------+-------------------+ -| Natural language Processing | 35 | +| Natural language Processing | 35 | +--------------------------------------------+-------------------+ -| Text Detection | 18 | +| Text Detection | 18 | +--------------------------------------------+-------------------+ -| Audio Enhancement | 3 | +| Audio Enhancement | 3 | +--------------------------------------------+-------------------+ -| Sound Classification | 2 | +| Sound Classification | 2 | +--------------------------------------------+-------------------+ + @endsphinxdirective \ No newline at end of file diff --git a/install_build_dependencies.sh b/install_build_dependencies.sh index 7d76c26259c825..d7db483bc5fe6f 100755 --- a/install_build_dependencies.sh +++ b/install_build_dependencies.sh @@ -125,6 +125,46 @@ elif [ -f /etc/redhat-release ] || grep -q "rhel" /etc/os-release ; then `# samples and tools` \ zlib-devel \ gflags-devel +elif [ -f /etc/os-release ] && grep -q "SUSE" /etc/os-release ; then + zypper refresh + zypper install -y \ + file \ + `# build tools` \ + cmake \ + ccache \ + ninja \ + scons \ + gcc \ + gcc-c++ \ + make \ + `# to determine openvino version via git` \ + git \ + git-lfs \ + `# to build and check pip packages` \ + patchelf \ + fdupes \ + `# to build and check rpm packages` \ + rpm-build \ + rpmlint \ + `# check bash scripts for correctness` \ + ShellCheck \ + `# main openvino dependencies` \ + tbb-devel \ + pugixml-devel \ + `# GPU plugin dependency` \ + libva-devel \ + `# OpenCL for GPU` \ + ocl-icd-devel \ + 
opencl-cpp-headers \ + opencl-headers \ + `# python API` \ + python39-pip \ + python39-setuptools \ + python39-devel \ + `# samples and tools` \ + zlib-devel \ + gflags-devel-static \ + nlohmann_json-devel elif [ -f /etc/os-release ] && grep -q "raspbian" /etc/os-release; then # Raspbian apt update @@ -176,8 +216,10 @@ if [ ! "$(printf '%s\n' "$required_cmake_ver" "$current_cmake_ver" | sort -V | h if command -v apt-get &> /dev/null; then apt-get install -y --no-install-recommends wget - else + elif command -v yum &> /dev/null; then yum install -y wget + elif command -v zypper &> /dev/null; then + zypper in -y wget fi cmake_install_bin="cmake-${installed_cmake_ver}-linux-${arch}.sh" diff --git a/samples/cpp/benchmark_app/CMakeLists.txt b/samples/cpp/benchmark_app/CMakeLists.txt index 16d2bc8e53991e..6939dd118bd61b 100644 --- a/samples/cpp/benchmark_app/CMakeLists.txt +++ b/samples/cpp/benchmark_app/CMakeLists.txt @@ -26,6 +26,7 @@ if(NOT TARGET nlohmann_json::nlohmann_json) if(TARGET nlohmann_json) # Ubuntu 18.04 case where target 'nlohmann_json' is here, but nlohmann_json_FOUND is OFF if(NOT TARGET nlohmann_json::nlohmann_json) + set_target_properties(nlohmann_json PROPERTIES IMPORTED_GLOBAL ON) add_library(nlohmann_json::nlohmann_json ALIAS nlohmann_json) endif() set(nlohmann_json_FOUND ON) diff --git a/samples/cpp/speech_sample/CMakeLists.txt b/samples/cpp/speech_sample/CMakeLists.txt index caab61495d0495..2b99a9fe1367d2 100644 --- a/samples/cpp/speech_sample/CMakeLists.txt +++ b/samples/cpp/speech_sample/CMakeLists.txt @@ -15,8 +15,8 @@ endif() if(NOT TARGET zlib::zlib) if(PkgConfig_FOUND) pkg_search_module(zlib QUIET - IMPORTED_TARGET GLOBAL - zlib) + IMPORTED_TARGET GLOBAL + zlib) if(zlib_FOUND) add_library(zlib::zlib ALIAS PkgConfig::zlib) endif() diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index c913fd1872bdf4..925275afe1d67f 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -15,11 +15,6 @@ set(shellcheck_skip_list "${OpenVINO_SOURCE_DIR}/src/bindings/python/thirdparty/pybind11" "${TEMP}") -if(shellcheck_VERSION VERSION_GREATER_EQUAL 0.7.0) - list(APPEND shellcheck_skip_list - "${OpenVINO_SOURCE_DIR}/scripts/setupvars/setupvars.sh") -endif() - ie_shellcheck_process(DIRECTORY "${OpenVINO_SOURCE_DIR}" SKIP ${shellcheck_skip_list}) diff --git a/scripts/install_dependencies/install_openvino_dependencies.sh b/scripts/install_dependencies/install_openvino_dependencies.sh index d8ee4b92660894..b31b86bd0fd19b 100755 --- a/scripts/install_dependencies/install_openvino_dependencies.sh +++ b/scripts/install_dependencies/install_openvino_dependencies.sh @@ -95,6 +95,7 @@ if [ "$os" == "auto" ] ; then case $os in centos7|centos8|rhel8|rhel9.1|\ almalinux8.7|amzn2|\ + opensuse-leap15.3| \ fedora34|fedora35|fedora36|fedora37|fedora38|\ raspbian9|debian9|ubuntu18.04|\ raspbian10|debian10|ubuntu20.04|ubuntu20.10|ubuntu21.04|\ @@ -216,6 +217,11 @@ elif [ "$os" == "centos7" ] || [ "$os" == "centos8" ] || pkgs_dev+=("https://download-ib01.fedoraproject.org/pub/epel/9/Everything/$arch/Packages/g/gflags-devel-2.2.2-9.el9.$arch.rpm") extra_repos+=("https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm") fi +elif [ "$os" == "opensuse-leap15.3" ] ; then + pkgs_core=(libtbb2 libtbbmalloc2 libpugixml1) + pkgs_gpu=() + pkgs_python=(python39-base python39 python39-venv python39-pip) + pkgs_dev=(cmake pkg-config gcc-c++ gcc gflags-devel-static zlib-devel nlohmann_json-devel make curl sudo) else echo "Internal script error: invalid OS (${os}) after check 
(package selection)" >&2 exit 3 @@ -280,6 +286,14 @@ elif [ "$os" == "centos7" ] || [ "$os" == "centos8" ] || yum install "$iopt" "${pkgs[@]}" +elif [ "$os" == "opensuse-leap15.3" ] ; then + + [ -z "$interactive" ] && iopt="-y" + [ -n "$dry" ] && iopt="--dry-run" + [ -n "$keepcache" ] && zypper clean --all + + zypper ref && zypper in --auto-agree-with-licenses --no-recommends "$iopt" "${pkgs[@]}" + else echo "Internal script error: invalid OS (${os}) after check (package installation)" >&2 exit 3 diff --git a/scripts/setupvars/setupvars.sh b/scripts/setupvars/setupvars.sh index 169b7f12cefd51..41789160e69a59 100755 --- a/scripts/setupvars/setupvars.sh +++ b/scripts/setupvars/setupvars.sh @@ -3,7 +3,13 @@ # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -SCRIPT_DIR="$( cd "$( dirname "$(realpath "${BASH_SOURCE[0]}")" )" >/dev/null 2>&1 && pwd )" +abs_path () { + path=$(eval echo "$1") + directory=$(dirname "$path") + echo "$(cd "$directory" || exit; pwd -P)/$(basename "$path")"; +} + +SCRIPT_DIR="$( cd "$( dirname "$(abs_path "${BASH_SOURCE[0]}")" )" >/dev/null 2>&1 && pwd )" INSTALLDIR="${SCRIPT_DIR}" export INTEL_OPENVINO_DIR="$INSTALLDIR" @@ -79,10 +85,12 @@ fi # OpenCV environment if [ -f "$INSTALLDIR/opencv/setupvars.sh" ]; then + # shellcheck source=/dev/null source "$INSTALLDIR/opencv/setupvars.sh" fi if [ -f "$INSTALLDIR/extras/opencv/setupvars.sh" ]; then + # shellcheck source=/dev/null source "$INSTALLDIR/extras/opencv/setupvars.sh" fi @@ -97,23 +105,12 @@ MAX_SUPPORTED_PYTHON_VERSION_MINOR="10" check_python_version () { if [ -z "$python_version" ]; then - python_version=$(python3 -c 'import sys; print(str(sys.version_info[0])+"."+str(sys.version_info[1]))') - fi - - # splitting Python version variable depending on the used shell - if [ -n "$ZSH_VERSION" ]; then - version_arr=(${(@s:.:)python_version}) - if [ "${#version_arr[@]}" -ge "2" ]; then - # zsh starts indexing from 1 - python_version_major=${version_arr[1]} - python_version_minor=${version_arr[2]} - fi + python_version_major=$( python3 -c 'import sys; print(str(sys.version_info[0]))' ) + python_version_minor=$( python3 -c 'import sys; print(str(sys.version_info[1]))' ) + python_version="$python_version_major.$python_version_minor" else - version_arr=(${python_version//./ }) - if [ "${#version_arr[@]}" -ge "2" ]; then - python_version_major=${version_arr[0]} - python_version_minor=${version_arr[1]} - fi + python_version_major=$( python3 -c "import sys; print(str(\"${python_version}\".split('.')[0]))" ) + python_version_minor=$( python3 -c "import sys; print(str(\"${python_version}\".split('.')[1]))" ) fi if [ "$PYTHON_VERSION_MAJOR" != "$python_version_major" ] || diff --git a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py index 914f34d2480303..95490cbf98acb4 100644 --- a/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py +++ b/src/bindings/python/src/openvino/runtime/utils/data_helpers/data_dispatcher.py @@ -212,7 +212,7 @@ def update_tensor( key: Optional[ValidKeys] = None, ) -> None: if hasattr(inputs, "__array__"): - update_tensor(normalize_arrays(inputs, is_shared=False), request, key=None) + update_tensor(normalize_arrays(inputs, is_shared=False), request, key) return None raise TypeError(f"Incompatible inputs of type: {type(inputs)} under {key} key!") diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp 
b/src/bindings/python/src/pyopenvino/core/common.cpp index 2c7ec5653571ae..434433cb28dc6c 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -6,6 +6,7 @@ #include +#include "Python.h" #include "openvino/util/common_util.hpp" #define C_CONTIGUOUS py::detail::npy_api::constants::NPY_ARRAY_C_CONTIGUOUS_ @@ -51,78 +52,22 @@ const std::map& dtype_to_ov_type() { return dtype_to_ov_type_mapping; } -ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& type) { - bool is_contiguous = C_CONTIGUOUS == (array.flags() & C_CONTIGUOUS); - auto element_type = (type == ov::element::undefined) ? Common::dtype_to_ov_type().at(py::str(array.dtype())) : type; +namespace array_helpers { - if (is_contiguous) { - return ov::Tensor(element_type, shape, const_cast(array.data(0)), {}); - } else { - throw ov::Exception("Tensor with shared memory must be C contiguous!"); - } +bool is_contiguous(const py::array& array) { + return C_CONTIGUOUS == (array.flags() & C_CONTIGUOUS); } -ov::Tensor tensor_from_numpy(py::array& array, bool shared_memory) { - // Check if passed array has C-style contiguous memory layout. - bool is_contiguous = C_CONTIGUOUS == (array.flags() & C_CONTIGUOUS); - auto type = Common::dtype_to_ov_type().at(py::str(array.dtype())); - std::vector shape(array.shape(), array.shape() + array.ndim()); +ov::element::Type get_ov_type(const py::array& array) { + return Common::dtype_to_ov_type().at(py::str(array.dtype())); +} - // If memory is going to be shared it needs to be contiguous before - // passing to the constructor. This case should be handled by advanced - // users on their side of the code. - if (shared_memory) { - if (is_contiguous) { - std::vector strides(array.strides(), array.strides() + array.ndim()); - return ov::Tensor(type, shape, const_cast(array.data(0)), strides); - } else { - throw ov::Exception("Tensor with shared memory must be C contiguous!"); - } - } - // Convert to contiguous array if not already C-style. - if (!is_contiguous) { - array = Common::as_contiguous(array, type); - } - // Create actual Tensor and copy data. - auto tensor = ov::Tensor(type, shape); - // If ndim of py::array is 0, array is a numpy scalar. That results in size to be equal to 0. - // To gain access to actual raw/low-level data, it is needed to use buffer protocol. - py::buffer_info buf = array.request(); - std::memcpy(tensor.data(), buf.ptr, buf.ndim == 0 ? 
buf.itemsize : buf.itemsize * buf.size); - return tensor; +std::vector get_shape(const py::array& array) { + return std::vector(array.shape(), array.shape() + array.ndim()); } -ov::PartialShape partial_shape_from_list(const py::list& shape) { - using value_type = ov::Dimension::value_type; - ov::PartialShape pshape; - for (py::handle dim : shape) { - if (py::isinstance(dim)) { - pshape.insert(pshape.end(), ov::Dimension(dim.cast())); - } else if (py::isinstance(dim)) { - pshape.insert(pshape.end(), ov::Dimension(dim.cast())); - } else if (py::isinstance(dim)) { - pshape.insert(pshape.end(), dim.cast()); - } else if (py::isinstance(dim) || py::isinstance(dim)) { - py::list bounded_dim = dim.cast(); - if (bounded_dim.size() != 2) { - throw py::type_error("Two elements are expected in tuple(lower, upper) for dynamic dimension, but " + - std::to_string(bounded_dim.size()) + " elements were given."); - } - if (!(py::isinstance(bounded_dim[0]) && py::isinstance(bounded_dim[1]))) { - throw py::type_error("Incorrect pair of types (" + std::string(bounded_dim[0].get_type().str()) + ", " + - std::string(bounded_dim[1].get_type().str()) + - ") for dynamic dimension, ints are expected."); - } - pshape.insert(pshape.end(), - ov::Dimension(bounded_dim[0].cast(), bounded_dim[1].cast())); - } else { - throw py::type_error("Incorrect type " + std::string(dim.get_type().str()) + - " for dimension. Expected types are: " - "int, str, openvino.runtime.Dimension, list/tuple with lower and upper values for " - "dynamic dimension."); - } - } - return pshape; +std::vector get_strides(const py::array& array) { + return std::vector(array.strides(), array.strides() + array.ndim()); } py::array as_contiguous(py::array& array, ov::element::Type type) { @@ -165,6 +110,120 @@ py::array as_contiguous(py::array& array, ov::element::Type type) { } } +}; // namespace array_helpers + +template <> +ov::op::v0::Constant create_copied(py::array& array) { + // Convert to contiguous array if not already in C-style. + if (!array_helpers::is_contiguous(array)) { + array = array_helpers::as_contiguous(array, array_helpers::get_ov_type(array)); + } + // Create actual Constant and a constructor is copying data. + return ov::op::v0::Constant(array_helpers::get_ov_type(array), + array_helpers::get_shape(array), + const_cast(array.data(0))); +} + +template <> +ov::op::v0::Constant create_copied(ov::Tensor& tensor) { + // Create actual Constant and a constructor is copying data. + return ov::op::v0::Constant(tensor.get_element_type(), tensor.get_shape(), const_cast(tensor.data())); +} + +template <> +ov::op::v0::Constant create_shared(py::array& array) { + // Check if passed array has C-style contiguous memory layout. + // If memory is going to be shared it needs to be contiguous before passing to the constructor. + if (array_helpers::is_contiguous(array)) { + auto memory = + std::make_shared>(static_cast(array.mutable_data(0)), + array.nbytes(), + array); + return ov::op::v0::Constant(array_helpers::get_ov_type(array), array_helpers::get_shape(array), memory); + } + // If passed array is not C-style, throw an error. + throw ov::Exception( + "SHARED MEMORY MODE FOR THIS CONSTANT IS NOT APPLICABLE! Passed numpy array must be C contiguous."); +} + +template <> +ov::op::v0::Constant create_shared(ov::Tensor& tensor) { + return ov::op::v0::Constant(tensor); +} + +template <> +ov::Tensor create_copied(py::array& array) { + // Convert to contiguous array if not already in C-style. 
+ if (!array_helpers::is_contiguous(array)) { + array = array_helpers::as_contiguous(array, array_helpers::get_ov_type(array)); + } + // Create actual Tensor and copy data. + auto tensor = ov::Tensor(array_helpers::get_ov_type(array), array_helpers::get_shape(array)); + // If ndim of py::array is 0, array is a numpy scalar. That results in size to be equal to 0. + // To gain access to actual raw/low-level data, it is needed to use buffer protocol. + py::buffer_info buf = array.request(); + std::memcpy(tensor.data(), buf.ptr, buf.ndim == 0 ? buf.itemsize : buf.itemsize * buf.size); + return tensor; +} + +template <> +ov::Tensor create_shared(py::array& array) { + // Check if passed array has C-style contiguous memory layout. + // If memory is going to be shared it needs to be contiguous before passing to the constructor. + if (array_helpers::is_contiguous(array)) { + return ov::Tensor(array_helpers::get_ov_type(array), + array_helpers::get_shape(array), + const_cast(array.data(0)), + array_helpers::get_strides(array)); + } + // If passed array is not C-style, throw an error. + throw ov::Exception( + "SHARED MEMORY MODE FOR THIS TENSOR IS NOT APPLICABLE! Passed numpy array must be C contiguous."); +} + +ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& type) { + auto element_type = (type == ov::element::undefined) ? Common::dtype_to_ov_type().at(py::str(array.dtype())) : type; + + if (array_helpers::is_contiguous(array)) { + return ov::Tensor(element_type, shape, const_cast(array.data(0)), {}); + } + throw ov::Exception( + "SHARED MEMORY MODE FOR THIS TENSOR IS NOT APPLICABLE! Passed numpy array must be C contiguous."); +} + +ov::PartialShape partial_shape_from_list(const py::list& shape) { + using value_type = ov::Dimension::value_type; + ov::PartialShape pshape; + for (py::handle dim : shape) { + if (py::isinstance(dim)) { + pshape.insert(pshape.end(), ov::Dimension(dim.cast())); + } else if (py::isinstance(dim)) { + pshape.insert(pshape.end(), ov::Dimension(dim.cast())); + } else if (py::isinstance(dim)) { + pshape.insert(pshape.end(), dim.cast()); + } else if (py::isinstance(dim) || py::isinstance(dim)) { + py::list bounded_dim = dim.cast(); + if (bounded_dim.size() != 2) { + throw py::type_error("Two elements are expected in tuple(lower, upper) for dynamic dimension, but " + + std::to_string(bounded_dim.size()) + " elements were given."); + } + if (!(py::isinstance(bounded_dim[0]) && py::isinstance(bounded_dim[1]))) { + throw py::type_error("Incorrect pair of types (" + std::string(bounded_dim[0].get_type().str()) + ", " + + std::string(bounded_dim[1].get_type().str()) + + ") for dynamic dimension, ints are expected."); + } + pshape.insert(pshape.end(), + ov::Dimension(bounded_dim[0].cast(), bounded_dim[1].cast())); + } else { + throw py::type_error("Incorrect type " + std::string(dim.get_type().str()) + + " for dimension. 
Expected types are: " + "int, str, openvino.runtime.Dimension, list/tuple with lower and upper values for " + "dynamic dimension."); + } + } + return pshape; +} + const ov::Tensor& cast_to_tensor(const py::handle& tensor) { return tensor.cast(); } diff --git a/src/bindings/python/src/pyopenvino/core/common.hpp b/src/bindings/python/src/pyopenvino/core/common.hpp index 9d363ded0fe494..910d9e55e966ed 100644 --- a/src/bindings/python/src/pyopenvino/core/common.hpp +++ b/src/bindings/python/src/pyopenvino/core/common.hpp @@ -10,8 +10,10 @@ #include #include +#include #include #include +#include #include "Python.h" #include "openvino/runtime/compiled_model.hpp" @@ -20,22 +22,62 @@ #include "openvino/pass/serialize.hpp" #include "pyopenvino/core/containers.hpp" #include "pyopenvino/graph/any.hpp" +#include "pyopenvino/graph/ops/constant.hpp" namespace py = pybind11; namespace Common { + +namespace values { + +// Minimum amount of bits for common numpy types. Used to perform checks against OV types. +constexpr size_t min_bitwidth = sizeof(int8_t) * CHAR_BIT; + +}; // namespace values + const std::map& ov_type_to_dtype(); const std::map& dtype_to_ov_type(); -ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& ov_type); +// Helpers for numpy arrays +namespace array_helpers { -ov::Tensor tensor_from_numpy(py::array& array, bool shared_memory); +bool is_contiguous(const py::array& array); -ov::PartialShape partial_shape_from_list(const py::list& shape); +ov::element::Type get_ov_type(const py::array& array); + +std::vector get_shape(const py::array& array); + +std::vector get_strides(const py::array& array); py::array as_contiguous(py::array& array, ov::element::Type type); +}; // namespace array_helpers + +template +T create_copied(py::array& array); + +template +T create_copied(ov::Tensor& array); + +template +T create_shared(py::array& array); + +template +T create_shared(ov::Tensor& array); + +template +T object_from_data(D& data, bool shared_memory) { + if (shared_memory) { + return create_shared(data); + } + return create_copied(data); +} + +ov::Tensor tensor_from_pointer(py::array& array, const ov::Shape& shape, const ov::element::Type& ov_type); + +ov::PartialShape partial_shape_from_list(const py::list& shape); + const ov::Tensor& cast_to_tensor(const py::handle& tensor); const Containers::TensorNameMap cast_to_tensor_name_map(const py::dict& inputs); diff --git a/src/bindings/python/src/pyopenvino/core/tensor.cpp b/src/bindings/python/src/pyopenvino/core/tensor.cpp index 397b5d4a73879b..8bdc90c58a9265 100644 --- a/src/bindings/python/src/pyopenvino/core/tensor.cpp +++ b/src/bindings/python/src/pyopenvino/core/tensor.cpp @@ -17,7 +17,7 @@ void regclass_Tensor(py::module m) { cls.doc() = "openvino.runtime.Tensor holding either copy of memory or shared host memory."; cls.def(py::init([](py::array& array, bool shared_memory) { - return Common::tensor_from_numpy(array, shared_memory); + return Common::object_from_data(array, shared_memory); }), py::arg("array"), py::arg("shared_memory") = false, @@ -209,7 +209,7 @@ void regclass_Tensor(py::module m) { [](ov::Tensor& self) { auto ov_type = self.get_element_type(); auto dtype = Common::ov_type_to_dtype().at(ov_type); - if (ov_type.bitwidth() < 8) { + if (ov_type.bitwidth() < Common::values::min_bitwidth) { return py::array(dtype, self.get_byte_size(), self.data(), py::cast(self)); } return py::array(dtype, self.get_shape(), self.get_strides(), self.data(), py::cast(self)); diff --git 
a/src/bindings/python/src/pyopenvino/frontend/input_model.cpp b/src/bindings/python/src/pyopenvino/frontend/input_model.cpp index 6069ceef2371cb..8e47b02bb7508a 100644 --- a/src/bindings/python/src/pyopenvino/frontend/input_model.cpp +++ b/src/bindings/python/src/pyopenvino/frontend/input_model.cpp @@ -310,7 +310,7 @@ void regclass_frontend_InputModel(py::module m) { "set_tensor_value", [](ov::frontend::InputModel& self, const ov::frontend::Place::Ptr& place, py::array& value) { // Convert to contiguous array if not already C-style. - auto tensor = Common::tensor_from_numpy(value, false); + auto tensor = Common::object_from_data(value, false); self.set_tensor_value(place, (const void*)tensor.data()); }, py::arg("place"), diff --git a/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp b/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp index 2e558242351b23..c98dba0db39390 100644 --- a/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp +++ b/src/bindings/python/src/pyopenvino/graph/ops/constant.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "openvino/op/constant.hpp" +#include "pyopenvino/graph/ops/constant.hpp" #include #include @@ -10,10 +10,10 @@ #include #include -#include #include "openvino/core/shape.hpp" -#include "pyopenvino/graph/ops/constant.hpp" +#include "openvino/runtime/tensor.hpp" +#include "pyopenvino/core/common.hpp" namespace py = pybind11; @@ -27,6 +27,38 @@ std::vector _get_byte_strides(const ov::Shape& s) { return byte_strides; } +std::vector _get_strides(const ov::op::v0::Constant& self) { + auto element_type = self.get_element_type(); + auto shape = self.get_shape(); + if (element_type == ov::element::boolean) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::f16) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::f32) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::f64) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i8) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i16) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i32) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::i64) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u8 || element_type == ov::element::u1) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u16) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u32) { + return _get_byte_strides(shape); + } else if (element_type == ov::element::u64) { + return _get_byte_strides(shape); + } else { + throw std::runtime_error("Unsupported data type!"); + } +} + template py::buffer_info _get_buffer_info(const ov::op::v0::Constant& c) { ov::Shape shape = c.get_shape(); @@ -68,6 +100,18 @@ void regclass_graph_op_Constant(py::module m) { "Constant", py::buffer_protocol()); constant.doc() = "openvino.runtime.op.Constant wraps ov::op::v0::Constant"; + // Numpy-based constructor + constant.def(py::init([](py::array& array, bool shared_memory) { + return Common::object_from_data(array, shared_memory); + }), + py::arg("array"), + py::arg("shared_memory") = false); + // Tensor-based constructors + constant.def(py::init([](ov::Tensor& tensor, bool shared_memory) { + return Common::object_from_data(tensor, shared_memory); + }), + py::arg("tensor"), + py::arg("shared_memory") = false); constant.def(py::init&>()); 
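The numpy- and Tensor-based constructors registered above expose a shared_memory switch on the Python side. A minimal Python-level sketch of the behaviour they enable, mirroring the new tests in test_runtime/test_memory_modes.py (the array shape and values below are illustrative only):

    import numpy as np

    from openvino.runtime import Tensor
    from openvino.runtime.op import Constant

    arr = np.ascontiguousarray(np.arange(12, dtype=np.float32).reshape(3, 4))

    # shared_memory=True shares the C-contiguous numpy buffer instead of copying it
    shared_const = Constant(array=arr, shared_memory=True)
    assert np.shares_memory(arr, shared_const.data)

    # shared_memory=False (the default) copies the data into the Constant
    copied_const = Constant(array=arr, shared_memory=False)
    assert not np.shares_memory(arr, copied_const.data)

    # a Constant can also be created from an existing Tensor
    ov_tensor = Tensor(arr, shared_memory=True)
    const_from_tensor = Constant(tensor=ov_tensor, shared_memory=True)
    assert np.shares_memory(ov_tensor.data, const_from_tensor.data)

    # non C-contiguous input cannot be shared; the binding raises instead of copying silently
    try:
        Constant(array=np.asfortranarray(arr), shared_memory=True)
    except RuntimeError as err:
        print(err)  # SHARED MEMORY MODE FOR THIS CONSTANT IS NOT APPLICABLE! ...

With the default copy mode the resulting Constant owns its data, so it stays valid even if the source array is released; shared mode keeps the numpy buffer alive through the stored reference, as the bindings above arrange.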
constant.def(py::init&>()); constant.def(py::init&>()); @@ -151,4 +195,26 @@ void regclass_graph_op_Constant(py::module m) { throw std::runtime_error("Unsupported data type!"); } }); + + constant.def_property_readonly( + "data", + [](ov::op::v0::Constant& self) { + auto ov_type = self.get_element_type(); + auto dtype = Common::ov_type_to_dtype().at(ov_type); + if (ov_type.bitwidth() < Common::values::min_bitwidth) { + return py::array(dtype, self.get_byte_size(), self.get_data_ptr(), py::cast(self)); + } + return py::array(dtype, self.get_shape(), _get_strides(self), self.get_data_ptr(), py::cast(self)); + }, + R"( + Access to Constant's data. + + Returns numpy array with corresponding shape and dtype. + For Constants with openvino specific element type, such as u1, + it returns linear array, with uint8 / int8 numpy dtype. + + Note: this access method reflects shared memory if it was applied during initialization. + + :rtype: numpy.array + )"); } diff --git a/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp b/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp index 5b175e8c09d682..cb7d457b1296ad 100644 --- a/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp +++ b/src/bindings/python/src/pyopenvino/graph/ops/constant.hpp @@ -4,8 +4,14 @@ #pragma once +#include + #include +#include "openvino/op/constant.hpp" + namespace py = pybind11; +std::vector _get_strides(const ov::op::v0::Constant& self); + void regclass_graph_op_Constant(py::module m); diff --git a/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp b/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp index 335f8e8c530989..31aec4a66a4297 100644 --- a/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp +++ b/src/bindings/python/src/pyopenvino/graph/preprocess/pre_post_process.cpp @@ -322,7 +322,7 @@ static void regclass_graph_InputTensorInfo(py::module m) { "set_from", [](ov::preprocess::InputTensorInfo& self, py::array& numpy_array) { // Convert to contiguous array if not already C-style. 
- return &self.set_from(Common::tensor_from_numpy(numpy_array, false)); + return &self.set_from(Common::object_from_data(numpy_array, false)); }, py::arg("runtime_tensor"), R"( diff --git a/src/bindings/python/src/pyopenvino/graph/util.cpp b/src/bindings/python/src/pyopenvino/graph/util.cpp index a5f2c473972a3e..35ea9003c70eb1 100644 --- a/src/bindings/python/src/pyopenvino/graph/util.cpp +++ b/src/bindings/python/src/pyopenvino/graph/util.cpp @@ -6,9 +6,12 @@ #include +#include + #include "openvino/core/graph_util.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/pass/manager.hpp" +#include "pyopenvino/graph/ops/constant.hpp" #include "pyopenvino/utils/utils.hpp" namespace py = pybind11; diff --git a/src/bindings/python/tests/test_graph/test_manager.py b/src/bindings/python/tests/test_graph/test_manager.py index a9b76538fa1a38..dad03fecaeaa2c 100644 --- a/src/bindings/python/tests/test_graph/test_manager.py +++ b/src/bindings/python/tests/test_graph/test_manager.py @@ -8,17 +8,26 @@ import numpy as np import pytest -import openvino.runtime.opset8 as ov -from openvino.runtime import Model +import openvino.runtime.opset10 as ops +from openvino.runtime import Core, Model from openvino.runtime.passes import Manager, Serialize, ConstantFolding, Version from tests.test_graph.util import count_ops_of_type -from openvino.runtime import Core from tests.test_utils.test_utils import create_filename_for_test +def create_model(): + shape = [100, 100, 2] + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") + parameter_c = ops.parameter(shape, dtype=np.float32, name="C") + model = ops.floor(ops.minimum(ops.abs(parameter_a), ops.multiply(parameter_b, parameter_c))) + func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + return func + + def test_constant_folding(): - node_constant = ov.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) - node_ceil = ov.ceiling(node_constant) + node_constant = ops.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) + node_ceil = ops.ceiling(node_constant) model = Model(node_ceil, [], "TestFunction") assert count_ops_of_type(model, node_ceil) == 1 @@ -43,9 +52,9 @@ def test_serialize_seperate_paths_kwargs(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) shape = [2, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") + parameter_c = ops.parameter(shape, dtype=np.float32, name="C") model = (parameter_a + parameter_b) * parameter_c func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") @@ -67,10 +76,10 @@ def test_serialize_seperate_paths_args(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) shape = [2, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") + parameter_c = 
ops.parameter(shape, dtype=np.float32, name="C") + parameter_d = ops.parameter(shape, dtype=np.float32, name="D") model = ((parameter_a + parameter_b) * parameter_c) / parameter_d func = Model(model, [parameter_a, parameter_b, parameter_c, parameter_d], "Model") @@ -92,8 +101,8 @@ def test_serialize_pass_mixed_args_kwargs(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) shape = [3, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") + parameter_a = ops.parameter(shape, dtype=np.float32, name="A") + parameter_b = ops.parameter(shape, dtype=np.float32, name="B") model = parameter_a - parameter_b func = Model(model, [parameter_a, parameter_b], "Model") @@ -114,20 +123,15 @@ def test_serialize_pass_mixed_args_kwargs(request, tmp_path): def test_serialize_pass_mixed_args_kwargs_v2(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + model = create_model() pass_manager = Manager() pass_manager.register_pass(Serialize(path_to_xml=xml_path, path_to_bin=bin_path)) - pass_manager.run_passes(func) + pass_manager.run_passes(model) res_model = core.read_model(model=xml_path, weights=bin_path) - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() + assert model.get_parameters() == res_model.get_parameters() + assert model.get_ordered_ops() == res_model.get_ordered_ops() os.remove(xml_path) os.remove(bin_path) @@ -146,8 +150,8 @@ def test_serialize_pass_wrong_num_of_args(request, tmp_path): # request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request def test_serialize_results(request, tmp_path): core = Core() - node_constant = ov.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) - node_ceil = ov.ceiling(node_constant) + node_constant = ops.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) + node_ceil = ops.ceiling(node_constant) func = Model(node_ceil, [], "Model") xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) @@ -165,73 +169,19 @@ def test_serialize_results(request, tmp_path): os.remove(bin_path) -# request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request -def test_serialize_pass_tuple(request, tmp_path): - core = Core() - xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") - pass_manager = Manager() - pass_manager.register_pass("Serialize", output_files=(xml_path, bin_path)) - pass_manager.run_passes(func) - - res_model = core.read_model(model=xml_path, weights=bin_path) - - assert 
func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() - - os.remove(xml_path) - os.remove(bin_path) - - # request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request def test_default_version(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + model = create_model() pass_manager = Manager() - pass_manager.register_pass("Serialize", output_files=(xml_path, bin_path)) - pass_manager.run_passes(func) - - res_model = core.read_model(model=xml_path, weights=bin_path) - - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() - - os.remove(xml_path) - os.remove(bin_path) - - -# request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request -def test_default_version_IR_V11_tuple(request, tmp_path): - core = Core() - xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") - pass_manager = Manager() - pass_manager.register_pass("Serialize", output_files=(xml_path, bin_path), version="IR_V11") - pass_manager.run_passes(func) + pass_manager.register_pass(Serialize(xml_path, bin_path)) + pass_manager.run_passes(model) res_model = core.read_model(model=xml_path, weights=bin_path) - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == res_model.get_ordered_ops() + assert model.get_parameters() == res_model.get_parameters() + assert model.get_ordered_ops() == res_model.get_ordered_ops() os.remove(xml_path) os.remove(bin_path) @@ -241,21 +191,15 @@ def test_default_version_IR_V11_tuple(request, tmp_path): def test_default_version_IR_V11_seperate_paths(request, tmp_path): core = Core() xml_path, bin_path = create_filename_for_test(request.node.name, tmp_path) - shape = [100, 100, 2] - parameter_a = ov.parameter(shape, dtype=np.float32, name="A") - parameter_b = ov.parameter(shape, dtype=np.float32, name="B") - parameter_c = ov.parameter(shape, dtype=np.float32, name="C") - parameter_d = ov.parameter(shape, dtype=np.float32, name="D") - model = ov.floor(ov.minimum(ov.abs(parameter_a), ov.multiply(parameter_b, parameter_c))) - func = Model(model, [parameter_a, parameter_b, parameter_c], "Model") + model = create_model() pass_manager = Manager() pass_manager.register_pass(Serialize(path_to_xml=xml_path, path_to_bin=bin_path, version=Version.IR_V11)) - pass_manager.run_passes(func) + pass_manager.run_passes(model) res_model = core.read_model(model=xml_path, weights=bin_path) - assert func.get_parameters() == res_model.get_parameters() - assert func.get_ordered_ops() == 
res_model.get_ordered_ops() + assert model.get_parameters() == res_model.get_parameters() + assert model.get_ordered_ops() == res_model.get_ordered_ops() os.remove(xml_path) os.remove(bin_path) diff --git a/src/bindings/python/tests/test_runtime/test_infer_request.py b/src/bindings/python/tests/test_runtime/test_infer_request.py index 5d9db0a461a456..b64623a4ad7c8d 100644 --- a/src/bindings/python/tests/test_runtime/test_infer_request.py +++ b/src/bindings/python/tests/test_runtime/test_infer_request.py @@ -925,6 +925,7 @@ def __array__(self): request, _, input_data = abs_model_with_data(device, Type.f32, np.single) model_input_object = ArrayLikeObject(input_data.tolist()) model_input_list = [ArrayLikeObject(input_data.tolist())] + model_input_dict = {0: ArrayLikeObject(input_data.tolist())} # Test single array-like object in InferRequest().Infer() res_object = request.infer(model_input_object, shared_memory=shared_flag) @@ -934,6 +935,10 @@ def __array__(self): res_list = request.infer(model_input_list) assert np.array_equal(res_list[request.model_outputs[0]], np.abs(input_data)) + # Test dict of array-like objects to use normalize_inputs() + res_dict = request.infer(model_input_dict) + assert np.array_equal(res_dict[request.model_outputs[0]], np.abs(input_data)) + @pytest.mark.parametrize("shared_flag", [True, False]) def test_array_like_input_async(device, shared_flag): diff --git a/src/bindings/python/tests/test_runtime/test_memory_modes.py b/src/bindings/python/tests/test_runtime/test_memory_modes.py new file mode 100644 index 00000000000000..ccbd44efa729bb --- /dev/null +++ b/src/bindings/python/tests/test_runtime/test_memory_modes.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest + +import openvino.runtime as ov +from openvino.runtime import Tensor +from openvino.runtime.op import Constant + +from tests.test_utils.test_utils import generate_image + + +@pytest.mark.parametrize(("cls", "cls_str"), [ + (Tensor, "TENSOR"), + (Constant, "CONSTANT"), +]) +def test_init_with_numpy_fail(cls, cls_str): + arr = np.asfortranarray(generate_image()) # F-style array + + with pytest.raises(RuntimeError) as e: + _ = cls(array=arr, shared_memory=True) + + assert "SHARED MEMORY MODE FOR THIS " + cls_str + " IS NOT APPLICABLE!" 
in str(e.value) + + +@pytest.mark.parametrize("cls", [Tensor, Constant]) +@pytest.mark.parametrize("shared_flag", [True, False]) +@pytest.mark.parametrize(("ov_type", "numpy_dtype"), [ + (ov.Type.f32, np.float32), + (ov.Type.f64, np.float64), + (ov.Type.f16, np.float16), + (ov.Type.i8, np.int8), + (ov.Type.u8, np.uint8), + (ov.Type.i32, np.int32), + (ov.Type.u32, np.uint32), + (ov.Type.i16, np.int16), + (ov.Type.u16, np.uint16), + (ov.Type.i64, np.int64), + (ov.Type.u64, np.uint64), + (ov.Type.boolean, bool), +]) +def test_with_numpy_memory(cls, shared_flag, ov_type, numpy_dtype): + arr = np.ascontiguousarray(generate_image().astype(numpy_dtype)) + ov_object = cls(array=arr, shared_memory=shared_flag) + + assert ov_object.get_element_type() == ov_type + assert tuple(ov_object.shape) == arr.shape + + assert isinstance(ov_object.data, np.ndarray) + assert ov_object.data.dtype == numpy_dtype + assert ov_object.data.shape == arr.shape + assert np.array_equal(ov_object.data, arr) + + if shared_flag is True: + assert np.shares_memory(arr, ov_object.data) + else: + assert not (np.shares_memory(arr, ov_object.data)) + + +@pytest.mark.parametrize("cls", [Tensor, Constant]) +@pytest.mark.parametrize("shared_flag", [True, False]) +def test_with_external_memory(cls, shared_flag): + class ArrayLikeObject: + # Array-like object to test inputs similar to torch.Tensor and tf.Tensor + def __init__(self, array) -> None: + self.data = array + + @property + def shape(self): + return self.data.shape + + @property + def dtype(self): + return self.data.dtype + + def to_numpy(self): + return self.data + + external_object = ArrayLikeObject(np.ascontiguousarray(generate_image())) + ov_object = cls(array=external_object.to_numpy(), shared_memory=shared_flag) + + assert np.array_equal(ov_object.data.dtype, external_object.dtype) + assert np.array_equal(ov_object.data.shape, external_object.shape) + assert np.array_equal(ov_object.data, external_object.to_numpy()) + + if shared_flag is True: + assert np.shares_memory(external_object.to_numpy(), ov_object.data) + else: + assert not (np.shares_memory(external_object.to_numpy(), ov_object.data)) + + +@pytest.mark.parametrize("cls", [Constant]) +@pytest.mark.parametrize("shared_flag_one", [True, False]) +@pytest.mark.parametrize("shared_flag_two", [True, False]) +@pytest.mark.parametrize(("ov_type", "numpy_dtype"), [ + (ov.Type.f32, np.float32), + (ov.Type.f64, np.float64), + (ov.Type.f16, np.float16), + (ov.Type.i8, np.int8), + (ov.Type.u8, np.uint8), + (ov.Type.i32, np.int32), + (ov.Type.u32, np.uint32), + (ov.Type.i16, np.int16), + (ov.Type.u16, np.uint16), + (ov.Type.i64, np.int64), + (ov.Type.u64, np.uint64), + (ov.Type.boolean, bool), +]) +def test_with_tensor_memory(cls, shared_flag_one, shared_flag_two, ov_type, numpy_dtype): + arr = np.ascontiguousarray(generate_image().astype(numpy_dtype)) + ov_tensor = Tensor(arr, shared_memory=shared_flag_one) + ov_object = cls(tensor=ov_tensor, shared_memory=shared_flag_two) + + # Case 1: all data is shared + if shared_flag_one is True and shared_flag_two is True: + assert np.shares_memory(arr, ov_object.data) + assert np.shares_memory(ov_tensor.data, ov_object.data) + # Case 2: data is shared only between object and Tensor + elif shared_flag_one is False and shared_flag_two is True: + assert not (np.shares_memory(arr, ov_object.data)) + assert np.shares_memory(ov_tensor.data, ov_object.data) + # Case 3: data is not shared, copy occurs in the object's constructor + else: + assert not (np.shares_memory(arr, ov_object.data)) + 
assert not (np.shares_memory(ov_tensor.data, ov_object.data)) diff --git a/src/bindings/python/tests/test_runtime/test_tensor.py b/src/bindings/python/tests/test_runtime/test_tensor.py index 9e4d0daf0fd4b1..f9eb556a15e482 100644 --- a/src/bindings/python/tests/test_runtime/test_tensor.py +++ b/src/bindings/python/tests/test_runtime/test_tensor.py @@ -148,13 +148,6 @@ def test_init_with_numpy_copy_memory(ov_type, numpy_dtype): assert ov_tensor.byte_size == arr.nbytes -def test_init_with_numpy_fail(): - arr = np.asfortranarray(generate_image()) - with pytest.raises(RuntimeError) as e: - _ = Tensor(array=arr, shared_memory=True) - assert "Tensor with shared memory must be C contiguous" in str(e.value) - - def test_init_with_roi_tensor(): array = np.random.normal(size=[1, 3, 48, 48]) ov_tensor1 = Tensor(array) diff --git a/src/bindings/python/tests/test_transformations/test_manager.py b/src/bindings/python/tests/test_transformations/test_manager.py index 1aa7cbb85d8dbe..d88863c43561a3 100644 --- a/src/bindings/python/tests/test_transformations/test_manager.py +++ b/src/bindings/python/tests/test_transformations/test_manager.py @@ -32,14 +32,10 @@ def test_registration_and_pass_name(): GraphRewrite().set_name("Anchor") BackwardGraphRewrite().set_name("BackAnchor") - # Preserve legacy behaviour when registered pass doesn't exist - # and in this case we shouldn't throw an exception. - manager.register_pass("NotExistingPass") - def test_negative_pass_registration(): manager = Manager() expect_exception(lambda: manager.register_pass(PatternReplacement)) expect_exception(lambda: manager.register_pass("PatternReplacement", PatternReplacement())) expect_exception(lambda: manager.register_pass("Serialize", Serialize("out.xml", "out.bin"))) - expect_exception(lambda: manager.register_pass("Serialize", "out.xml", "out.bin", "out.wrong")) + expect_exception(lambda: manager.register_pass(Serialize("out.xml", "out.bin", "out.wrong"))) diff --git a/src/bindings/python/tests/test_transformations/test_offline_api.py b/src/bindings/python/tests/test_transformations/test_offline_api.py index 1cae5c0af5ab8c..cf3089e30fc00b 100644 --- a/src/bindings/python/tests/test_transformations/test_offline_api.py +++ b/src/bindings/python/tests/test_transformations/test_offline_api.py @@ -6,7 +6,7 @@ import pytest import numpy as np from openvino.runtime import serialize -from openvino.offline_transformations import ( +from openvino._offline_transformations import ( apply_moc_transformations, apply_pot_transformations, apply_low_latency_transformation, diff --git a/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py b/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py index 27efce283adba4..2427d60f64d841 100644 --- a/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py +++ b/src/bindings/python/tests_compatibility/test_ngraph/test_einsum.py @@ -27,7 +27,7 @@ def einsum_op_exec(input_shapes: list, equation: str, data_type: np.dtype, ng_inputs = [] np_inputs = [] for i in range(num_inputs): - input_i = np.random.random_integers(10, size=input_shapes[i]).astype(data_type) + input_i = np.random.randint(1, 10 + 1, size=input_shapes[i]).astype(data_type) np_inputs.append(input_i) ng_inputs.append(ng.parameter(input_i.shape, dtype=data_type)) diff --git a/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py b/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py index f1b095b08ea5e9..4cd2bbcba2fc96 100644 --- 
a/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py +++ b/src/bindings/python/tests_compatibility/test_ngraph/test_ops_fused.py @@ -33,7 +33,7 @@ def test_elu_operator_with_scalar(): def test_fake_quantize(): - levels = np.float32(4) + levels = np.int32(4) data_shape = [1, 2, 3, 4] bound_shape = [] @@ -60,7 +60,7 @@ def test_fake_quantize(): def test_depth_to_space(): data_shape = [1, 4, 2, 3] mode = "blocks_first" - block_size = np.float32(2) + block_size = np.int32(2) parameter_data = ng.parameter(data_shape, name="Data", dtype=np.float32) diff --git a/src/bindings/python/wheel/requirements-dev.txt b/src/bindings/python/wheel/requirements-dev.txt index 38b09d5d1effb7..780102df1548f8 100644 --- a/src/bindings/python/wheel/requirements-dev.txt +++ b/src/bindings/python/wheel/requirements-dev.txt @@ -1,3 +1,3 @@ setuptools>=53.0.0 wheel>=0.38.1 -patchelf; sys_platform == 'linux' and platform_machine == 'x86_64' or sys_platform == 'linux' and platform_machine == 'aarch64' +patchelf; sys_platform == 'linux' and platform_machine == 'x86_64' diff --git a/src/cmake/openvino.cmake b/src/cmake/openvino.cmake index 336b50fdbc16f3..5f07f131a6686d 100644 --- a/src/cmake/openvino.cmake +++ b/src/cmake/openvino.cmake @@ -29,7 +29,6 @@ add_library(openvino::runtime ALIAS ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME runtime) ie_add_vs_version_file(NAME ${TARGET_NAME} FILEDESCRIPTION "OpenVINO runtime library") -ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME} EXTRA ${TBB_IMPORTED_TARGETS} ${TBBBIND_2_5_IMPORTED_TARGETS}) target_include_directories(${TARGET_NAME} PUBLIC $ @@ -65,6 +64,9 @@ endif() set_ie_threading_interface_for(${TARGET_NAME}) ie_mark_target_as_cc(${TARGET_NAME}) +# must be called after all target_link_libraries +ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + # LTO set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/common/preprocessing/src/CMakeLists.txt b/src/common/preprocessing/src/CMakeLists.txt index 11fa0eadb7ab25..3e8a70b9e61151 100644 --- a/src/common/preprocessing/src/CMakeLists.txt +++ b/src/common/preprocessing/src/CMakeLists.txt @@ -167,6 +167,7 @@ if(ENABLE_GAPI_PREPROCESSING) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) endif() + # must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) ie_add_vs_version_file(NAME ${TARGET_NAME} diff --git a/src/common/snippets/src/pass/convert_constants.cpp b/src/common/snippets/src/pass/convert_constants.cpp index c3d2318b49e8c2..951f51825c8f5f 100644 --- a/src/common/snippets/src/pass/convert_constants.cpp +++ b/src/common/snippets/src/pass/convert_constants.cpp @@ -32,5 +32,5 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() { ngraph::replace_node(constant, scalar); return true; }; - register_matcher(std::make_shared(constants), callback); + register_matcher(std::make_shared(constants, matcher_name), callback); } diff --git a/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp b/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp index 7a76f7207f7b8f..ef43e677f6f8cb 100644 --- a/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp +++ b/src/common/snippets/src/pass/convert_power_to_powerstatic.cpp @@ -16,7 +16,7 @@ ngraph::snippets::pass::ConvertPowerToPowerStatic::ConvertPowerToPowerStatic() { is_type(n->get_input_node_shared_ptr(1)); }); 
ngraph::graph_rewrite_callback callback = [](ngraph::pattern::Matcher &m) { - OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars") + OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertPowerToPowerStatic") auto power = ov::as_type_ptr(m.get_match_root()); auto scalar = ov::as_type_ptr(power->get_input_node_shared_ptr(1)); auto value = scalar->cast_vector()[0]; diff --git a/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp b/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp index 3c25795b296862..5a88b007341d1d 100644 --- a/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/pull_through_reduce.cpp @@ -183,7 +183,7 @@ ov::pass::PullReshapeThroughReduce::PullReshapeThroughReduce() { matcher_pass_callback callback = [=](pattern::Matcher& m) { auto& pattern_map = m.get_pattern_value_map(); - const auto input_node = pattern_map.at(input).get_node_shared_ptr(); + const auto input_node = pattern_map.at(input); const auto reduce_node = std::dynamic_pointer_cast(pattern_map.at(reduce).get_node_shared_ptr()); if (!reduce_node) { @@ -194,7 +194,7 @@ ov::pass::PullReshapeThroughReduce::PullReshapeThroughReduce() { return false; } const auto unsqueeze_axes = - try_get_unsqueeze_axes_from_reshape(reshape_node->get_shape(), input_node->get_shape()); + try_get_unsqueeze_axes_from_reshape(reshape_node->get_shape(), input_node.get_shape()); if (unsqueeze_axes.empty()) { return false; } diff --git a/src/common/transformations/src/transformations/common_optimizations/push_constant_to_subgraph.cpp b/src/common/transformations/src/transformations/common_optimizations/push_constant_to_subgraph.cpp index 2d55d79dd455b4..21e636992067d5 100644 --- a/src/common/transformations/src/transformations/common_optimizations/push_constant_to_subgraph.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/push_constant_to_subgraph.cpp @@ -46,12 +46,12 @@ static void replace_body_parameter(const std::shared_ptr& body, } static void update_multi_sub_graph_op_inputs(const std::shared_ptr& multi_sub_graph_op, - int remove_inputs_mask) { + const std::vector& remove_inputs_mask) { int num_subgraphs = static_cast(multi_sub_graph_op->get_internal_subgraphs_size()); auto inputs = multi_sub_graph_op->input_values(); for (size_t i = multi_sub_graph_op->get_input_size(); i > 0; i--) { const auto input_index = i - 1; - if ((remove_inputs_mask & (1 << input_index)) != 0) { + if (remove_inputs_mask[input_index]) { // remove MultiSubGraphOp's input if it was marked to be removed // (meaning it was constfolded and pushed to inner subgraph) inputs.erase(inputs.begin() + input_index); @@ -83,7 +83,7 @@ bool ov::pass::PushConstantToSubgraph::run_on_model(const std::shared_ptr // cache for already constant folded inputs std::unordered_map> cache; // bitmask describing which MultiSubGraphOp's input to remove - int remove_inputs_mask = 0; + std::vector remove_inputs_mask(multi_sub_graph_op->get_input_size(), false); int num_subgraphs = static_cast(multi_sub_graph_op->get_internal_subgraphs_size()); for (int body_idx = 0; body_idx < num_subgraphs; body_idx++) { @@ -95,7 +95,7 @@ bool ov::pass::PushConstantToSubgraph::run_on_model(const std::shared_ptr const auto input_index = desc->m_input_index; const auto constant = 
try_constantfold_input(multi_sub_graph_op, desc, cache); if (!constant) { - remove_inputs_mask &= ~(1 << input_index); + remove_inputs_mask[input_index] = false; desc_it++; continue; } @@ -103,12 +103,12 @@ bool ov::pass::PushConstantToSubgraph::run_on_model(const std::shared_ptr desc_it = descriptions.erase(desc_it); auto& body_param = body_params[body_parameter_index]; replace_body_parameter(body, body_param, body_parameter_index, constant, descriptions); - remove_inputs_mask |= 1 << input_index; + remove_inputs_mask[input_index] = true; result = true; } } - if (remove_inputs_mask > 0) { + if (result) { update_multi_sub_graph_op_inputs(multi_sub_graph_op, remove_inputs_mask); } diff --git a/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp b/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp index 839ba82cb49f23..25dc8300f674e9 100644 --- a/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/strides_optimization.cpp @@ -13,7 +13,9 @@ #include "itt.hpp" +using namespace std; using namespace ov; +using namespace ov::opset7; static bool can_propagate_conv_stride(const std::shared_ptr& conv) { const auto& kernel_shape = conv->input_value(1).get_shape(); @@ -39,40 +41,36 @@ static std::tuple check_next_ops(const std::vector& first, - ngraph::Input& second, - const ngraph::Strides& strides) { +static void insert_pooling(const Output& first, Input& second, const Strides& strides) { + pass::NodeRegistry rg; auto first_node = first.get_node_shared_ptr(); - auto rank = first.get_partial_shape().rank(); - bool do_reshape = rank.is_static() && static_cast(rank.get_length()) < strides.size() + 2; + const auto rank = first.get_partial_shape().rank(); + const bool do_reshape = rank.is_static() && static_cast(rank.get_length()) < strides.size() + 2; if (do_reshape) { - size_t diff = strides.size() + 2 - static_cast(rank.get_length()); - auto ones = opset7::Constant::create(ngraph::element::i64, ngraph::Shape{diff}, std::vector(diff, 1)); - auto current_shape = std::make_shared(first); - std::shared_ptr new_shape = - std::make_shared(ngraph::OutputVector{ones, current_shape}, 0); - std::shared_ptr constant_new_shape = get_constant_from_source(new_shape); - if (constant_new_shape) + const size_t diff = strides.size() + 2 - static_cast(rank.get_length()); + const auto ones = rg.make(element::i64, Shape{diff}, vector(diff, 1)); + const auto current_shape = rg.make(first); + shared_ptr new_shape = rg.make(OutputVector{ones, current_shape}, 0); + if (const auto constant_new_shape = get_constant_from_source(new_shape)) { + rg.add(constant_new_shape); new_shape = constant_new_shape; - first_node = std::make_shared(first_node, new_shape, false); + } + first_node = rg.make(first_node, new_shape, false); } - std::shared_ptr new_node = std::make_shared(first_node, - strides, - ngraph::Shape{}, - ngraph::Shape{}, - ngraph::Shape(strides.size(), 1)); + shared_ptr new_node = rg.make(first_node, strides, Shape{}, Shape{}, Shape(strides.size(), 1)); if (do_reshape) { // squeeze dimensions back - size_t diff = strides.size() + 2 - static_cast(rank.get_length()); - std::vector axes(diff); - std::iota(axes.begin(), axes.end(), 0); - new_node = std::make_shared( - new_node, - opset7::Constant::create(ngraph::element::u64, ngraph::Shape{diff}, axes)); + const size_t diff = strides.size() + 2 - static_cast(rank.get_length()); + 
vector axes(diff); + iota(axes.begin(), axes.end(), 0); + new_node = rg.make(new_node, rg.make(element::u64, Shape{diff}, axes)); } - std::shared_ptr constant_new_node = get_constant_from_source(new_node); - if (constant_new_node) + if (const auto constant_new_node = get_constant_from_source(new_node)) { + rg.add(constant_new_node); new_node = constant_new_node; + } + + copy_runtime_info(as_node_vector({second.get_source_output()}), rg.get()); second.replace_source_output(new_node); } diff --git a/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp b/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp index f9d9db0beae4bd..349f5b39a03232 100644 --- a/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp +++ b/src/common/transformations/tests/common_optimizations/pull_through_reduce_test.cpp @@ -168,6 +168,27 @@ INSTANTIATE_TEST_SUITE_P(PullUnsqueezeThroughReduceLogicalOr, PullUnsqueezeThroughReduceLogicalOr, ValuesIn(reduce_logical_or_params)); +TEST_F(TransformationTestsF, PullUnsqueezeThroughReduceMeanInputHasMoreThanOneOutput) { + const auto input = std::make_shared(element::f32, PartialShape{10, 10, 15}); + const auto split = std::make_shared(input, Constant::create(element::i64, Shape{}, {0}), 2); + const auto unsqueeze_axes = Constant::create(element::i64, Shape{1}, {0}); + { + const auto unsqueeze = std::make_shared(split->output(0), unsqueeze_axes); + const auto reduce_axes = Constant::create(element::i64, Shape{}, {1}); + const auto reduce_mean = std::make_shared(unsqueeze, reduce_axes); + + model = std::make_shared(OutputVector{reduce_mean, split->output(1)}, ParameterVector{input}); + manager.register_pass(); + } + { + const auto reduce_axes = Constant::create(element::i64, Shape{}, {0}); + const auto reduce_mean = std::make_shared(split->output(0), reduce_axes); + const auto unsqueeze = std::make_shared(reduce_mean, unsqueeze_axes); + + model_ref = std::make_shared(OutputVector{unsqueeze, split->output(1)}, ParameterVector{input}); + } +} + TEST_F(TransformationTestsF, PullUnsqueezeThroughReduceSkipIfTheSameAxes) { model = generate_unsqueeze_model(element::f32, {5, 10, 15}, {0, 1}, {1, 2}); manager.register_pass(); @@ -296,6 +317,28 @@ INSTANTIATE_TEST_SUITE_P(PullReshapeThroughReduceLogicalOr, PullReshapeThroughReduceLogicalOr, ValuesIn(reduce_logical_or_reshape_params)); +TEST_F(TransformationTestsF, PullReshapeThroughReduceMeanInputHasMoreThanOneOutput) { + const auto input = std::make_shared(element::f32, PartialShape{10, 10, 15}); + const auto split = std::make_shared(input, Constant::create(element::i64, Shape{}, {0}), 2); + { + const auto target_shape = Constant::create(element::i64, Shape{4}, {1, 5, 10, 15}); + const auto reshape = std::make_shared(split->output(0), target_shape, false); + const auto reduce_axes = Constant::create(element::i64, Shape{}, {1}); + const auto reduce_mean = std::make_shared(reshape, reduce_axes); + + model = std::make_shared(OutputVector{reduce_mean, split->output(1)}, ParameterVector{input}); + manager.register_pass(); + } + { + const auto reduce_axes = Constant::create(element::i64, Shape{}, {0}); + const auto reduce_mean = std::make_shared(split->output(0), reduce_axes); + const auto target_shape = Constant::create(element::i64, Shape{3}, {1, 10, 15}); + const auto reshape = std::make_shared(reduce_mean, target_shape, false); + + model_ref = std::make_shared(OutputVector{reshape, split->output(1)}, ParameterVector{input}); + } +} + 
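// Illustrative sketch, assuming the ov::opset10 API already used in these tests:
// the two *InputHasMoreThanOneOutput tests above cover the case where the value
// feeding Unsqueeze/Reshape comes from a multi-output producer such as Split.
// The pass therefore has to query the shape of the matched Output<Node> itself,
// not of its producing node, whose single "shape" is ambiguous when the node has
// more than one output. Names below (data, branch) are only for the example.
#include <memory>
#include <openvino/opsets/opset10.hpp>

int main() {
    using namespace ov;
    const auto data = std::make_shared<opset10::Parameter>(element::f32, Shape{10, 10, 15});
    const auto axis = opset10::Constant::create(element::i64, Shape{}, {0});
    const auto split = std::make_shared<opset10::Split>(data, axis, 2);

    const Output<Node> branch = split->output(0);
    // Shape of the exact tensor that the reduce/reshape branch consumes.
    const Shape branch_shape = branch.get_shape();  // {5, 10, 15}
    // Asking the Split node itself for "the" shape would be ambiguous: it produces
    // two outputs, which is precisely the situation these tests reproduce.
    return branch_shape == Shape{5, 10, 15} ? 0 : 1;
}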
TEST_F(TransformationTestsF, PullReshapeThroughReduceMeanSkipIfDynamicInput) { model = generate_reshape_model(element::f32, {5, Dimension::dynamic(), 15}, {1, 5, 10, 15}, {2}); manager.register_pass(); diff --git a/src/common/transformations/tests/common_optimizations/push_constant_to_subgraphs.cpp b/src/common/transformations/tests/common_optimizations/push_constant_to_subgraphs.cpp index 085c1e80019b4c..ce6d80fa9eb34e 100644 --- a/src/common/transformations/tests/common_optimizations/push_constant_to_subgraphs.cpp +++ b/src/common/transformations/tests/common_optimizations/push_constant_to_subgraphs.cpp @@ -179,3 +179,78 @@ TEST_F(TransformationTestsF, PushConstantToSubgraphIf) { comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } + +TEST_F(TransformationTestsF, PushConstantToSubgraphLoopMoreThan32Inputs) { + int num_const_inputs = 33; + { + auto trip_count = opset10::Constant::create(element::i32, Shape{}, {2}); + auto term_cond = opset10::Constant::create(element::boolean, Shape{}, {true}); + std::shared_ptr loop_body; + { + auto X = std::make_shared(element::f32, Shape{1, 2}); + ParameterVector params; + params.reserve(num_const_inputs + 1); + params.push_back(X); + NodeVector concat_inputs; + concat_inputs.reserve(num_const_inputs + 1); + concat_inputs.push_back(X); + for (int i = 0; i < num_const_inputs; i++) { + params.push_back(std::make_shared(element::f32, Shape{1, 2})); + concat_inputs.push_back(params.back()); + } + auto concat = std::make_shared(concat_inputs, 1); + auto cond = opset10::Constant::create(element::boolean, Shape{}, {true}); + loop_body = std::make_shared(NodeVector{concat, cond}, params); + } + auto loop = std::make_shared(trip_count, term_cond); + loop->set_function(loop_body); + + auto X = std::make_shared(element::f32, Shape{2, 2}); + NodeVector constants; + constants.reserve(num_const_inputs); + for (int i = 0; i < num_const_inputs; i++) { + constants.push_back(opset10::Constant::create(element::f32, Shape{1, 2}, {-2})); + } + const auto& loop_params = loop_body->get_parameters(); + loop->set_special_body_ports({-1, 1}); + loop->set_sliced_input(loop_params[0], X, 0, 1, 1, -1, 0); + for (int i = 0; i < num_const_inputs; i++) { + loop->set_invariant_input(loop_params[i + 1], constants[i]); + } + auto out = loop->get_concatenated_slices(loop_body->get_results()[0], 0, 1, 1, -1, 0); + function = std::make_shared(OutputVector{out}, ParameterVector{X}); + + manager.register_pass(); + } + + { + auto trip_count = opset10::Constant::create(element::i32, Shape{}, {2}); + auto term_cond = opset10::Constant::create(element::boolean, Shape{}, {true}); + std::shared_ptr loop_body; + { + auto constant = opset10::Constant::create(element::f32, Shape{1, 2}, {-2}); + auto X = std::make_shared(element::f32, Shape{1, 2}); + NodeVector concat_inputs; + concat_inputs.reserve(num_const_inputs + 1); + concat_inputs.push_back(X); + for (int i = 0; i < num_const_inputs; i++) { + concat_inputs.push_back(opset10::Constant::create(element::f32, Shape{1, 2}, {-2})); + } + auto concat = std::make_shared(concat_inputs, 1); + auto cond = opset10::Constant::create(element::boolean, Shape{}, {true}); + loop_body = std::make_shared(NodeVector{concat, cond}, ParameterVector{X}); + } + auto loop = std::make_shared(trip_count, term_cond); + loop->set_function(loop_body); + + auto X = std::make_shared(element::f32, Shape{2, 2}); + const auto& loop_params = loop_body->get_parameters(); + loop->set_special_body_ports({-1, 
1}); + loop->set_sliced_input(loop_params[0], X, 0, 1, 1, -1, 0); + auto out = loop->get_concatenated_slices(loop_body->get_results()[0], 0, 1, 1, -1, 0); + function_ref = std::make_shared(OutputVector{out}, ParameterVector{X}); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ACCURACY); +} diff --git a/src/common/transformations/tests/common_optimizations/strides_optimization.cpp b/src/common/transformations/tests/common_optimizations/strides_optimization.cpp index c6d546101db2a6..0e598fe3112d22 100644 --- a/src/common/transformations/tests/common_optimizations/strides_optimization.cpp +++ b/src/common/transformations/tests/common_optimizations/strides_optimization.cpp @@ -264,9 +264,6 @@ TEST_F(TransformationTestsF, StridesOptimization5) { function_ref = std::make_shared(ngraph::NodeVector{conv_2}, ngraph::ParameterVector{data}); } - - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } // Pl->Conv(1x1,1x1)->Conv(1x1,2x2)->Conv(3x3,1x1)->Conv(1x1,2x2) @@ -424,8 +421,6 @@ TEST_F(TransformationTestsF, StridesOptimization7) { function_ref = std::make_shared(ngraph::NodeVector{conv_3, conv_4}, ngraph::ParameterVector{data}); } - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } // Pl--->Conv(1x1,1x1)->ReLU--->Eltwise-->Conv(1x1,2x2)-->Eltwise-->Conv(1x1, 2x2) @@ -517,8 +512,6 @@ TEST_F(TransformationTestsF, StridesOptimization8) { function_ref = std::make_shared(ngraph::NodeVector{conv_3}, ngraph::ParameterVector{data, data_2}); } - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } // Pl------->Conv(1x1,1x1)------>Eltwise------>Conv(1x1,2x2)---->Eltwise-->Conv(1x1, 2x2) @@ -636,6 +629,4 @@ TEST_F(TransformationTestsF, StridesOptimization9) { function_ref = std::make_shared(ngraph::NodeVector{conv_3}, ngraph::ParameterVector{data, data_2, data_3}); } - // TODO: update transformation and remove this check XXX-68696 - disable_rt_info_check(); } diff --git a/src/common/transformations/tests/offline_transformations/pruning_test.cpp b/src/common/transformations/tests/offline_transformations/pruning_test.cpp index 2d6e1cc2fbce4e..ec47d1c8eda1be 100644 --- a/src/common/transformations/tests/offline_transformations/pruning_test.cpp +++ b/src/common/transformations/tests/offline_transformations/pruning_test.cpp @@ -287,7 +287,6 @@ TEST_F(TransformationTestsF, PropagateMasksBasic) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -371,7 +370,6 @@ TEST_F(TransformationTestsF, PropagateMasksDynamicConvolution) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -599,7 +597,6 @@ TEST_F(TransformationTestsF, PropagateMaskPassThrough) { compare_masks(*getMask(max_pool->output(0)), Mask({{}, {1, 2, 3}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -768,7 +765,6 @@ TEST_F(TransformationTestsF, PropagateMasksHardDependencies) { // compare_masks(*getMask(conv2), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ 
-915,7 +911,6 @@ TEST_F(TransformationTestsF, PropagateMasksQuantizedGroupConvolution) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1084,7 +1079,6 @@ TEST_F(TransformationTestsF, PropagateMasksQuantizedGroupConvolutionWithShapeOf) compare_masks(*getMask(weights_2->output(0)), Mask({{}, {0, 1, 2, 3}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1222,7 +1216,6 @@ TEST_F(TransformationTestsF, PropagateMasksFakeQuantizePerTensor) { compare_masks(*getMask(conv2->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1427,7 +1420,6 @@ TEST_F(TransformationTestsF, PropagateMasksFakeQuantizePerChannel) { compare_masks(*getMask(fq->input(4).get_source_output()), Mask({{}, {0, 1, 2, 3, 4}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1559,7 +1551,6 @@ TEST_F(TransformationTestsF, TestConcatMaskPropagation) { Mask({{}, {0, 1, 2, 3, 15, 16, 17, 18, 28, 29, 30, 31}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1707,7 +1698,6 @@ TEST_F(TransformationTestsF, TestConcatMaskPropagationUp) { Mask({{}, {0, 1, 2, 3, 15, 16, 17, 18, 28, 29, 30, 31}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -1878,7 +1868,6 @@ TEST_F(TransformationTestsF, PruneConvIsClosingAndInGroup) { compare_masks(*getMask(end_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2070,7 +2059,6 @@ TEST_F(TransformationTestsF, PruneReducelayerUp) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2174,7 +2162,6 @@ TEST_F(TransformationTestsF, PruneReduceLayerDown) { compare_masks(*getMask(end_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2354,7 +2341,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUp) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2467,7 +2453,6 @@ TEST_P(TransformationTestsBoolParamF, MaskPropagationReshapeUpWithShapeOf) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2579,7 +2564,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUpShapeSubGraph) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2678,7 +2662,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeExtend) { compare_masks(*getMask(conv_1->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2784,7 
+2767,6 @@ TEST_F(DISABLED_TransformationTestsF, MaskPropagationReshapeDownMul) { compare_masks(*getMask(last_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -2889,7 +2871,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeDownAdd) { compare_masks(*getMask(last_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3054,7 +3035,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUnsqueezeUp) { compare_masks(*getMask(mul_left->output(0)), Mask({{}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3119,7 +3099,6 @@ TEST_F(TransformationTestsF, MaskPropagationReshapeUnsqueezeDown) { compare_masks(*getMask(mul_left->output(0)), Mask({{}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3292,7 +3271,6 @@ TEST_F(TransformationTestsF, PruneSEBlock) { compare_masks(*getMask(end_conv->output(0)), Mask({{}, {}, {}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3395,7 +3373,6 @@ TEST_F(TransformationTestsF, PropagateMasksLinear) { compare_masks(*getMask(last_linear->output(0)), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3658,7 +3635,6 @@ TEST_F(TransformationTestsF, MaskPropagationLinearOuterDims) { compare_masks(*getMask(last_mul->output(0)), Mask({{}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3808,7 +3784,6 @@ TEST_F(TransformationTestsF, PruneMasksMatMulColsStopRowsUp) { compare_masks(*getMask(last_linear->output(0)), Mask{{}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -3898,7 +3873,6 @@ TEST_F(TransformationTestsF, PruneMasksMatMulRowsStopColsUp) { compare_masks(*getMask(last_linear->output(0)), Mask{{}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4003,7 +3977,6 @@ TEST_F(TransformationTestsF, PropagateFlattenUp) { compare_masks(*getMask(linear->output(0)), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4076,7 +4049,6 @@ TEST_F(TransformationTestsF, PropagateFlattenDown) { compare_masks(*getMask(linear->output(0)), {{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4126,7 +4098,6 @@ TEST_F(TransformationTestsF, PropagateMasksTranspose) { compare_masks(*getMask(last_mul->output(0)), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4200,7 +4171,6 @@ TEST_F(TransformationTestsF, PropagateMasksTransposeComplex) { compare_masks(*getMask(last_mul->output(0)), Mask{{}, {}, {}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4402,7 +4372,6 @@ TEST_F(DISABLED_TransformationTestsF, PropagateMasksBroadcastedEltwiseWithInputs compare_masks(*getMask(last_mul->output(0)), Mask({{}, 
{}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4583,7 +4552,6 @@ TEST_F(TransformationTestsF, PropagateMasksBroadcastedEltwise) { compare_masks(*getMask(last_mul->output(0)), Mask({{}, {}})); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4773,7 +4741,6 @@ TEST_F(TransformationTestsF, MaskPropagationComplexReshape) { std::string(VISUALIZE_TREE_ROOT) + "MaskPropagationComplexReshapeWithMasks.svg", modifier); } - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -4966,7 +4933,6 @@ TEST_P(TransformationTestsBoolParamF, MaskPropagationReshapedPassThroughP) { manager.register_pass( std::string(VISUALIZE_TREE_ROOT) + "MaskPropagationReverseFlattenWithMasks" + postfix + ".svg", modifier); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -5032,7 +4998,6 @@ TEST_P(TransformationTestsBoolParamF, MaskPropagationBroadcastedSameRankEltwiseS compare_masks(*getMask(mul_last->output(0)), Mask{{}, {}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } @@ -5194,7 +5159,6 @@ TEST_F(TransformationTestsF, MaskPropagationMatMulWithSeveralOutputs) { compare_masks(*getMask(right_matmul), Mask{{}, {}}); manager.register_pass(); - disable_rt_info_check(); comparator.enable(FunctionsComparator::CmpValues::ACCURACY); } diff --git a/src/core/include/openvino/core/node_output.hpp b/src/core/include/openvino/core/node_output.hpp index 3edca19a653143..c9746e1649c5c8 100644 --- a/src/core/include/openvino/core/node_output.hpp +++ b/src/core/include/openvino/core/node_output.hpp @@ -103,6 +103,7 @@ class OPENVINO_API Output { bool operator>(const Output& other) const; bool operator<=(const Output& other) const; bool operator>=(const Output& other) const; + operator Output() const; private: std::shared_ptr m_node; diff --git a/src/core/include/openvino/op/depth_to_space.hpp b/src/core/include/openvino/op/depth_to_space.hpp index 4c60e5969b1b86..802eddbd665d4c 100644 --- a/src/core/include/openvino/op/depth_to_space.hpp +++ b/src/core/include/openvino/op/depth_to_space.hpp @@ -42,9 +42,14 @@ class OPENVINO_API DepthToSpace : public Op { DepthToSpace(const Output& data, const std::string& mode, std::size_t block_size = 1); bool visit_attributes(AttributeVisitor& visitor) override; + void set_block_size(size_t block_size); + const std::size_t& get_block_size() const { return m_blocksize; } + + void set_mode(DepthToSpaceMode mode); + DepthToSpaceMode get_mode() const { return m_mode; } diff --git a/src/core/include/openvino/op/scatter_elements_update.hpp b/src/core/include/openvino/op/scatter_elements_update.hpp index d1980f338d1c51..903b1fb9bab0cc 100644 --- a/src/core/include/openvino/op/scatter_elements_update.hpp +++ b/src/core/include/openvino/op/scatter_elements_update.hpp @@ -36,6 +36,9 @@ class OPENVINO_API ScatterElementsUpdate : public Op { bool evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const override; OPENVINO_SUPPRESS_DEPRECATED_END bool has_evaluate() const override; + bool evaluate_lower(TensorVector& output_values) const override; + bool evaluate_upper(TensorVector& output_values) const override; + bool evaluate_label(TensorLabelVector& output_labels) const override; private: bool evaluate_scatter_element_update(const HostTensorVector& outputs, const HostTensorVector& inputs) 
const; diff --git a/src/core/include/openvino/op/shuffle_channels.hpp b/src/core/include/openvino/op/shuffle_channels.hpp index 0bec03b0f36b16..0c30b85d743f62 100644 --- a/src/core/include/openvino/op/shuffle_channels.hpp +++ b/src/core/include/openvino/op/shuffle_channels.hpp @@ -35,9 +35,14 @@ class OPENVINO_API ShuffleChannels : public Op { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void set_axis(int64_t axis); + const int64_t& get_axis() const { return m_axis; } + + void set_group(int64_t group); + const int64_t& get_group() const { return m_group; } diff --git a/src/core/include/openvino/op/space_to_depth.hpp b/src/core/include/openvino/op/space_to_depth.hpp index 570b95d9b69d41..3b5515503502b6 100644 --- a/src/core/include/openvino/op/space_to_depth.hpp +++ b/src/core/include/openvino/op/space_to_depth.hpp @@ -40,9 +40,15 @@ class OPENVINO_API SpaceToDepth : public Op { SpaceToDepth(const Output& data, const std::string& mode, std::size_t block_size = 1); bool visit_attributes(AttributeVisitor& visitor) override; + + void set_block_size(size_t block_size); + const std::size_t& get_block_size() const { return m_blocksize; } + + void set_mode(SpaceToDepthMode mode); + SpaceToDepthMode get_mode() const { return m_mode; } diff --git a/src/core/include/openvino/runtime/tensor.hpp b/src/core/include/openvino/runtime/tensor.hpp index dfbf71e22db7fb..a90acfd1b66e07 100644 --- a/src/core/include/openvino/runtime/tensor.hpp +++ b/src/core/include/openvino/runtime/tensor.hpp @@ -116,6 +116,23 @@ class OPENVINO_API Tensor { */ Tensor(const element::Type type, const Shape& shape, void* host_ptr, const Strides& strides = {}); + /** + * @brief Constructs Tensor using port from node. Allocate internal host storage using default allocator + * @param port port from node + * @param allocator allocates memory for internal tensor storage + */ + Tensor(const ov::Output& port, const Allocator& allocator = {}); + + /** + * @brief Constructs Tensor using port from node. Wraps allocated host memory. + * @note Does not perform memory allocation internally + * @param port port from node + * @param host_ptr Pointer to pre-allocated host memory + * @param strides Optional strides parameters in bytes. Strides are supposed to be computed automatically based + * on shape and element size + */ + Tensor(const ov::Output& port, void* host_ptr, const Strides& strides = {}); + /** * @brief Constructs region of interest (ROI) tensor form another tensor. 
* @note Does not perform memory allocation internally @@ -143,10 +160,17 @@ class OPENVINO_API Tensor { */ Shape get_shape() const; + /** + * @brief Copy tensor, destination tensor should have the same element type and shape + * + * @param dst destination tensor + */ + void copy_to(ov::Tensor& dst) const; + /** * @brief Reports whether the tensor is continuous or not * - * @return true if blob is continuous + * @return true if tensor is continuous */ bool is_continuous() const; diff --git a/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp b/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp index f575041ba4a96f..81733924ae7b3d 100644 --- a/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/convert_color_nv12.hpp @@ -42,12 +42,12 @@ void color_convert_nv12(const T* arg_y, size_t stride_y, size_t stride_uv, ov::op::util::ConvertColorNV12Base::ColorConversion color_format) { - for (int batch = 0; batch < batch_size; batch++) { + for (size_t batch = 0; batch < batch_size; batch++) { T* out = out_ptr + batch * image_w * image_h * 3; auto y_ptr = arg_y + batch * stride_y; auto uv_ptr = arg_uv + batch * stride_uv; - for (int h = 0; h < image_h; h++) { - for (int w = 0; w < image_w; w++) { + for (size_t h = 0; h < image_h; h++) { + for (size_t w = 0; w < image_w; w++) { auto y_index = h * image_w + w; auto y_val = static_cast(y_ptr[y_index]); auto uv_index = (h / 2) * image_w + (w / 2) * 2; @@ -80,13 +80,13 @@ void color_convert_i420(const T* arg_y, size_t stride_y, size_t stride_uv, ov::op::util::ConvertColorI420Base::ColorConversion color_format) { - for (int batch = 0; batch < batch_size; batch++) { + for (size_t batch = 0; batch < batch_size; batch++) { T* out = out_ptr + batch * image_w * image_h * 3; auto y_ptr = arg_y + batch * stride_y; auto u_ptr = arg_u + batch * stride_uv; auto v_ptr = arg_v + batch * stride_uv; - for (int h = 0; h < image_h; h++) { - for (int w = 0; w < image_w; w++) { + for (size_t h = 0; h < image_h; h++) { + for (size_t w = 0; w < image_w; w++) { auto y_index = h * image_w + w; auto y_val = static_cast(y_ptr[y_index]); auto uv_index = (h / 2) * (image_w / 2) + (w / 2); diff --git a/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp b/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp index c4484ceab120dd..fabe70d95340b3 100644 --- a/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/convolution_backprop_data.hpp @@ -46,15 +46,15 @@ void extend_with_zeros(const Strides& strides, const auto offset_batch = batch * input_size * input_shape[1]; for (size_t channel = 0; channel < input_shape[1]; ++channel) { const auto offset_channel = offset_batch + channel * input_size; - for (int i_z = 0; i_z < input_3d[0]; ++i_z) { + for (size_t i_z = 0; i_z < input_3d[0]; ++i_z) { const auto offset_i_z = i_z * input_3d[2] * input_3d[1]; - for (int i_y = 0; i_y < input_3d[1]; ++i_y) { + for (size_t i_y = 0; i_y < input_3d[1]; ++i_y) { const auto offset_i_y = i_y * input_3d[2]; - for (int i_x = 0; i_x < input_3d[2]; ++i_x) { + for (size_t i_x = 0; i_x < input_3d[2]; ++i_x) { input_zeros.push_back(in[offset_channel + i_x + offset_i_y + offset_i_z]); if (i_x < input_3d[2] - 1) { - for (int k = 0; k < strides_3d[2] - 1; k++) { + for (size_t k = 0; k < strides_3d[2] - 1; k++) { input_zeros.push_back(0); } } diff 
--git a/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp b/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp index b197c110dd5bf6..76b7a6945f0617 100644 --- a/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/detection_output.hpp @@ -38,7 +38,7 @@ class referenceDetectionOutput { size_t offset; size_t numResults; size_t outTotalSize; - size_t numClasses; + int numClasses; void GetLocPredictions(const dataType* locData, std::vector& locations) { locations.resize(numImages); @@ -445,7 +445,7 @@ class referenceDetectionOutput { offset = _attrs.normalized ? 0 : 1; numPriors = priorsShape[2] / priorSize; priorsBatchSize = priorsShape[0]; - numClasses = classPredShape[1] / numPriors; + numClasses = classPredShape[1] / static_cast(numPriors); numLocClasses = _attrs.share_location ? 1 : numClasses; numResults = outShape[2]; outTotalSize = shape_size(outShape); diff --git a/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp b/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp index cac56d76b3b4c4..3ea62e8d03304f 100644 --- a/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/roi_align.hpp @@ -109,8 +109,8 @@ void roi_align(const T* feature_maps, T sample_x = x1 + static_cast(x_bin_ind) * bin_width + sample_distance_x * (static_cast(x_sample_ind) + static_cast(0.5f)); - if (sample_x < -1.0 || sample_x > feature_map_width || sample_y < -1.0 || - sample_y > feature_map_height) { + if (sample_x < -1.0 || sample_x > static_cast(feature_map_width) || sample_y < -1.0 || + sample_y > static_cast(feature_map_height)) { // For this sample we save 4x point (0,0) with weight 0 pooling_points.insert(pooling_points.end(), 4, {0, 0}); pooling_weights.insert(pooling_weights.end(), 4, T{0}); diff --git a/src/core/shape_inference/include/batch_to_space_shape_inference.hpp b/src/core/shape_inference/include/batch_to_space_shape_inference.hpp index 5069fc23c04ce9..fb7259280796c4 100644 --- a/src/core/shape_inference/include/batch_to_space_shape_inference.hpp +++ b/src/core/shape_inference/include/batch_to_space_shape_inference.hpp @@ -15,28 +15,28 @@ namespace ov { namespace op { namespace v1 { -template -void shape_infer(const ov::op::v1::BatchToSpace* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map>& constant_data = {}) { - using ValType = typename std::iterator_traits::value_type::value_type; - NODE_VALIDATION_CHECK(op, input_shapes.size() == 4 && output_shapes.size() == 1); +template +std::vector shape_infer(const BatchToSpace* op, + const std::vector& input_shapes, + const std::map& constant_data = {}) { + using ValType = typename TShape::value_type::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 4); + const auto& data_shape = input_shapes[0]; const auto& block_shape = input_shapes[1]; const auto& crops_begin_shape = input_shapes[2]; const auto& crops_end_shape = input_shapes[3]; - bool got_const_data = false; auto inputs_same_ps = crops_begin_shape; - NODE_VALIDATION_CHECK(op, - T::merge_into(inputs_same_ps, crops_end_shape) && T::merge_into(inputs_same_ps, block_shape), - "block_shape, crops_begin and crops_end inputs must have the same shape. 
Got: ", - block_shape, - ", ", - crops_begin_shape, - " and ", - crops_end_shape); + NODE_VALIDATION_CHECK( + op, + TShape::merge_into(inputs_same_ps, crops_end_shape) && TShape::merge_into(inputs_same_ps, block_shape), + "block_shape, crops_begin and crops_end inputs must have the same shape. Got: ", + block_shape, + ", ", + crops_begin_shape, + " and ", + crops_end_shape); NODE_VALIDATION_CHECK(op, inputs_same_ps.rank().compatible(1), @@ -45,10 +45,11 @@ void shape_infer(const ov::op::v1::BatchToSpace* op, const ov::Rank data_rank = data_shape.rank(); if (data_rank.is_static()) { + constexpr size_t spatial_dim_offset = 1; NODE_VALIDATION_CHECK(op, - (data_rank.get_length() >= 2), + (data_shape.size() > spatial_dim_offset), "data input must have rank greater or equal than 2. Got: ", - data_rank.get_length()); + data_shape.size()); if (inputs_same_ps.is_static()) { NODE_VALIDATION_CHECK(op, data_rank.get_length() == inputs_same_ps[0].get_length(), @@ -59,60 +60,51 @@ void shape_infer(const ov::op::v1::BatchToSpace* op, data_rank); } - auto& output_shape = output_shapes[0]; - output_shape.resize(data_shape.size()); - + auto out_shape = data_shape; std::vector block_val, crops_begin_val, crops_end_val; - if (get_data_as_int64(1, op, block_val, constant_data) && - get_data_as_int64(2, op, crops_begin_val, constant_data) && - get_data_as_int64(3, op, crops_end_val, constant_data)) { - got_const_data = true; - bool block_vals_valid = std::all_of(begin(block_val), end(block_val), [](int64_t elem) { - return elem >= 1; - }); + if (get_data_as_int64(1, op, block_val, constant_data) && + get_data_as_int64(2, op, crops_begin_val, constant_data) && + get_data_as_int64(3, op, crops_end_val, constant_data)) { NODE_VALIDATION_CHECK(op, - block_vals_valid, + std::none_of(begin(block_val), end(block_val), cmp::Less(1)), "Elements of block_shape input must be greater or equal to one."); - bool crops_begin_vals_valid = std::all_of(begin(crops_begin_val), end(crops_begin_val), [](int64_t elem) { - return elem >= 0; - }); - bool crops_end_vals_valid = std::all_of(begin(crops_end_val), end(crops_end_val), [](int64_t elem) { - return elem >= 0; - }); + constexpr auto is_invalid_crop = cmp::Less(0); NODE_VALIDATION_CHECK(op, - crops_begin_vals_valid && crops_end_vals_valid, + std::none_of(begin(crops_begin_val), end(crops_begin_val), is_invalid_crop) && + std::none_of(begin(crops_end_val), end(crops_end_val), is_invalid_crop), "Elements of crops_begin and crops_end inputs must be greater or equal to zero."); - if (data_shape.is_static()) { - for (size_t idx = 0; idx < data_shape.size(); idx++) { - const bool is_valid_crops_and_shape = - crops_begin_val[idx] + crops_end_val[idx] <= block_val[idx] * data_shape[idx].get_length(); - NODE_VALIDATION_CHECK(op, - is_valid_crops_and_shape, - "crops_begin[i] + crops_end[i] must be less or equal to " - "block_shape[i] * input_shape[i]"); - } - } - int64_t block_prod = - std::accumulate(begin(block_val), end(block_val), int64_t(1), std::multiplies()); - const auto divisor = static_cast(block_prod); + const auto divisor = static_cast( + std::accumulate(begin(block_val), end(block_val), int64_t(1), std::multiplies())); - output_shape[0] = data_shape[0] / divisor; - check_divided_result(op, output_shape[0], data_shape[0], divisor); + out_shape[0] /= divisor; + check_divided_result(op, out_shape[0], data_shape[0], divisor); - for (size_t idx = 1; idx < data_shape.size(); idx++) { - output_shape[idx] = data_shape[idx] * static_cast(block_val[idx]) - - 
static_cast(crops_begin_val[idx]) - - static_cast(crops_end_val[idx]); + for (auto idx = spatial_dim_offset; idx < out_shape.size(); ++idx) { + out_shape[idx] *= static_cast(block_val[idx]); + auto crop = static_cast(crops_begin_val[idx] + crops_end_val[idx]); + NODE_VALIDATION_CHECK( + op, + out_shape[idx].is_dynamic() || crop <= out_shape[idx].get_length(), + "crops_begin[i] + crops_end[i] must be less or equal to block_shape[i] * input_shape[i]"); + + out_shape[idx] = out_shape[idx] - crop; } } + return {out_shape}; + } else { + return {PartialShape::dynamic()}; } - if (!got_const_data) - // For PartialShape, Set the output to be dynamic; - // For StaticShape, throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); +} + +template +void shape_infer(const ov::op::v1::BatchToSpace* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + output_shapes = shape_infer(op, input_shapes, constant_data); } } // namespace v1 diff --git a/src/core/shape_inference/include/depth_to_space_shape_inference.hpp b/src/core/shape_inference/include/depth_to_space_shape_inference.hpp index 4dd03ee76e861a..bda94acd5a58ab 100644 --- a/src/core/shape_inference/include/depth_to_space_shape_inference.hpp +++ b/src/core/shape_inference/include/depth_to_space_shape_inference.hpp @@ -14,46 +14,43 @@ namespace ov { namespace op { namespace v0 { -template -void shape_infer(const ov::op::v0::DepthToSpace* op, - const std::vector& input_shapes, - std::vector& output_shapes) { - using ValType = typename std::iterator_traits::value_type::value_type; - - NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() == 1); +template +std::vector shape_infer(const DepthToSpace* op, const std::vector& input_shapes) { + using TDim = typename TShape::value_type; + using TVal = typename TDim::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1); const auto& data_shape = input_shapes[0]; - const ov::Rank data_rank = data_shape.rank(); - const auto& block_size = op->get_block_size(); - if (data_rank.is_static()) { + if (data_shape.rank().is_static()) { + static constexpr size_t spatial_dim_offset = 2; NODE_VALIDATION_CHECK(op, - data_shape.size() >= 3, + data_shape.size() > spatial_dim_offset, "The input tensor with rank lower than 3 is not supported (input rank: ", data_shape.size(), ")"); - const size_t divider = static_cast(std::pow(block_size, data_shape.size() - 2)); - NODE_VALIDATION_CHECK(op, (divider), "DepthToSpace: The divider must not be 0"); - - auto& output_shape = output_shapes[0]; - output_shape.resize(data_shape.size()); - - output_shape[0] = data_shape[0]; - const auto divisor = static_cast(divider); - output_shape[1] = data_shape[1] / divisor; - check_divided_result(op, output_shape[1], data_shape[1], divisor); - for (size_t i = 2; i < output_shape.size(); i++) { - output_shape[i] = data_shape[i] * static_cast(block_size); - } - + const auto& block_size = op->get_block_size(); + const auto divisor = static_cast(std::pow(block_size, data_shape.size() - spatial_dim_offset)); + NODE_VALIDATION_CHECK(op, divisor != 0, "DepthToSpace: The divisor must not be 0"); + + auto out_shape = data_shape; + out_shape[1] /= divisor; + check_divided_result(op, out_shape[1], data_shape[1], divisor); + std::for_each(out_shape.begin() + spatial_dim_offset, out_shape.end(), [&block_size](TDim& d) { + d *= static_cast(block_size); + }); + return {out_shape}; } else { - 
// For PartialShape, Set the output to be dynamic; - // For StaticShape, throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); + return {PartialShape::dynamic()}; } } +template +void shape_infer(const DepthToSpace* op, const std::vector& input_shapes, std::vector& output_shapes) { + output_shapes = shape_infer(op, input_shapes); +} + } // namespace v0 } // namespace op } // namespace ov diff --git a/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp b/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp index ecd9bc3f6cf13b..8db69b87edcc7a 100644 --- a/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp +++ b/src/core/shape_inference/include/scatter_elements_update_shape_inference.hpp @@ -12,33 +12,33 @@ namespace ov { namespace op { namespace v3 { -template -void shape_infer(const ScatterElementsUpdate* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map>& constant_data = {}) { - NODE_VALIDATION_CHECK(op, input_shapes.size() == 4 && output_shapes.size() == 1); +template +std::vector shape_infer(const ScatterElementsUpdate* op, + const std::vector& input_shapes, + const std::map& constant_data = {}) { + NODE_VALIDATION_CHECK(op, input_shapes.size() == 4); const auto& data_shape = input_shapes[0]; const auto& indices_shape = input_shapes[1]; const auto& updates_shape = input_shapes[2]; const auto& axis_shape = input_shapes[3]; - auto& output_shape = output_shapes[0]; - output_shape = data_shape; NODE_VALIDATION_CHECK(op, - axis_shape.compatible(T{}) || axis_shape.compatible(T{1}), + is_rank_compatible_any_of(axis_shape.rank(), {0, 1}), "Axis input shape are required to be scalar or 1D tensor. ", "Got: ", axis_shape); + const auto& data_rank = data_shape.rank(); + const auto& indices_rank = indices_shape.rank(); + NODE_VALIDATION_CHECK(op, - indices_shape.rank().compatible(data_shape.rank()), + indices_rank.compatible(data_rank), "Indices rank and data rank are required to be equal. ", "Got: ", - indices_shape.rank(), + indices_rank, " and: ", - data_shape.rank()); + data_rank); NODE_VALIDATION_CHECK(op, indices_shape.compatible(updates_shape), @@ -48,26 +48,20 @@ void shape_infer(const ScatterElementsUpdate* op, " and: ", updates_shape); - if (data_shape.rank().is_dynamic()) - return; - - std::vector axis_input; - if (get_data_as_int64(3, op, axis_input, constant_data)) { - auto axis = axis_input[0]; - - int64_t data_rank_length = data_shape.rank().get_length(); - NODE_VALIDATION_CHECK(op, - (-data_rank_length <= axis) && (axis <= data_rank_length - 1), - "Axis value has to be in range [-r, r-1] where r is rank of data shape. ", - " Data rank: ", - data_rank_length, - ", range:[", - -data_rank_length, - ", ", - data_rank_length - 1, - "]. 
Got axis value: ", - axis); + if (data_shape.rank().is_static()) { + if (const auto axis_input = get_input_const_data_as(op, 3, constant_data)) { + ov::normalize_axis(op, (*axis_input)[0], data_rank); + } } + return {data_shape}; +} + +template +void shape_infer(const ScatterElementsUpdate* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + output_shapes = shape_infer(op, input_shapes, constant_data); } } // namespace v3 diff --git a/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp b/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp index 31d35987a19d22..fd54069ea5c294 100644 --- a/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp +++ b/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp @@ -6,13 +6,15 @@ #include +#include "openvino/core/validation_util.hpp" + namespace ov { namespace op { namespace v0 { -template -void shape_infer(const ShuffleChannels* op, const std::vector& input_shapes, std::vector& output_shapes) { - NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() == 1); +template +std::vector shape_infer(const ShuffleChannels* op, const std::vector& input_shapes) { + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1); const auto& group = op->get_group(); NODE_VALIDATION_CHECK(op, group >= 1, "The 'group' parameter must be greater or equal to 1."); @@ -20,25 +22,31 @@ void shape_infer(const ShuffleChannels* op, const std::vector& input_shapes, const auto& input_shape = input_shapes[0]; const auto input_shape_rank = input_shape.rank(); - if (input_shape_rank.is_static()) { - const int64_t input_rank_value = static_cast(input_shape.size()); - NODE_VALIDATION_CHECK(op, input_rank_value >= 1, "The input tensor's shape is expected to be at least 1D."); + auto output_shapes = std::vector(1, input_shape); - const auto& axis = op->get_axis(); + if (input_shape_rank.is_static()) { + NODE_VALIDATION_CHECK(op, input_shape.size() >= 1, "The input tensor's shape is expected to be at least 1D."); + const auto axis_zb = static_cast(normalize_axis(op, op->get_axis(), input_shape_rank)); + const auto& channel_dim = input_shape[axis_zb]; NODE_VALIDATION_CHECK(op, - axis < input_rank_value && axis >= (0 - input_rank_value), - "The 'axis' parameter for ShuffleChannels has to point to one of the " - "input tensor's shape dimensions."); - size_t axis_zb = static_cast(axis >= 0 ? 
axis : (axis + input_rank_value)); - - if (input_shape[axis_zb].is_static()) { - const auto channel_dim_size = input_shape[axis_zb].get_length(); - NODE_VALIDATION_CHECK(op, - channel_dim_size % group == 0, - "The channel dimension size has to be a multiple of the groups parameter value."); + channel_dim.is_dynamic() || (channel_dim.get_length() % group) == 0, + "The channel dimension size has to be a multiple of the groups parameter value."); + + if (std::is_same::value) { + // overwrite channel dimension to loose label + using TDim = typename TShape::value_type; + output_shapes.front()[axis_zb] = TDim{channel_dim.get_min_length(), channel_dim.get_max_length()}; } } - output_shapes[0] = input_shape; + + return output_shapes; +} + +template +void shape_infer(const ShuffleChannels* op, + const std::vector& input_shapes, + std::vector& output_shapes) { + output_shapes = shape_infer(op, input_shapes); } } // namespace v0 diff --git a/src/core/shape_inference/include/slice_shape_inference_utils.hpp b/src/core/shape_inference/include/slice_shape_inference_utils.hpp index d78ad85a0ec40b..9b33900692b2e1 100644 --- a/src/core/shape_inference/include/slice_shape_inference_utils.hpp +++ b/src/core/shape_inference/include/slice_shape_inference_utils.hpp @@ -134,7 +134,11 @@ inline int64_t get_sliced_value(const int64_t& dim, const int64_t& start, const constexpr int64_t inf_bound = -1; const auto& norm_dim = dim == inf_bound ? std::numeric_limits::max() : dim; +#ifdef OPENVINO_ARCH_64_BIT const auto is_norm_dim_max = ov::internal::is_max(norm_dim); +#else + const auto is_norm_dim_max = ov::internal::is_max(size_t(norm_dim)); +#endif const int64_t lower_max = is_reverse_step ? norm_dim - 1 : norm_dim; const int64_t upper_min = is_reverse_step ? inf_bound : min_bound; diff --git a/src/core/shape_inference/include/space_to_batch_shape_inference.hpp b/src/core/shape_inference/include/space_to_batch_shape_inference.hpp index 33c9caffa22e03..792c7ddc7761d9 100644 --- a/src/core/shape_inference/include/space_to_batch_shape_inference.hpp +++ b/src/core/shape_inference/include/space_to_batch_shape_inference.hpp @@ -15,75 +15,72 @@ namespace ov { namespace op { namespace v1 { -template -void shape_infer(const ov::op::v1::SpaceToBatch* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map>& constant_data = {}) { - using ValType = typename std::iterator_traits::value_type::value_type; - NODE_VALIDATION_CHECK(op, input_shapes.size() == 4 && output_shapes.size() == 1); +template +std::vector shape_infer(const SpaceToBatch* op, + const std::vector& input_shapes, + const std::map& constant_data = {}) { + using TVal = typename TShape::value_type::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 4); const auto& data_shape = input_shapes[0]; const auto& block_shape = input_shapes[1]; const auto& pads_begin_shape = input_shapes[2]; const auto& pads_end_shape = input_shapes[3]; - const ov::Rank data_rank = data_shape.rank(); - bool got_const_data = false; auto inputs_same_ps = pads_begin_shape; - NODE_VALIDATION_CHECK(op, - T::merge_into(inputs_same_ps, pads_end_shape) && T::merge_into(inputs_same_ps, block_shape), - "block_shape, pads_begin and pads_end inputs must have the same shape. Got: ", - block_shape, - ", ", - pads_begin_shape, - " and ", - pads_end_shape); + NODE_VALIDATION_CHECK( + op, + TShape::merge_into(inputs_same_ps, pads_end_shape) && TShape::merge_into(inputs_same_ps, block_shape), + "block_shape, pads_begin and pads_end inputs must have the same shape. 
Got: ", + block_shape, + ", ", + pads_begin_shape, + " and ", + pads_end_shape); NODE_VALIDATION_CHECK(op, inputs_same_ps.rank().compatible(1), "block_shape and pads inputs must have rank 1. Got: ", inputs_same_ps.rank()); - if (data_rank.is_static()) { + if (data_shape.rank().is_static()) { + constexpr size_t spatial_dim_offset = 1; NODE_VALIDATION_CHECK(op, - (data_shape.size() >= 2), + (data_shape.size() > spatial_dim_offset), "The data tensor with rank lower than 2 is not supported (data rank: ", data_shape.size(), ")"); - std::vector block_val, pads_begin_val, pads_end_val; - - auto& output_shape = output_shapes[0]; - output_shape.resize(data_shape.size()); - if (get_data_as_int64(1, op, block_val, constant_data) && - get_data_as_int64(2, op, pads_begin_val, constant_data) && - get_data_as_int64(3, op, pads_end_val, constant_data)) { - got_const_data = true; - int64_t block_prod = - std::accumulate(begin(block_val), end(block_val), int64_t(1), std::multiplies()); + auto out_shape = data_shape; + std::vector block, pads_begin, pads_end; + if (get_data_as_int64(1, op, block, constant_data) && + get_data_as_int64(2, op, pads_begin, constant_data) && + get_data_as_int64(3, op, pads_end, constant_data)) { + TVal block_prod = std::accumulate(begin(block), end(block), 1, std::multiplies()); - output_shape[0] = data_shape[0] * static_cast(block_prod); - - for (size_t idx = 1; idx < output_shape.size(); ++idx) { - NODE_VALIDATION_CHECK(op, block_val[idx] > 0, "block_shape values must be greater than 0"); - if (data_shape[idx].is_dynamic() && data_shape[idx] == ov::Dimension::dynamic()) { - output_shape[idx] = ov::Dimension::dynamic(); - } else { - const auto divided = - data_shape[idx] + static_cast((pads_begin_val[idx] + pads_end_val[idx])); - const auto divisor = static_cast(block_val[idx]); - output_shape[idx] = divided / divisor; - check_divided_result(op, output_shape[idx], divided, divisor); + out_shape[0] *= block_prod; + for (auto idx = spatial_dim_offset; idx < out_shape.size(); ++idx) { + NODE_VALIDATION_CHECK(op, block[idx] > 0, "block_shape values must be greater than 0"); + if (out_shape[idx].is_static() || out_shape[idx] != Dimension::dynamic()) { + const auto padded_dim = out_shape[idx] + static_cast(pads_begin[idx] + pads_end[idx]); + const auto divisor = static_cast(block[idx]); + out_shape[idx] = padded_dim / divisor; + check_divided_result(op, out_shape[idx], padded_dim, divisor); } } } + return {out_shape}; + } else { + return {PartialShape::dynamic()}; } +} - if (!got_const_data) - // For PartialShape, Set the output to be dynamic; - // For StaticShape, throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); +template +void shape_infer(const SpaceToBatch* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + output_shapes = shape_infer(op, input_shapes, constant_data); } } // namespace v1 diff --git a/src/core/shape_inference/include/space_to_depth_shape_inference.hpp b/src/core/shape_inference/include/space_to_depth_shape_inference.hpp index 0f5b8308c27301..7fff113b4d5422 100644 --- a/src/core/shape_inference/include/space_to_depth_shape_inference.hpp +++ b/src/core/shape_inference/include/space_to_depth_shape_inference.hpp @@ -14,44 +14,44 @@ namespace ov { namespace op { namespace v0 { -template -void shape_infer(const ov::op::v0::SpaceToDepth* op, - const std::vector& input_shapes, - std::vector& output_shapes) { - using 
ValType = typename std::iterator_traits::value_type::value_type; - - NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() == 1); +template +std::vector shape_infer(const ov::op::v0::SpaceToDepth* op, const std::vector& input_shapes) { + using TVal = typename TShape::value_type::value_type; + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1); const auto& data_shape = input_shapes[0]; - const ov::Rank data_rank = data_shape.rank(); - if (data_rank.is_static()) { + if (data_shape.rank().is_static()) { + static constexpr size_t spatial_dim_offset = 2; NODE_VALIDATION_CHECK(op, - !(data_shape.size() < 3), + data_shape.size() > spatial_dim_offset, "The input tensor with rank lower than 3 is not supported (input rank: ", data_shape.size(), ")"); const auto& block_size = op->get_block_size(); - NODE_VALIDATION_CHECK(op, block_size > 0, "The block size must begreater then 0 ", block_size); - const ValType multiplier = static_cast(std::pow(block_size, data_shape.size() - 2)); + NODE_VALIDATION_CHECK(op, block_size > 0, "The block size must be greater than 0 ", block_size); - auto& out_shape = output_shapes[0]; - out_shape.resize(data_shape.size()); + auto out_shape = data_shape; + out_shape[1] *= static_cast(std::pow(block_size, data_shape.size() - spatial_dim_offset)); + const auto divisor = static_cast(block_size); - out_shape[0] = data_shape[0]; - out_shape[1] = data_shape[1] * multiplier; - const auto divisor = static_cast(block_size); - for (size_t i = 2; i < out_shape.size(); i++) { - out_shape[i] = data_shape[i] / divisor; + for (auto i = spatial_dim_offset; i < out_shape.size(); ++i) { + out_shape[i] /= divisor; check_divided_result(op, out_shape[i], data_shape[i], divisor); } + return {out_shape}; } else { - // For PartialShape, Set the output to be dynamic; - // For StaticShape, will throw error caused by implicitly constructing StaticShape with PartialShape argument; - output_shapes[0] = ov::PartialShape::dynamic(data_rank); + return {PartialShape::dynamic()}; } } +template +void shape_infer(const ov::op::v0::SpaceToDepth* op, + const std::vector& input_shapes, + std::vector& output_shapes) { + output_shapes = shape_infer(op, input_shapes); +} + } // namespace v0 } // namespace op } // namespace ov diff --git a/src/core/shape_inference/include/utils.hpp b/src/core/shape_inference/include/utils.hpp index 3c6b5a47435051..d8b85271c40943 100644 --- a/src/core/shape_inference/include/utils.hpp +++ b/src/core/shape_inference/include/utils.hpp @@ -463,17 +463,29 @@ inline bool get_data_as_shape( } } -template +/** + * @brief Check for valid quotient of dimension division. + * + * If quotient is not valid (quotient * divisor != dividend) throw NodeValidationFailure exception. + * + * @tparam TDim Type of dimension. + * + * @param op Pointer to operator. + * @param quotient Dimension result after division. + * @param dividend Original dimension. + * @param divisor Dimension divide value. 
+ */ +template inline void check_divided_result(const ov::Node* op, - const T& res, - const T& divided, - const typename T::value_type& divisor) { + const TDim& quotient, + const TDim& dividend, + const typename TDim::value_type& divisor) { NODE_VALIDATION_CHECK(op, - res != T{}, + quotient != TDim{}, "Dimension value: [ ", - divided.get_min_length(), + dividend.get_min_length(), ", ", - divided.get_max_length(), + dividend.get_max_length(), "]", " must be a multiple of divisor: ", divisor); @@ -481,15 +493,15 @@ inline void check_divided_result(const ov::Node* op, template <> inline void check_divided_result(const ov::Node* op, - const ov::Dimension& res, - const ov::Dimension& divided, + const ov::Dimension& quotient, + const ov::Dimension& dividend, const typename ov::Dimension::value_type& divisor) { NODE_VALIDATION_CHECK(op, - !res.get_interval().empty(), + !quotient.get_interval().empty(), "Dimension value: [ ", - divided.get_min_length(), + dividend.get_min_length(), ", ", - divided.get_max_length(), + dividend.get_max_length(), "]", " must be a multiple of divisor: ", divisor); diff --git a/src/core/src/node_output.cpp b/src/core/src/node_output.cpp index d1671464a918b7..545546a3f0ac6f 100644 --- a/src/core/src/node_output.cpp +++ b/src/core/src/node_output.cpp @@ -147,6 +147,10 @@ bool Output::operator>=(const Output& other) const { return !(*this < other); } +Output::operator Output() const { + return Output(get_node(), get_index()); +} + Output::Output(const Node* node, size_t index) : m_index(index) { OPENVINO_ASSERT(node, "Cannot create ov::Output from nullptr!"); m_node = node->shared_from_this(); diff --git a/src/core/src/op/batch_to_space.cpp b/src/core/src/op/batch_to_space.cpp index 2d50c2c254860f..6541a90765b611 100644 --- a/src/core/src/op/batch_to_space.cpp +++ b/src/core/src/op/batch_to_space.cpp @@ -60,13 +60,8 @@ void op::v1::BatchToSpace::validate_and_infer_types() { "block_shape and crops inputs must have integer element type. 
Got: ", inputs_integer_et); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0), - get_input_partial_shape(1), - get_input_partial_shape(2), - get_input_partial_shape(3)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_et, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, data_et, output_shape); } std::shared_ptr ngraph::op::v1::BatchToSpace::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/op/depth_to_space.cpp b/src/core/src/op/depth_to_space.cpp index 516601a6b3e001..385824ffc56d12 100644 --- a/src/core/src/op/depth_to_space.cpp +++ b/src/core/src/op/depth_to_space.cpp @@ -14,6 +14,7 @@ #include "itt.hpp" #include "ngraph/runtime/reference/depth_to_space.hpp" #include "ngraph/shape.hpp" +#include "openvino/core/validation_util.hpp" using namespace ngraph; @@ -36,20 +37,15 @@ bool op::DepthToSpace::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr op::DepthToSpace::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v0_DepthToSpace_clone_with_new_inputs); - if (new_args.size() != 1) { - throw ngraph_error("Incorrect number of new arguments"); - } + check_new_args_count(this, new_args); return std::make_shared(new_args.at(0), m_mode, m_blocksize); } void op::DepthToSpace::validate_and_infer_types() { OV_OP_SCOPE(v0_DepthToSpace_validate_and_infer_types); - const auto& data_type = get_input_element_type(0); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, get_input_element_type(0), output_shape); } namespace { @@ -88,6 +84,14 @@ std::ostream& ov::operator<<(std::ostream& s, const ov::op::v0::DepthToSpace::De return s << as_string(type); } +void op::v0::DepthToSpace::set_block_size(size_t block_size) { + m_blocksize = block_size; +} + +void op::v0::DepthToSpace::set_mode(DepthToSpaceMode mode) { + m_mode = mode; +} + namespace ov { template <> NGRAPH_API EnumNames& diff --git a/src/core/src/op/scatter_elements_update.cpp b/src/core/src/op/scatter_elements_update.cpp index ff24d4a6048089..365745255332e2 100644 --- a/src/core/src/op/scatter_elements_update.cpp +++ b/src/core/src/op/scatter_elements_update.cpp @@ -6,6 +6,7 @@ #include +#include "bound_evaluate.hpp" #include "itt.hpp" #include "ngraph/op/constant.hpp" #include "ngraph/op/util/op_types.hpp" @@ -51,17 +52,9 @@ void op::v3::ScatterElementsUpdate::validate_and_infer_types() { " and: ", updates_et); - const auto& data = get_input_partial_shape(0); - const auto& indices = get_input_partial_shape(1); - const auto& updates = get_input_partial_shape(2); - const auto& axis = get_input_partial_shape(3); - - std::vector output_shapes = {ov::PartialShape()}; - std::vector input_shapes = {data, indices, updates, axis}; - - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_et, output_shapes[0]); - if (output_shapes[0].is_dynamic()) + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, data_et, output_shape); + if (output_shape.is_dynamic()) set_input_is_relevant_to_shape(0); } @@ -254,3 +247,19 @@ bool 
op::v3::ScatterElementsUpdate::has_evaluate() const { } return true; } + +bool op::v3::ScatterElementsUpdate::evaluate_lower(ov::TensorVector& output_values) const { + OV_OP_SCOPE(v3_ScatterNDUpdate_evaluate_lower); + return get_input_tensor(1).has_and_set_bound() && ov::default_lower_bound_evaluator(this, output_values); +} + +bool op::v3::ScatterElementsUpdate::evaluate_upper(ov::TensorVector& output_values) const { + OV_OP_SCOPE(v3_ScatterNDUpdate_evaluate_upper); + return get_input_tensor(1).has_and_set_bound() && ov::default_upper_bound_evaluator(this, output_values); +} + +bool op::v3::ScatterElementsUpdate::evaluate_label(TensorLabelVector& output_labels) const { + OV_OP_SCOPE(v3_ScatterNDUpdate_evaluate_label); + + return ov::default_label_evaluator(this, {0, 2}, output_labels); +} diff --git a/src/core/src/op/shuffle_channels.cpp b/src/core/src/op/shuffle_channels.cpp index 22d89c3364e458..51b057c9f7c3b3 100644 --- a/src/core/src/op/shuffle_channels.cpp +++ b/src/core/src/op/shuffle_channels.cpp @@ -15,6 +15,7 @@ #include "ngraph/runtime/reference/shuffle_channels.hpp" #include "ngraph/type/element_type.hpp" #include "ngraph/type/element_type_traits.hpp" +#include "openvino/core/validation_util.hpp" using namespace std; using namespace ngraph; @@ -34,34 +35,24 @@ bool ngraph::op::v0::ShuffleChannels::visit_attributes(AttributeVisitor& visitor } size_t op::ShuffleChannels::get_zero_based_axis() const { - if (m_axis >= 0) { - return m_axis; + const auto input_rank = get_input_partial_shape(0).rank(); + if (input_rank.is_static()) { + return ov::normalize_axis(this, m_axis, input_rank); } else { - if (!get_input_partial_shape(0).rank().is_dynamic()) { - return m_axis + get_input_partial_shape(0).rank().get_length(); - } else { - throw ngraph_error("Cannot request zero-based axis with a input of unknown rank"); - } + throw ngraph_error("Cannot request zero-based axis with a input of unknown rank"); } } void op::ShuffleChannels::validate_and_infer_types() { OV_OP_SCOPE(v0_ShuffleChannels_validate_and_infer_types); - const auto& data_type = get_input_element_type(0); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, get_input_element_type(0), output_shape); } shared_ptr op::ShuffleChannels::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v0_ShuffleChannels_clone_with_new_inputs); - if (new_args.size() != 1) { - throw ngraph_error("Expected 1 element in new_args for the ShuffleChannels op but got " + - std::to_string(new_args.size())); - } - + check_new_args_count(this, new_args); return make_shared(new_args.at(0), m_axis, m_group); } @@ -88,3 +79,11 @@ bool op::ShuffleChannels::has_evaluate() const { OV_OP_SCOPE(v0_ShuffleChannels_has_evaluate); return true; } + +void op::v0::ShuffleChannels::set_axis(int64_t axis) { + m_axis = axis; +} + +void op::v0::ShuffleChannels::set_group(int64_t group) { + m_group = group; +} diff --git a/src/core/src/op/space_to_batch.cpp b/src/core/src/op/space_to_batch.cpp index aeaa0c9197bf62..a36cf37c752e29 100644 --- a/src/core/src/op/space_to_batch.cpp +++ b/src/core/src/op/space_to_batch.cpp @@ -58,13 +58,8 @@ void op::v1::SpaceToBatch::validate_and_infer_types() { "pads_end must be an integral number but got (", pads_end_type, ")."); - std::vector 
output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0), - get_input_partial_shape(1), - get_input_partial_shape(2), - get_input_partial_shape(3)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, data_type, output_shape); } std::shared_ptr ngraph::op::v1::SpaceToBatch::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/op/space_to_depth.cpp b/src/core/src/op/space_to_depth.cpp index 818447e915e8c6..3faa4074f8ba27 100644 --- a/src/core/src/op/space_to_depth.cpp +++ b/src/core/src/op/space_to_depth.cpp @@ -46,11 +46,8 @@ std::shared_ptr ov::op::v0::SpaceToDepth::clone_with_new_inputs(const Outp void ngraph::op::v0::SpaceToDepth::validate_and_infer_types() { OV_OP_SCOPE(v0_SpaceToDepth_validate_and_infer_types); - const auto& data_type = get_input_element_type(0); - std::vector output_shapes = {ov::PartialShape{}}; - const std::vector input_shapes = {get_input_partial_shape(0)}; - shape_infer(this, input_shapes, output_shapes); - set_output_type(0, data_type, output_shapes[0]); + const auto output_shape = shape_infer(this, get_node_input_partial_shapes(*this)).front(); + set_output_type(0, get_input_element_type(0), output_shape); } namespace { @@ -87,6 +84,14 @@ bool ngraph::op::v0::SpaceToDepth::has_evaluate() const { return !get_input_partial_shape(0).is_dynamic(); } +void op::v0::SpaceToDepth::set_block_size(size_t block_size) { + m_blocksize = block_size; +} + +void op::v0::SpaceToDepth::set_mode(SpaceToDepthMode mode) { + m_mode = mode; +} + std::ostream& ov::operator<<(std::ostream& s, const op::v0::SpaceToDepth::SpaceToDepthMode& type) { return s << as_string(type); } diff --git a/src/core/src/runtime/ov_tensor.cpp b/src/core/src/runtime/ov_tensor.cpp index 732dfe72ed4bba..e4d46691d19e68 100644 --- a/src/core/src/runtime/ov_tensor.cpp +++ b/src/core/src/runtime/ov_tensor.cpp @@ -7,6 +7,9 @@ #include "blob_factory.hpp" // IE private header #include "ie_ngraph_utils.hpp" // IE private header #include "openvino/core/except.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/strides.hpp" +#include "openvino/runtime/remote_tensor.hpp" #include "openvino/runtime/tensor.hpp" #include "runtime/blob_allocator.hpp" #include "shape_util.hpp" @@ -94,6 +97,17 @@ Tensor::Tensor(const Tensor& owner, const Coordinate& begin, const Coordinate& e } } +Tensor::Tensor(const ov::Output& port, const Allocator& allocator) + : Tensor(port.get_element_type(), + port.get_partial_shape().is_dynamic() ? ov::Shape{0} : port.get_shape(), + allocator) {} + +Tensor::Tensor(const ov::Output& port, void* host_ptr, const Strides& byte_strides) + : Tensor(port.get_element_type(), + port.get_partial_shape().is_dynamic() ? 
ov::Shape{0} : port.get_shape(), + host_ptr, + byte_strides) {} + element::Type Tensor::get_element_type() const { OV_TENSOR_STATEMENT(return ie::details::convertPrecision(_impl->getTensorDesc().getPrecision())); } @@ -113,6 +127,128 @@ Shape Tensor::get_shape() const { OV_TENSOR_STATEMENT({ return _impl->getTensorDesc().getBlockingDesc().getBlockDims(); }); } +void Tensor::copy_to(ov::Tensor& dst) const { + OV_TENSOR_STATEMENT({ + OPENVINO_ASSERT(dst, "Destination tensor was not initialized."); + OPENVINO_ASSERT(!is(), "Default copy to doesn't support copy from remote tensor."); + OPENVINO_ASSERT(!dst.is(), "Default copy to doesn't support copy to remote tensor."); + OPENVINO_ASSERT(dst.get_element_type() == get_element_type(), + "Tensor element types are not equal. (src: ", + get_element_type(), + " != dst: ", + dst.get_element_type(), + ")"); + if (dst.get_shape() == ov::Shape{0}) + dst.set_shape(get_shape()); + OPENVINO_ASSERT(dst.get_shape() == get_shape(), + "Tensor shapes are not equal. (src: ", + get_shape(), + " != dst: ", + dst.get_shape(), + ")"); + const auto& shape = get_shape(); + auto* src_data = static_cast(data()); + auto* dst_data = static_cast(dst.data()); + ov::Strides src_strides{get_byte_size()}; + ov::Strides dst_strides{dst.get_byte_size()}; + ov::Shape cur_pos{0}; + ov::Shape max_pos{1}; + + if (get_element_type().bitwidth() < 8 || (get_strides() == dst.get_strides() && is_continuous())) { + // OpenVINO doesn't support strides for LP types + // or both tensors have default strides + // Strides and positions already initialized + } else { + // Tensors have default strides + const auto& type = get_element_type(); + std::vector strides(shape.size()); + if (!shape.empty()) { + strides[shape.size() - 1] = 1; + } + auto size = shape.size(); + for (size_t i = 1; i < size; i++) { + strides[size - i - 1] = strides[size - i] * shape[size - i]; + } + + ov::Strides default_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + default_strides[i] = strides[i] * type.size(); + + src_strides = get_strides(); + dst_strides = dst.get_strides(); + + ov::Strides src_str, dst_str; + + // Calculate src and dst shapes + bool found_step = false; + for (size_t i = 0; i < shape.size(); i++) { + size_t inverted_idx = shape.size() - i - 1; + if (!found_step) { + if (default_strides[inverted_idx] == src_strides[inverted_idx] && + src_strides[inverted_idx] == dst_strides[inverted_idx]) { + continue; + } else { + found_step = true; + size_t strides_size = inverted_idx + 1; + // Set right size + src_str.resize(strides_size + 1); + dst_str.resize(strides_size + 1); + max_pos.resize(strides_size + 1); + cur_pos.resize(strides_size + 1); + // In case of default continuous strides we can copy several elements + // In other case only one element + size_t dim = 1; + size_t strides = type.size(); + + if (strides_size < default_strides.size()) { + strides = default_strides[strides_size]; + dim = get_shape()[strides_size]; + } + src_str[strides_size] = strides; + dst_str[strides_size] = strides; + max_pos[strides_size] = dim; + cur_pos[strides_size] = 0; + } + } + src_str[inverted_idx] = src_strides[inverted_idx]; + dst_str[inverted_idx] = dst_strides[inverted_idx]; + max_pos[inverted_idx] = shape[inverted_idx]; + cur_pos[inverted_idx] = 0; + } + src_strides = src_str; + dst_strides = dst_str; + } + + const auto update_index = [](const ov::Shape& pos, const ov::Shape& shape, const ov::Strides& strides) { + size_t offset = 0; + + for (size_t i = 0; i < pos.size(); i++) { + offset += 
pos[i] * strides[i]; + } + return offset; + }; + + bool finish = false; + for (size_t dst_idx = 0, src_idx = 0; !finish;) { + memcpy(dst_data + dst_idx, src_data + src_idx, src_strides[src_strides.size() - 1]); + // update indexes + for (size_t i = 0; i < cur_pos.size(); i++) { + size_t inverted_idx = cur_pos.size() - i - 1; + cur_pos[inverted_idx]++; + if (cur_pos[inverted_idx] != max_pos[inverted_idx]) { + break; + } + if (inverted_idx) + cur_pos[inverted_idx] = 0; + else + finish = true; + } + src_idx = update_index(cur_pos, max_pos, src_strides); + dst_idx = update_index(cur_pos, max_pos, dst_strides); + } + }); +} + Strides Tensor::get_strides() const { OPENVINO_ASSERT(get_element_type().bitwidth() >= 8, "Could not get strides for types with bitwidths less then 8 bit. Tensor type: ", @@ -174,24 +310,26 @@ Tensor::operator bool() const noexcept { } bool Tensor::is_continuous() const { - if (get_element_type().bitwidth() < 8) - // OpenVINO doesn't support strides for lp types - return true; - const auto& shape = get_shape(); - const auto& type = get_element_type(); - std::vector strides(shape.size()); - if (!shape.empty()) { - strides[shape.size() - 1] = 1; - } - auto size = shape.size(); - for (size_t i = 1; i < size; i++) { - strides[size - i - 1] = strides[size - i] * shape[size - i]; - } + OV_TENSOR_STATEMENT({ + if (get_element_type().bitwidth() < 8) + // OpenVINO doesn't support strides for lp types + return true; + const auto& shape = get_shape(); + const auto& type = get_element_type(); + std::vector strides(shape.size()); + if (!shape.empty()) { + strides[shape.size() - 1] = 1; + } + auto size = shape.size(); + for (size_t i = 1; i < size; i++) { + strides[size - i - 1] = strides[size - i] * shape[size - i]; + } - ov::Strides byte_strides(strides.size()); - for (size_t i = 0; i < strides.size(); ++i) - byte_strides[i] = strides[i] * type.size(); - return byte_strides == get_strides(); + ov::Strides byte_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + byte_strides[i] = strides[i] * type.size(); + return byte_strides == get_strides(); + }); } } // namespace ov diff --git a/src/core/src/validation_util.cpp b/src/core/src/validation_util.cpp index 7b987123fd4a64..b1f96977ccce74 100644 --- a/src/core/src/validation_util.cpp +++ b/src/core/src/validation_util.cpp @@ -938,8 +938,7 @@ int64_t ov::normalize_axis(const std::string& node_description, OPENVINO_ASSERT((axis_range_min <= axis) && (axis <= axis_range_max), node_description, normalize_axis_error_msg(axis, axis_range_min, axis_range_max)); - normalize_axis_to(tensor_rank)(axis); - return axis; + return normalize(axis, tensor_rank); } void ngraph::opset1::infer_conv_backprop_auto_padding(const Shape& input_data_shape, diff --git a/src/core/tests/ov_tensor_test.cpp b/src/core/tests/ov_tensor_test.cpp index 68261e854c3147..98dbf9f2383069 100644 --- a/src/core/tests/ov_tensor_test.cpp +++ b/src/core/tests/ov_tensor_test.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,7 +14,11 @@ #include "ngraph/coordinate_transform.hpp" #include "openvino/core/except.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/type/element_type_traits.hpp" +#include "openvino/op/parameter.hpp" #include "openvino/runtime/allocator.hpp" +#include "openvino/runtime/remote_tensor.hpp" #include "openvino/runtime/tensor.hpp" using OVTensorTest = ::testing::Test; @@ -40,6 +45,26 @@ TEST_F(OVTensorTest, canCreateTensor) { ASSERT_THROW(t.data(), ov::Exception); } +TEST_F(OVTensorTest, 
createTensorFromPort) { + auto parameter1 = std::make_shared(ov::element::f64, ov::Shape{1, 3, 2, 2}); + auto parameter2 = std::make_shared(ov::element::f32, ov::Shape{1, 3}); + auto parameter3 = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + float data[] = {5.f, 6.f, 7.f}; + ov::Tensor t1{parameter1->output(0)}; + ov::Tensor t2{parameter2->output(0), data}; + ov::Tensor t3{parameter3->output(0)}; + ov::Tensor t4{parameter3->output(0), data}; + + EXPECT_EQ(t1.get_shape(), parameter1->get_shape()); + EXPECT_EQ(t1.get_element_type(), parameter1->get_element_type()); + EXPECT_EQ(t2.get_shape(), parameter2->get_shape()); + EXPECT_EQ(t2.get_element_type(), parameter2->get_element_type()); + EXPECT_EQ(t3.get_shape(), ov::Shape{0}); + EXPECT_EQ(t3.get_element_type(), parameter3->get_element_type()); + EXPECT_EQ(t4.get_shape(), ov::Shape{0}); + EXPECT_EQ(t4.get_element_type(), parameter3->get_element_type()); +} + TEST_F(OVTensorTest, canAccessF16Tensor) { ov::Shape shape = {4, 3, 2}; ov::Tensor t{ov::element::f16, shape}; @@ -281,3 +306,201 @@ TEST_F(OVTensorTest, readRangeRoiBlob) { } } } + +struct TestParams { + ov::Shape src_shape; + ov::Strides src_strides; + ov::Shape dst_shape; + ov::Strides dst_strides; +}; + +struct OVTensorTestCopy : ::testing::TestWithParam> {}; + +namespace { +template +std::vector fill_data(const ov::Tensor& tensor) { + std::vector actual; + const T* data = tensor.data(); + auto strides = tensor.get_strides(); + for (auto&& c : ngraph::CoordinateTransformBasic{tensor.get_shape()}) { + actual.emplace_back( + *(data + (c[2] * strides[2] + c[1] * strides[1] + c[0] * strides[0]) / tensor.get_element_type().size())); + } + return actual; +}; +template +void compare_data(const ov::Tensor& src, const ov::Tensor& dst) { + auto source_vec = fill_data(src); + auto dest_vec = fill_data(dst); + + ASSERT_EQ(source_vec.size(), dest_vec.size()); + + for (size_t i = 0; i < source_vec.size(); i++) { + EXPECT_EQ(source_vec[i], dest_vec[i]); + } +}; + +template +void init_tensor(const ov::Tensor& tensor, bool input) { + const auto origPtr = tensor.data(); + ASSERT_NE(nullptr, origPtr); + for (size_t i = 0; i < tensor.get_size(); ++i) { + origPtr[i] = static_cast(input ? 
i : -1); + } +} + +void init_tensor(const ov::Tensor& tensor, bool input) { + switch (tensor.get_element_type()) { + case ov::element::bf16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::f16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::f32: + init_tensor::value_type>(tensor, input); + break; + case ov::element::f64: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i8: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i32: + init_tensor::value_type>(tensor, input); + break; + case ov::element::i64: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u8: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u16: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u32: + init_tensor::value_type>(tensor, input); + break; + case ov::element::u64: + init_tensor::value_type>(tensor, input); + break; + default: + OPENVINO_UNREACHABLE("Unsupported data type"); + } +} + +void compare_tensors(const ov::Tensor& src, const ov::Tensor& dst) { + ASSERT_EQ(src.get_byte_size(), dst.get_byte_size()); + ASSERT_EQ(src.get_shape(), dst.get_shape()); + ASSERT_EQ(src.get_element_type(), dst.get_element_type()); + switch (src.get_element_type()) { + case ov::element::bf16: + compare_data::value_type>(src, dst); + break; + case ov::element::f16: + compare_data::value_type>(src, dst); + break; + case ov::element::f32: + compare_data::value_type>(src, dst); + break; + case ov::element::f64: + compare_data::value_type>(src, dst); + break; + case ov::element::i8: + compare_data::value_type>(src, dst); + break; + case ov::element::i16: + compare_data::value_type>(src, dst); + break; + case ov::element::i32: + compare_data::value_type>(src, dst); + break; + case ov::element::i64: + compare_data::value_type>(src, dst); + break; + case ov::element::u8: + compare_data::value_type>(src, dst); + break; + case ov::element::u16: + compare_data::value_type>(src, dst); + break; + case ov::element::u32: + compare_data::value_type>(src, dst); + break; + case ov::element::u64: + compare_data::value_type>(src, dst); + break; + default: + OPENVINO_UNREACHABLE("Unsupported data type"); + } +} +} // namespace + +TEST_P(OVTensorTestCopy, copy_to) { + ov::element::Type type; + TestParams p; + std::tie(type, p) = GetParam(); + // Source tensors + ov::Tensor full_src_tensor; + ov::Tensor src_tensor; + if (!p.src_strides.empty()) { + full_src_tensor = ov::Tensor(type, ov::Shape{p.src_shape[0] * p.src_strides[0]}); + src_tensor = ov::Tensor(type, p.src_shape, full_src_tensor.data(), p.src_strides); + } else { + src_tensor = full_src_tensor = ov::Tensor(type, p.src_shape); + } + init_tensor(full_src_tensor, true); + + ov::Tensor full_dst_tensor; + ov::Tensor dst_tensor; + if (!p.dst_strides.empty()) { + full_dst_tensor = ov::Tensor(type, ov::Shape{p.dst_shape[0] * p.dst_strides[0]}); + dst_tensor = ov::Tensor(type, p.dst_shape, full_dst_tensor.data(), p.dst_strides); + } else { + dst_tensor = full_dst_tensor = ov::Tensor(type, p.dst_shape); + } + init_tensor(full_src_tensor, false); + + src_tensor.copy_to(dst_tensor); + compare_tensors(src_tensor, dst_tensor); +} + +// clang-format off +INSTANTIATE_TEST_SUITE_P(copy_tests, + OVTensorTestCopy, + ::testing::Combine(::testing::Values( + ov::element::bf16, + ov::element::f16, + ov::element::f32, + ov::element::f64, + ov::element::i8, + ov::element::i16, + 
ov::element::i32, + ov::element::i64, + ov::element::u8, + ov::element::u16, + ov::element::u32, + ov::element::u64 + ), + ::testing::Values( + TestParams { + ov::Shape{1, 3, 4, 8}, {}, + {0}, {} + }, + TestParams { + ov::Shape{3, 2, 2}, {}, + ov::Shape{3, 2, 2}, ov::Strides{128, 24, 8} + }, + TestParams { + ov::Shape{3, 2, 2}, ov::Strides{64, 16, 8}, + ov::Shape{3, 2, 2}, ov::Strides{} + }, + TestParams { + ov::Shape{3, 2, 2}, ov::Strides{64, 16, 8}, + ov::Shape{3, 2, 2}, ov::Strides{128, 24, 8} + } + ))); +// clang-format on diff --git a/src/core/tests/type_prop/batch_to_space.cpp b/src/core/tests/type_prop/batch_to_space.cpp index 5870283161ae7c..97a9fd57b883a2 100644 --- a/src/core/tests/type_prop/batch_to_space.cpp +++ b/src/core/tests/type_prop/batch_to_space.cpp @@ -10,7 +10,7 @@ using namespace std; using namespace ngraph; -#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +using namespace testing; namespace { constexpr size_t data_input_idx = 0; @@ -339,7 +339,7 @@ TEST(type_prop, batch_to_space_output_shape_5D) { ASSERT_EQ(batch_to_space->get_shape(), (Shape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); } -TEST(type_prop, batch_to_space_output_dynamicshape_5D_when_batch_is_static) { +TEST(type_prop, batch_to_space_output_dynamic_shape_5D_when_batch_is_static) { auto data = make_shared(element::f32, PartialShape{960, {2, 20}, {12, 14}, {100, 150}, {10, 20}}); auto block_shape = make_shared(element::i32, Shape{5}, vector{1, 6, 5, 1, 16}); auto crops_begin = make_shared(element::i32, Shape{5}, vector{0, 2, 0, 0, 0}); @@ -354,20 +354,37 @@ TEST(type_prop, batch_to_space_output_dynamicshape_5D_when_batch_is_static) { {10 * 16, 20 * 16}})); } -TEST(type_prop, batch_to_space_output_dynamicshape_5D_when_batch_is_dynamic) { - auto data = - make_shared(element::f32, PartialShape{{959, 962}, {2, 34}, {9, 21}, {100, 162}, {1, 1999}}); +TEST(type_prop, batch_to_space_output_dynamic_shape_5D_when_batch_is_dynamic) { + auto data_shape = PartialShape{{959, 962}, {2, 34}, {9, 21}, {100, 162}, {1, 1999}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); auto block_shape = make_shared(element::i32, Shape{5}, vector{1, 6, 5, 1, 16}); auto crops_begin = make_shared(element::i32, Shape{5}, vector{0, 2, 0, 0, 0}); auto crops_end = make_shared(element::i32, Shape{5}, vector{0, 2, 1, 0, 0}); auto batch_to_space = make_shared(data, block_shape, crops_begin, crops_end); - ASSERT_EQ(batch_to_space->get_output_partial_shape(0), - (PartialShape{{DIV_ROUND_UP(959, (6 * 5 * 16)), 962 / (6 * 5 * 16)}, + EXPECT_EQ(batch_to_space->get_output_partial_shape(0), + (PartialShape{{ceil_div(959, (6 * 5 * 16)), 962 / (6 * 5 * 16)}, {2 * 6 - 2 - 2, 34 * 6 - 2 - 2}, {9 * 5 - 1, 21 * 5 - 1}, {100, 162}, {1 * 16, 1999 * 16}})); + EXPECT_THAT(get_shape_labels(batch_to_space->get_output_partial_shape(0)), + ElementsAre(ov::no_label, ov::no_label, ov::no_label, 13, ov::no_label)); +} + +TEST(type_prop, batch_to_space_input_interval_shape_block_one) { + auto data_shape = PartialShape{{959, 962}, {2, 34}, {9, 21}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); + auto block_shape = make_shared(element::i32, Shape{3}, vector{1, 1, 1}); + auto crops_begin = make_shared(element::i32, Shape{3}, vector{0, 0, 0}); + auto crops_end = make_shared(element::i32, Shape{3}, vector{0, 0, 1}); + auto batch_to_space = make_shared(data, block_shape, crops_begin, crops_end); + + EXPECT_EQ(batch_to_space->get_output_partial_shape(0), + 
PartialShape({{959, 962}, {2, 34}, {9 * 1 - 1, 21 * 1 - 1}})); + EXPECT_THAT(get_shape_labels(batch_to_space->get_output_partial_shape(0)), ElementsAre(10, 11, ov::no_label)); } TEST(type_prop, batch_to_space_and_space_to_batch) { @@ -407,3 +424,20 @@ TEST(type_prop, batch_to_space_dynamic_shape_dynamic_rank) { ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_output_partial_shape(0), PartialShape::dynamic()); } + +TEST(type_prop, batch_to_space_default_ctor) { + auto data = make_shared(element::i16, Shape{100, 7, 13, 3}); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 10, 5, 1}); + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 1, 0}); + auto crops_end = make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); + + auto batch_to_space = make_shared(); + + batch_to_space->set_arguments(OutputVector{data, block_shape, crops_begin, crops_end}); + batch_to_space->validate_and_infer_types(); + + EXPECT_EQ(batch_to_space->get_input_size(), 4); + EXPECT_EQ(batch_to_space->get_output_size(), 1); + EXPECT_EQ(batch_to_space->get_element_type(), element::i16); + EXPECT_EQ(batch_to_space->get_shape(), (Shape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); +} diff --git a/src/core/tests/type_prop/depth_to_space.cpp b/src/core/tests/type_prop/depth_to_space.cpp index 6101fdd731392e..935730a78b9b10 100644 --- a/src/core/tests/type_prop/depth_to_space.cpp +++ b/src/core/tests/type_prop/depth_to_space.cpp @@ -8,74 +8,86 @@ using namespace std; using namespace ngraph; +using namespace testing; -#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +TEST(type_prop, depth_to_space_input_interval_shape_block_first_5D_when_depth_is_static) { + auto a_shape = PartialShape{{2, 10}, 24, {3, 7}, {423, 3000}, {235, 1345}}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); -TEST(type_prop, depth_to_space_output_dynamicshape_block_first_5D_when_depth_is_static) { - auto A = make_shared(element::f32, PartialShape{{2, 10}, 24, {3, 7}, {423, 3000}, {235, 1345}}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); - - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), + EXPECT_EQ(depth_to_space->get_output_element_type(0), element::f32); + EXPECT_EQ(depth_to_space->get_output_partial_shape(0), (PartialShape{{2, 10}, 3, {3 * 2, 7 * 2}, {423 * 2, 3000 * 2}, {235 * 2, 1345 * 2}})); + EXPECT_THAT(get_shape_labels(depth_to_space->get_output_partial_shape(0)), + ElementsAre(10, ov::no_label, ov::no_label, ov::no_label, ov::no_label)); +} + +TEST(type_prop, depth_to_space_input_interval_shape_default_block_size) { + auto a_shape = PartialShape{{2, 10}, 24, {3, 7}, {423, 3000}, {235, 1345}}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST); + + EXPECT_EQ(depth_to_space->get_output_element_type(0), element::f32); + EXPECT_EQ(depth_to_space->get_output_partial_shape(0), a_shape); + EXPECT_THAT(get_shape_labels(depth_to_space->get_output_partial_shape(0)), ElementsAre(10, 11, 12, 13, 14)); } TEST(type_prop, depth_to_space_output_dynamicshape_block_first_5D_when_depth_is_dynamic) { auto A = make_shared(element::f32, PartialShape{{2, 10}, {81, 82}, {3, 7}, {423, 3000}, {235, 1345}}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 
3); - - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), - (PartialShape{{2, 10}, - {DIV_ROUND_UP(81, 27), 82 / 27}, - {3 * 3, 7 * 3}, - {423 * 3, 3000 * 3}, - {235 * 3, 1345 * 3}})); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 3); + + ASSERT_EQ( + depth_to_space->get_output_partial_shape(0), + (PartialShape{{2, 10}, {ceil_div(81, 27), 82 / 27}, {3 * 3, 7 * 3}, {423 * 3, 3000 * 3}, {235 * 3, 1345 * 3}})); } TEST(type_prop, depth_to_space_output_shape_block_first_4D) { auto A = make_shared(element::f32, Shape{1, 128, 8, 8}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 8); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 8); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 2, 64, 64})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 2, 64, 64})); } TEST(type_prop, depth_to_space_output_shape_block_first_4D_2) { auto A = make_shared(element::f32, Shape{1, 12, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_output_shape_block_first_5D) { auto A = make_shared(element::f32, Shape{1, 16, 3, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_output_shape_depth_first_4D) { auto A = make_shared(element::f32, Shape{1, 12, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_output_shape_depth_first_5D) { auto A = make_shared(element::f32, Shape{1, 16, 3, 1080, 1616}); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_shape(), (Shape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); } TEST(type_prop, depth_to_space_input_rank_not_supported) { auto A = make_shared(element::f32, Shape{1, 8}); try { - auto 
space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); FAIL() << "Not supported input shape for DepthToSpace exception not thrown"; } catch (const ngraph_error& error) { EXPECT_HAS_SUBSTRING(error.what(), "The input tensor with rank lower than 3 is not supported (input rank: 2)"); @@ -87,7 +99,7 @@ TEST(type_prop, depth_to_space_input_rank_not_supported) { TEST(type_prop, depth_to_space_blocksize_not_matched) { auto A = make_shared(element::f32, Shape{1, 7, 4, 4}); try { - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); FAIL() << "Not matched blocksize for DepthToSpace exception not thrown"; } catch (const ngraph_error& error) { EXPECT_HAS_SUBSTRING(error.what(), "Dimension value: [ 7, 7] must be a multiple of divisor: 4"); @@ -98,16 +110,34 @@ TEST(type_prop, depth_to_space_blocksize_not_matched) { TEST(type_prop, depth_to_space_dynamic_shape_static_rank) { auto A = make_shared(element::f32, PartialShape::dynamic(4)); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), PartialShape::dynamic(4)); + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_output_partial_shape(0), PartialShape::dynamic(4)); } TEST(type_prop, depth_to_space_dynamic_shape_dynamic_rank) { auto A = make_shared(element::f32, PartialShape::dynamic()); - auto space_to_depth = make_shared(A, op::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); + auto depth_to_space = make_shared(A, "depth_first", 2); + + ASSERT_EQ(depth_to_space->get_element_type(), element::f32); + ASSERT_EQ(depth_to_space->get_output_partial_shape(0), PartialShape::dynamic()); +} - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), PartialShape::dynamic()); +TEST(type_prop, depth_to_space_default_ctor) { + const auto a_shape = PartialShape{{2, 10}, 27, {0, 54}, {9, -1}}; + const auto A = make_shared(element::u32, a_shape); + + const auto depth_to_space = make_shared(); + depth_to_space->set_block_size(3); + depth_to_space->set_mode(op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST); + depth_to_space->set_argument(0, A); + depth_to_space->validate_and_infer_types(); + + EXPECT_EQ(depth_to_space->get_block_size(), 3); + EXPECT_EQ(depth_to_space->get_mode(), op::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST); + EXPECT_EQ(depth_to_space->get_input_size(), 1); + EXPECT_EQ(depth_to_space->get_output_size(), 1); + EXPECT_EQ(depth_to_space->get_output_element_type(0), element::u32); + EXPECT_EQ(depth_to_space->get_output_partial_shape(0), (PartialShape{{2, 10}, 3, {0 * 3, 54 * 3}, {9 * 3, -1}})); } diff --git a/src/core/tests/type_prop/scatter_elements_update.cpp b/src/core/tests/type_prop/scatter_elements_update.cpp index 3e2d031242cca9..269d06de74ea16 100644 --- a/src/core/tests/type_prop/scatter_elements_update.cpp +++ b/src/core/tests/type_prop/scatter_elements_update.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common_test_utils/test_assertions.hpp" #include "gtest/gtest.h" #include "ngraph/ngraph.hpp" #include 
"util/type_prop.hpp" using namespace std; using namespace ngraph; +using namespace testing; TEST(type_prop, scatter_elements_update_output_shape) { Shape data_shape{2, 4, 5, 7}; @@ -28,18 +30,37 @@ TEST(type_prop, scatter_elements_update_output_shape) { TEST(type_prop, scatter_elements_update_output_partial_dyn_shape) { PartialShape data_shape{2, Dimension::dynamic(), 5}; + set_shape_labels(data_shape, 10); PartialShape indices_shape{Dimension::dynamic(), 2, 2}; PartialShape updates_shape{2, 2, Dimension::dynamic()}; PartialShape axis_shape = PartialShape::dynamic(); - auto data = make_shared(element::f32, data_shape); + auto data = make_shared(element::f64, data_shape); auto indices = make_shared(element::i16, indices_shape); - auto updates = make_shared(element::f32, updates_shape); + auto updates = make_shared(element::f64, updates_shape); auto axis = make_shared(element::i16, axis_shape); auto scatter = make_shared(data, indices, updates, axis); - EXPECT_TRUE(scatter->get_output_partial_shape(0).same_scheme(data_shape)); + EXPECT_EQ(scatter->get_output_element_type(0), element::f64); + EXPECT_EQ(scatter->get_output_partial_shape(0), data_shape); + EXPECT_THAT(get_shape_labels(scatter->get_output_partial_shape(0)), ElementsAre(10, 11, 12)); +} + +TEST(type_prop, scatter_elements_update_data_has_interval_dimensions) { + PartialShape data_shape{{5, 10}, -1, {-1, 3}, {8, -1}}; + set_shape_labels(data_shape, 10); + + const auto data = make_shared(element::i64, data_shape); + const auto indices = make_shared(element::i16, PartialShape{1, 2, 2, {2, 3}}); + const auto updates = make_shared(element::i64, PartialShape{{0, 2}, -1, 2, -1}); + const auto axis = make_shared(element::i16, PartialShape::dynamic()); + + const auto scatter = make_shared(data, indices, updates, axis); + + EXPECT_EQ(scatter->get_output_element_type(0), element::i64); + EXPECT_EQ(scatter->get_output_partial_shape(0), data_shape); + EXPECT_THAT(get_shape_labels(scatter->get_output_partial_shape(0)), ElementsAre(10, 11, 12, 13)); } TEST(type_prop, scatter_elements_update_output_full_dyn_shape) { @@ -55,7 +76,42 @@ TEST(type_prop, scatter_elements_update_output_full_dyn_shape) { auto scatter = make_shared(data, indices, updates, axis); - EXPECT_TRUE(scatter->get_output_partial_shape(0).same_scheme(data_shape)); + EXPECT_EQ(scatter->get_output_element_type(0), element::f32); + EXPECT_EQ(scatter->get_output_partial_shape(0), data_shape); +} + +TEST(type_prop, scatter_elements_update_default_ctor) { + const auto data = make_shared(element::f32, PartialShape{2, 5, 5, 6}); + const auto indices = make_shared(element::i16, PartialShape{1, 2, 1, 3}); + const auto updates = make_shared(element::f32, PartialShape{1, 2, 1, 3}); + const auto axis = make_shared(element::i16, Shape{}, -4); + + const auto scatter = make_shared(data, indices, updates, axis); + scatter->set_arguments(OutputVector{data, indices, updates, axis}); + scatter->validate_and_infer_types(); + + EXPECT_EQ(scatter->get_input_size(), 4); + EXPECT_EQ(scatter->get_output_size(), 1); + EXPECT_EQ(scatter->get_output_element_type(0), element::f32); + EXPECT_EQ(scatter->get_output_partial_shape(0), PartialShape({2, 5, 5, 6})); + EXPECT_THAT(get_shape_labels(scatter->get_output_partial_shape(0)), Each(ov::no_label)); +} + +TEST(type_prop, scatter_elements_update_preserve_partial_values_and_labels_via_evaluates_bounds) { + const auto data = op::Constant::create(element::i64, Shape{4}, {2, 3, 15, 4}); + const auto indices = op::Constant::create(element::i64, Shape{2}, {3, 0}); + 
auto updates_shape = PartialShape{{10, 20}, {3, 4}}; + set_shape_labels(updates_shape, 20); + const auto axis = make_shared(element::i16, Shape{}, 0); + + const auto shape_of_u = std::make_shared(std::make_shared(element::i64, updates_shape)); + const auto scatter = make_shared(data, indices, shape_of_u, axis); + + auto param = std::make_shared(element::f32, PartialShape{1}); + auto bc = std::make_shared(param, scatter, op::BroadcastType::BIDIRECTIONAL); + + EXPECT_EQ(bc->get_output_partial_shape(0), PartialShape({{3, 4}, 3, 15, {10, 20}})); + EXPECT_THAT(get_shape_labels(bc->get_output_partial_shape(0)), ElementsAre(21, ov::no_label, ov::no_label, 20)); } TEST(type_prop, scatter_elements_update_axis_validation) { @@ -69,14 +125,9 @@ TEST(type_prop, scatter_elements_update_axis_validation) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{8}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected axis with value out of the range"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Axis value has to be in range")); - } catch (...) { - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + ov::AssertFailure, + HasSubstr("Parameter axis 8 out of the tensor rank range [-4, 3]")); } TEST(type_prop, scatter_elements_updates_indices_shape) { @@ -90,14 +141,9 @@ TEST(type_prop, scatter_elements_updates_indices_shape) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{1}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected incompatibile indices and updates shape"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Indices and updates input shapes are required to be equal")); - } catch (...) { - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + NodeValidationFailure, + HasSubstr("Indices and updates input shapes are required to be equal")); } TEST(type_prop, scatter_elements_updates_indices_rank) { @@ -111,14 +157,9 @@ TEST(type_prop, scatter_elements_updates_indices_rank) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{1}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected incompatibile indices and updates shape"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Indices and updates input shapes are required to be equal")); - } catch (...) 
{ - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + NodeValidationFailure, + HasSubstr("Indices and updates input shapes are required to be equal")); } TEST(type_prop, scatter_elements_data_indices_rank) { @@ -132,12 +173,7 @@ TEST(type_prop, scatter_elements_data_indices_rank) { auto updates = make_shared(element::f32, updates_shape); auto axis = make_shared(element::i16, axis_shape, std::vector{1}); - try { - auto scatter = make_shared(data, indices, updates, axis); - FAIL() << "Not detected incompatibile indices and data rank"; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), std::string("Indices rank and data rank are required to be equal")); - } catch (...) { - FAIL() << "Deduced type check failed for unexpected reason"; - } + OV_EXPECT_THROW(auto scatter = make_shared(data, indices, updates, axis), + NodeValidationFailure, + HasSubstr("Indices rank and data rank are required to be equal")); } diff --git a/src/core/tests/type_prop/shuffle_channels.cpp b/src/core/tests/type_prop/shuffle_channels.cpp index ce54933ad243a1..4bd5a8bf28c380 100644 --- a/src/core/tests/type_prop/shuffle_channels.cpp +++ b/src/core/tests/type_prop/shuffle_channels.cpp @@ -2,12 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "gtest/gtest.h" +#include "common_test_utils/test_assertions.hpp" +#include "gmock/gmock.h" #include "ngraph/ngraph.hpp" #include "util/type_prop.hpp" using namespace std; using namespace ngraph; +using namespace testing; TEST(type_prop, shuffle_channels_default_4D) { const auto data_input_shape = Shape{3, 9, 4, 5}; @@ -30,7 +32,8 @@ TEST(type_prop, shuffle_channels_basic_4D) { } TEST(type_prop, shuffle_channels_dynamic_4D) { - const auto data_input_shape = PartialShape{Dimension::dynamic(), Dimension(3, 9), 4, Dimension(4, 15)}; + auto data_input_shape = PartialShape{Dimension::dynamic(), Dimension(3, 9), 4, Dimension(4, 15)}; + set_shape_labels(data_input_shape, 10); const auto data = make_shared(element::f32, data_input_shape); const auto axis = 1; const auto group = 3; @@ -38,6 +41,7 @@ TEST(type_prop, shuffle_channels_dynamic_4D) { EXPECT_EQ(shuffle_channels->get_element_type(), element::f32); EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_input_shape); + EXPECT_THAT(get_shape_labels(shuffle_channels->get_output_partial_shape(0)), ElementsAre(10, ov::no_label, 12, 13)); } TEST(type_prop, shuffle_channels_dynamic_fully) { @@ -108,16 +112,11 @@ TEST(type_prop, shuffle_channels_ND_smaller) { } TEST(type_prop, shuffle_channels_axis_validation) { - try { - const auto data = make_shared(element::f64, Shape{1, 2, 3, 4}); - const auto shuffle_channels = make_shared(data, -5, 5); - FAIL() << "ShuffleChannels validation did not work. 
Op node was created with incorrect " - "params."; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "The 'axis' parameter for ShuffleChannels has to point to one of the " - "input tensor's shape dimensions"); - } + const auto data = make_shared(element::f64, Shape{1, 2, 3, 4}); + + OV_EXPECT_THROW(const auto op = make_shared(data, -5, 5), + ov::AssertFailure, + HasSubstr("ShuffleChannels Parameter axis -5 out of the tensor rank range [-4, 3]")); } TEST(type_prop, shuffle_channels_negative_axis_calculation) { @@ -155,24 +154,36 @@ TEST(type_prop, shuffle_channels_infer_shape_with_negative_axis_calculation) { } TEST(type_prop, shuffle_channels_invalid_input_shape) { - try { - const auto data = make_shared(element::f64, Shape{}); - const auto shuffle_channels = make_shared(data, 0, 1); - FAIL() << "ShuffleChannels validation did not work. Op node was created with incorrect " - "params."; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), "The input tensor's shape is expected to be at least 1D."); - } + const auto data = make_shared(element::f64, Shape{}); + + OV_EXPECT_THROW(const auto op = make_shared(data, 0, 1), + NodeValidationFailure, + HasSubstr("The input tensor's shape is expected to be at least 1D.")); } TEST(type_prop, shuffle_channels_invalid_groups_value) { - try { - const auto data = make_shared(element::f64, Shape{1, 2, 3, 15}); - const auto shuffle_channels = make_shared(data, -1, 2); - FAIL() << "ShuffleChannels validation did not work. Op node was created with incorrect " - "params."; - } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "The channel dimension size has to be a multiple of the groups parameter value."); - } + const auto data = make_shared(element::f64, Shape{1, 2, 3, 15}); + + OV_EXPECT_THROW(const auto op = make_shared(data, -1, 2), + NodeValidationFailure, + HasSubstr("The channel dimension size has to be a multiple of the groups parameter value.")); +} + +TEST(type_prop, shuffle_channels_default_ctor) { + const auto data_shape = PartialShape{{2, 5}, {0, 2}, 3, {2, -1}}; + const auto data = make_shared(element::i32, data_shape); + + const auto shuffle_channels = make_shared(); + shuffle_channels->set_axis(-3); + shuffle_channels->set_group(3); + shuffle_channels->set_argument(0, data); + shuffle_channels->validate_and_infer_types(); + + EXPECT_EQ(shuffle_channels->get_axis(), -3); + EXPECT_EQ(shuffle_channels->get_zero_based_axis(), 1); + EXPECT_EQ(shuffle_channels->get_group(), 3); + EXPECT_EQ(shuffle_channels->get_input_size(), 1); + EXPECT_EQ(shuffle_channels->get_output_size(), 1); + EXPECT_EQ(shuffle_channels->get_element_type(), element::i32); + EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_shape); } diff --git a/src/core/tests/type_prop/space_to_batch.cpp b/src/core/tests/type_prop/space_to_batch.cpp index 682d71363cedce..b3d5b9bcbb2a89 100644 --- a/src/core/tests/type_prop/space_to_batch.cpp +++ b/src/core/tests/type_prop/space_to_batch.cpp @@ -2,12 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "gtest/gtest.h" +#include "gmock/gmock.h" #include "ngraph/ngraph.hpp" #include "util/type_prop.hpp" using namespace std; using namespace ngraph; +using namespace testing; #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) @@ -65,31 +66,52 @@ TEST(type_prop, space_to_batch_and_batch_to_space) { } TEST(type_prop, space_to_batch_when_space_is_static) { - auto data = make_shared(element::f32, PartialShape{{2, 5}, 100, 1024, 3}); 
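+    // Note (added comment): labels are set on the input shape so the test can also assert that none of
+    // them survive on the output; every output dimension is recomputed from block_shape and the pads,
+    // hence the Each(ov::no_label) expectation below.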
+ auto data_shape = PartialShape{{2, 5}, 100, 1024, 3}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 12, 100, 2}); auto pads_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 38, 1}); auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 5, 38, 0}); auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); - ASSERT_EQ( + EXPECT_EQ( space_to_batch->get_output_partial_shape(0), (PartialShape{{2 * 12 * 100 * 2, 5 * 12 * 100 * 2}, (100 + 3 + 5) / 12, (1024 + 38 + 38) / 100, (3 + 1) / 2})); + EXPECT_THAT(get_shape_labels(space_to_batch->get_output_partial_shape(0)), Each(ov::no_label)); +} + +TEST(type_prop, space_to_batch_when_data_dynamic_) { + auto data_shape = PartialShape{{2, 5}, {5, 100}, {100, 1024}, {3, 10}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 1, 1, 1}); + auto pads_begin = make_shared(element::i64, Shape{4}, vector{1, 0, 2, 0}); + auto pads_end = make_shared(element::i64, Shape{4}, vector{1, 0, 3, 0}); + + auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); + + EXPECT_EQ(space_to_batch->get_output_partial_shape(0), + PartialShape({{2, 5}, {5, 100}, {(100 + 2 + 3) / 1, (1024 + 2 + 3) / 1}, {3, 10}})); + EXPECT_THAT(get_shape_labels(space_to_batch->get_output_partial_shape(0)), ElementsAre(10, 11, ov::no_label, 13)); } TEST(type_prop, space_to_batch_when_space_is_dynamic) { - auto data = make_shared(element::f32, PartialShape{{2, 5}, {5, 100}, {100, 1024}, {3, 10}}); + auto data_shape = PartialShape{{2, 5}, {5, 100}, {100, 1024}, {3, 10}}; + set_shape_labels(data_shape, 10); + auto data = make_shared(element::f32, data_shape); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 12, 100, 2}); auto pads_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 38, 1}); auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 5, 38, 0}); auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); - ASSERT_EQ(space_to_batch->get_output_partial_shape(0), + EXPECT_EQ(space_to_batch->get_output_partial_shape(0), (PartialShape{{2 * 12 * 100 * 2, 5 * 12 * 100 * 2}, {DIV_ROUND_UP((5 + 5 + 3), 12), (100 + 5 + 3) / 12}, {DIV_ROUND_UP((100 + 38 + 38), 100), (1024 + 38 + 38) / 100}, {DIV_ROUND_UP((3 + 1), 2), (10 + 1) / 2}})); + EXPECT_THAT(get_shape_labels(space_to_batch->get_output_partial_shape(0)), Each(ov::no_label)); } TEST(type_prop, space_to_batch_dynamic_shape_static_rank) { @@ -116,6 +138,35 @@ TEST(type_prop, space_to_batch_dynamic_shape_dynamic_rank) { ASSERT_EQ(space_to_batch->get_output_partial_shape(0), PartialShape::dynamic()); } +TEST(type_prop, space_to_batch_dynamic_rank_shape_block_and_pads_not_const) { + auto data = make_shared(element::f32, PartialShape::dynamic()); + auto block_shape = make_shared(element::i64, Shape{4}); + auto pads_begin = make_shared(element::i64, Shape{4}); + auto pads_end = make_shared(element::i64, Shape{4}); + + auto space_to_batch = make_shared(data, block_shape, pads_begin, pads_end); + + ASSERT_EQ(space_to_batch->get_element_type(), element::f32); + ASSERT_EQ(space_to_batch->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(type_prop, space_to_batch_default_ctor) { + auto data = make_shared(element::f32, PartialShape{{2, 5}, 100, {100, 1024}, 3}); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 2, 4, 
1}); + auto pads_begin = make_shared(element::i64, Shape{4}, vector{1, 1, 2, 0}); + auto pads_end = make_shared(element::i64, Shape{4}, vector{1, 1, 6, 0}); + + auto space_to_batch = make_shared(); + space_to_batch->set_arguments(OutputVector{data, block_shape, pads_begin, pads_end}); + space_to_batch->validate_and_infer_types(); + + EXPECT_EQ(space_to_batch->get_input_size(), 4); + EXPECT_EQ(space_to_batch->get_output_size(), 1); + EXPECT_EQ(space_to_batch->get_output_element_type(0), element::f32); + EXPECT_EQ(space_to_batch->get_output_partial_shape(0), + PartialShape({{2 * 2 * 4, 5 * 2 * 4}, (100 + 2) / 2, {(100 + 2 + 6) / 4, (1024 + 2 + 6) / 4}, 3})); +} + TEST(type_prop, space_to_batch_invalid_element_type_block_shape) { auto data = make_shared(element::f32, Shape{2, 128}); auto block_shape = make_shared(element::f32, Shape{2}, vector{1, 5}); diff --git a/src/core/tests/type_prop/space_to_depth.cpp b/src/core/tests/type_prop/space_to_depth.cpp index e20131500b321f..c190356a9c855f 100644 --- a/src/core/tests/type_prop/space_to_depth.cpp +++ b/src/core/tests/type_prop/space_to_depth.cpp @@ -8,6 +8,7 @@ using namespace std; using namespace ngraph; +using namespace testing; #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) @@ -47,25 +48,45 @@ TEST(type_prop, space_to_depth_output_shape_depth_first_5D) { ASSERT_EQ(space_to_depth->get_shape(), (Shape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); } +TEST(type_prop, space_to_depth_output_shape_depth_first_5D_1) { + auto a_shape = PartialShape{{1, 4}, {12, 36}, 1080, 1616}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); + const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; + auto space_to_depth = make_shared(A, mode, 1); + + EXPECT_EQ(space_to_depth->get_element_type(), element::f32); + EXPECT_EQ(space_to_depth->get_output_partial_shape(0), a_shape); + EXPECT_THAT(get_shape_labels(space_to_depth->get_output_partial_shape(0)), ElementsAre(10, 11, 12, 13)); +} + TEST(type_prop, space_to_depth_output_shape_when_space_is_static) { - auto A = make_shared(element::f32, PartialShape{{1, 4}, {12, 36}, 1080, 1616}); + auto a_shape = PartialShape{{1, 4}, {12, 36}, 1080, 1616}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; auto space_to_depth = make_shared(A, mode, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ(space_to_depth->get_output_partial_shape(0), + EXPECT_EQ(space_to_depth->get_element_type(), element::f32); + EXPECT_EQ(space_to_depth->get_output_partial_shape(0), (PartialShape{{1, 4}, {12 * 4, 36 * 4}, 1080 / 2, 1616 / 2})); + EXPECT_THAT(get_shape_labels(space_to_depth->get_output_partial_shape(0)), + ElementsAre(10, ov::no_label, ov::no_label, ov::no_label)); } TEST(type_prop, space_to_depth_output_shape_when_space_is_dynamic) { - auto A = make_shared(element::f32, PartialShape{{1, 4}, {12, 36}, {100, 1081}, {99, 1616}}); + auto a_shape = PartialShape{{1, 4}, {12, 36}, {100, 1081}, {99, 1616}}; + set_shape_labels(a_shape, 10); + auto A = make_shared(element::f32, a_shape); const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; auto space_to_depth = make_shared(A, mode, 2); - ASSERT_EQ(space_to_depth->get_element_type(), element::f32); - ASSERT_EQ( + EXPECT_EQ(space_to_depth->get_element_type(), element::f32); + EXPECT_EQ( space_to_depth->get_output_partial_shape(0), (PartialShape{{1, 4}, {12 * 4, 36 * 4}, {DIV_ROUND_UP(100, 2), 1081 / 
2}, {DIV_ROUND_UP(99, 2), 1616 / 2}})); + EXPECT_THAT(get_shape_labels(space_to_depth->get_output_partial_shape(0)), + ElementsAre(10, ov::no_label, ov::no_label, ov::no_label)); } TEST(type_prop, space_to_depth_dynamic_shape_static_rank) { @@ -86,6 +107,23 @@ TEST(type_prop, space_to_depth_dynamic_shape_dynamic_rank) { ASSERT_EQ(space_to_depth->get_output_partial_shape(0), PartialShape::dynamic()); } +TEST(type_prop, space_to_depth_default_ctor) { + auto A = make_shared(element::f64, PartialShape{{1, 4}, {12, 36}, 900, 3}); + + const auto space_to_depth = make_shared(); + space_to_depth->set_block_size(3); + space_to_depth->set_mode(op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST); + space_to_depth->set_argument(0, A); + space_to_depth->validate_and_infer_types(); + + EXPECT_EQ(space_to_depth->get_block_size(), 3); + EXPECT_EQ(space_to_depth->get_mode(), op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST); + EXPECT_EQ(space_to_depth->get_input_size(), 1); + EXPECT_EQ(space_to_depth->get_output_size(), 1); + EXPECT_EQ(space_to_depth->get_element_type(), element::f64); + EXPECT_EQ(space_to_depth->get_output_partial_shape(0), (PartialShape{{1, 4}, {12 * 9, 36 * 9}, 900 / 3, 3 / 3})); +} + TEST(type_prop, space_to_depth_input_rank_not_supported) { auto A = make_shared(element::f32, Shape{1, 8}); try { diff --git a/src/core/tests/type_prop/tile.cpp b/src/core/tests/type_prop/tile.cpp index db73b27a37f226..bb97e30ef52e02 100644 --- a/src/core/tests/type_prop/tile.cpp +++ b/src/core/tests/type_prop/tile.cpp @@ -152,12 +152,11 @@ class TileTest : public TypePropTileTest, public WithParamInterface= 0) { - repeats.insert(repeats.begin(), size_diff, 1); + if (labels.size() > repeats.size()) { + repeats.insert(repeats.begin(), labels.size() - repeats.size(), 1); } else { - labels.insert(labels.begin(), -size_diff, ov::no_label); + labels.insert(labels.begin(), repeats.size() - labels.size(), ov::no_label); } std::transform(labels.begin(), diff --git a/src/frontends/CMakeLists.txt b/src/frontends/CMakeLists.txt index b1ace92f3279b2..41902d02d24b4e 100644 --- a/src/frontends/CMakeLists.txt +++ b/src/frontends/CMakeLists.txt @@ -34,4 +34,4 @@ endif() if (ENABLE_OV_TF_LITE_FRONTEND) add_subdirectory(tensorflow_lite) -endif() \ No newline at end of file +endif() diff --git a/src/frontends/common/include/openvino/frontend/manager.hpp b/src/frontends/common/include/openvino/frontend/manager.hpp index 161d37ced06103..4968ef8bbf62e4 100644 --- a/src/frontends/common/include/openvino/frontend/manager.hpp +++ b/src/frontends/common/include/openvino/frontend/manager.hpp @@ -14,7 +14,7 @@ namespace ov { // Forward declaration -void FRONTEND_API shutdown(); +FRONTEND_API void shutdown(); namespace frontend { // -------------- FrontEndManager ----------------- using FrontEndFactory = std::function; diff --git a/src/frontends/pytorch/src/frontend.cpp b/src/frontends/pytorch/src/frontend.cpp index a53a55525437e8..90f93267524bfc 100644 --- a/src/frontends/pytorch/src/frontend.cpp +++ b/src/frontends/pytorch/src/frontend.cpp @@ -18,7 +18,9 @@ #include "transforms/append_list_unpack_replacer.hpp" #include "transforms/aten_cat_replacer.hpp" #include "transforms/aten_getitem_replacer.hpp" +#include "transforms/aten_index_replacer.hpp" #include "transforms/aten_stack_list_construct_replacer.hpp" +#include "transforms/einsum_list_construct.hpp" #include "transforms/listconstruct_replacer.hpp" #include "transforms/min_max_prim_list_construct_replacer.hpp" #include "transforms/prim_list_construct_pad.hpp" @@ -96,7 +98,9 @@ 
void FrontEnd::normalize(const std::shared_ptr& model) const { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/frontends/pytorch/src/node_context.cpp b/src/frontends/pytorch/src/node_context.cpp index d8bb94305d86a5..a3e8c81633a800 100644 --- a/src/frontends/pytorch/src/node_context.cpp +++ b/src/frontends/pytorch/src/node_context.cpp @@ -142,6 +142,11 @@ ngraph::Shape NodeContext::const_input(size_t index) const { return get_constant_at_input(*this, index)->cast_vector(); } +template <> +int32_t NodeContext::const_input(size_t index) const { + return get_constant_at_input(*this, index)->cast_vector()[0]; +} + template <> int64_t NodeContext::const_input(size_t index) const { return get_constant_at_input(*this, index)->cast_vector()[0]; diff --git a/src/frontends/pytorch/src/op/roi_align.cpp b/src/frontends/pytorch/src/op/roi_align.cpp new file mode 100644 index 00000000000000..d3a389c59654b9 --- /dev/null +++ b/src/frontends/pytorch/src/op/roi_align.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/roi_align.hpp" + +#include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/reshape.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +OutputVector translate_roi_align(NodeContext& context) { + num_inputs_check(context, 7, 7); + auto const_1 = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {1})); + auto const_neg_1 = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {-1})); + auto const_0 = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {0})); + auto const_rois_indices = context.mark_node(v0::Constant::create(element::i32, Shape{4}, {1, 2, 3, 4})); + + auto input = context.get_input(0); + auto boxes_input = context.get_input(1); + + auto input_real_type = context.mark_node(std::make_shared(input, element::f32)); + auto boxes = context.mark_node(std::make_shared(boxes_input, input_real_type)); + + auto spatial_scale = context.const_input(2); + int output_size_h = context.const_input(3); + int output_size_w = context.const_input(4); + int sampling_ratio = context.const_input(5); + + auto aligned = context.const_input(6); + + auto rois = context.mark_node(std::make_shared(boxes, const_rois_indices, const_1)); + + auto batch_indices_gather = context.mark_node(std::make_shared(boxes, const_0, const_1)); + auto batch_indices_reshape = + context.mark_node(std::make_shared(batch_indices_gather, const_neg_1, false)); + auto batch_indices = context.mark_node(std::make_shared(batch_indices_reshape, element::i32)); + + v9::ROIAlign::AlignedMode aligned_mode = + aligned ? 
v9::ROIAlign::AlignedMode::HALF_PIXEL_FOR_NN : v9::ROIAlign::AlignedMode::ASYMMETRIC; + + auto roi_align = context.mark_node(std::make_shared(input_real_type, + rois, + batch_indices, + output_size_h, + output_size_w, + sampling_ratio, + spatial_scale, + v9::ROIAlign::PoolingMode::AVG, + aligned_mode)); + + return {roi_align}; +}; + +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/op/upsample.cpp b/src/frontends/pytorch/src/op/upsample.cpp index 6c5b33d882c1a9..111a07a28c70e9 100644 --- a/src/frontends/pytorch/src/op/upsample.cpp +++ b/src/frontends/pytorch/src/op/upsample.cpp @@ -16,10 +16,12 @@ namespace op { using namespace ov::op; namespace { -OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpolate::InterpolateMode interpolate_mode) { - num_inputs_check(context, 3, 4); +OutputVector base_translate_upsample(const NodeContext& context, + v4::Interpolate::InterpolateMode interpolate_mode, + size_t dims) { + num_inputs_check(context, 1, 4); auto data = context.get_input(0); - std::vector pad{0}; + std::vector pad(dims, 0); auto size_mode = v4::Interpolate::ShapeCalcMode::SIZES; bool align_corners = false; int scale_id = 2; @@ -29,11 +31,21 @@ OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpola align_corners = context.const_input(2); } } - auto target_axes = std::make_shared(element::i32, Shape{2}, std::vector({2, 3})); + std::vector spatial_axes; + if (dims == 1) { + spatial_axes = {2}; + } else if (dims == 2) { + spatial_axes = {2, 3}; + } else if (dims == 3) { + spatial_axes = {2, 3, 4}; + } else { + FRONT_END_OP_CONVERSION_CHECK(false, "Unsupported number of dimensions in upsample"); + } + auto target_axes = std::make_shared(element::i32, Shape{spatial_axes.size()}, spatial_axes); auto scales = - context.mark_node(std::make_shared(element::f32, Shape{2}, std::vector({1, 1}))); + context.mark_node(std::make_shared(element::f32, Shape{dims}, std::vector(dims, 1))); auto output_sizes = - context.mark_node(std::make_shared(element::i32, Shape{2}, std::vector({1, 1}))); + context.mark_node(std::make_shared(element::i32, Shape{dims}, std::vector(dims, 1))); if (context.input_is_none(1)) { FRONT_END_OP_CONVERSION_CHECK(!context.input_is_none(scale_id), "Scale or Output size should be provided"); auto spatial_scales = context.get_input(scale_id); @@ -48,6 +60,7 @@ OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpola attrs.coordinate_transformation_mode = v4::Interpolate::CoordinateTransformMode::ASYMMETRIC; attrs.nearest_mode = v4::Interpolate::NearestMode::FLOOR; if (attrs.mode != v4::Interpolate::InterpolateMode::NEAREST) { + attrs.coordinate_transformation_mode = v4::Interpolate::CoordinateTransformMode::PYTORCH_HALF_PIXEL; if (align_corners) { attrs.coordinate_transformation_mode = v4::Interpolate::CoordinateTransformMode::ALIGN_CORNERS; } @@ -56,16 +69,33 @@ OutputVector base_translate_upsample2d(const NodeContext& context, v4::Interpola }; } // namespace +OutputVector translate_upsample_linear1d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX, 1); +}; + OutputVector translate_upsample_bilinear2d(NodeContext& context) { - return base_translate_upsample2d(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX); + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX, 2); +}; + +OutputVector translate_upsample_trilinear3d(NodeContext& 
context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::LINEAR_ONNX, 3); +}; + +OutputVector translate_upsample_nearest1d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::NEAREST, 1); }; OutputVector translate_upsample_nearest2d(NodeContext& context) { - return base_translate_upsample2d(context, v4::Interpolate::InterpolateMode::NEAREST); + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::NEAREST, 2); +}; + +OutputVector translate_upsample_nearest3d(NodeContext& context) { + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::NEAREST, 3); }; +// bicubic is only supported for 2d in pytorch OutputVector translate_upsample_bicubic2d(NodeContext& context) { - return base_translate_upsample2d(context, v4::Interpolate::InterpolateMode::CUBIC); + return base_translate_upsample(context, v4::Interpolate::InterpolateMode::CUBIC, 2); }; } // namespace op diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 098afbfc9a6a8c..bd2e9bf0564e7b 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -89,6 +89,7 @@ OP_CONVERTER(translate_repeat); OP_CONVERTER(translate_repeat_interleave); OP_CONVERTER(translate_reshape); OP_CONVERTER(translate_reshape_as); +OP_CONVERTER(translate_roi_align); OP_CONVERTER(translate_roll); OP_CONVERTER(translate_rsqrt); OP_CONVERTER(translate_rsub); @@ -110,7 +111,11 @@ OP_CONVERTER(translate_triu); OP_CONVERTER(translate_unfold); OP_CONVERTER(translate_upsample_bicubic2d); OP_CONVERTER(translate_upsample_bilinear2d); +OP_CONVERTER(translate_upsample_linear1d); +OP_CONVERTER(translate_upsample_nearest1d); OP_CONVERTER(translate_upsample_nearest2d); +OP_CONVERTER(translate_upsample_nearest3d); +OP_CONVERTER(translate_upsample_trilinear3d); OP_CONVERTER(translate_var); OP_CONVERTER(translate_var_mean); OP_CONVERTER(translate_where); @@ -303,7 +308,11 @@ const std::map get_supported_ops() { {"aten::unsqueeze_", op::inplace_op>}, {"aten::upsample_bicubic2d", op::translate_upsample_bicubic2d}, {"aten::upsample_bilinear2d", op::translate_upsample_bilinear2d}, + {"aten::upsample_linear1d", op::translate_upsample_linear1d}, + {"aten::upsample_nearest1d", op::translate_upsample_nearest1d}, {"aten::upsample_nearest2d", op::translate_upsample_nearest2d}, + {"aten::upsample_nearest3d", op::translate_upsample_nearest3d}, + {"aten::upsample_trilinear3d", op::translate_upsample_trilinear3d}, {"aten::var", op::translate_var}, {"aten::var_mean", op::translate_var_mean}, {"aten::view", op::translate_reshape}, @@ -319,6 +328,7 @@ const std::map get_supported_ops() { {"prim::NumToTensor", op::skip_node}, // In openvino we already store number as tensor with shape [] {"prim::requires_grad", op::return_false_scalar}, {"torchvision::nms", op::translate_nms}, + {"torchvision::roi_align", op::translate_roi_align}, }; }; diff --git a/src/frontends/pytorch/src/transforms/aten_index_replacer.cpp b/src/frontends/pytorch/src/transforms/aten_index_replacer.cpp new file mode 100644 index 00000000000000..7affc4511d028a --- /dev/null +++ b/src/frontends/pytorch/src/transforms/aten_index_replacer.cpp @@ -0,0 +1,271 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "aten_index_replacer.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/frontend/pytorch/visibility.hpp" +#include "openvino/op/add.hpp" +#include 
"openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/gather_elements.hpp" +#include "openvino/op/gather_nd.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/non_zero.hpp" +#include "openvino/op/reduce_prod.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/split.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/util/framework_node.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +using namespace ov::op; +namespace { + +std::shared_ptr flatten(const Output& value, size_t axis) { + // First dimension of output tensor is the product of [d_0, ... d_{axis-1}] dimensions of + // input tensor. The last dimension is the product of the rest of input tensor dimensions: + // [d_{axis}, ..., d_n] + Output output_shape; + if (axis == 0) { + output_shape = v0::Constant::create(element::i64, Shape{2}, {1, -1}); + } else if (axis == 1) { + output_shape = v0::Constant::create(element::i64, Shape{2}, {0, -1}); + } else { + const auto value_shape = std::make_shared(value); + const auto value_rank = std::make_shared(value_shape); + const auto axis_node = v0::Constant::create(element::i64, Shape{}, {axis}); + auto start = v0::Constant::create(element::i64, Shape{}, {0}); + auto step = v0::Constant::create(element::i64, Shape{}, {1}); + const auto first_part_dims = std::make_shared(value_shape, start, axis_node, step); + auto zero = v0::Constant::create(element::i64, {}, {0}); + auto first_part_dims_length = std::make_shared(first_part_dims, zero, true); + + auto remaining_part_length = v0::Constant::create(element::i64, {1}, {-1}); + + output_shape = std::make_shared(OutputVector{first_part_dims_length, remaining_part_length}, 0); + } + return std::make_shared(value, output_shape, true); +} +}; // namespace + +AtenIndexToSelect::AtenIndexToSelect() { + auto index_op = ov::pass::pattern::wrap_type(); + + ov::matcher_pass_callback callback = [](ov::pass::pattern::Matcher& m) { + auto index_op = cast_fw_node(m.get_match_root(), "aten::index"); + if (!index_op) { + return false; + } + auto input_node = index_op->input_value(0).get_node_shared_ptr(); + auto indicies = index_op->input_value(1).get_node_shared_ptr(); + auto list_indicies = cast_fw_node(indicies, "prim::ListConstruct"); + if (list_indicies) { + // Multiple tensors as indices. Each tensor could either be + // 1. prim::Constant() + // representing ":" in python indexing. E.g. tensor[:, :] + // 2. prim::Constant[value=...] or tensor output + // representing advanced indexing. E.g. tensor[[0, 1], [2, 0]]. + // For more info on advanced indexing, + // check https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing + + // Consider a general case of + // t: [x_1, y_1, y_2, ..., x_m, ..., y_n] + // where t is a tensor of rank m+n, {x_i} are axes where tensor index is provided, and {y_i} are axes for + // ":". Same results can be achieved through transposing t into + // t: [x_1, x_2, ..., x_m, y_1, y_2, ..., y_n] + // and use gather + // t: [x_1 * x_2 * ... * x_m, y_1 * y_2 * ... * y_n] + // tensor index = \sum_{i=1}^m (ind_i * \prod_{j=i+1}^m (x_j)) + // After gather, reshape and transpose back. 
+ auto ids = list_indicies->input_values(); + std::vector advanced_ids; + std::vector is_masked_bool; + OutputVector masked_indicies; + // for case when index is bool e.g. x[x>0], replace index with non_zero + for (size_t i = 0; i < ids.size(); i++) { + auto const_input = cast_fw_node(ids[i].get_node_shared_ptr(), "prim::Constant"); + + // skip dimensions where index is None + if (const_input) { + const auto& attrs = const_input->get_attrs(); + if (attrs.find("none_value") != attrs.end()) { + masked_indicies.push_back(ids[i]); + is_masked_bool.push_back(false); + continue; + } + } + auto id_dtype = ids[i].get_node_shared_ptr()->get_element_type(); + if (id_dtype == element::boolean || id_dtype == element::u8) { + auto idx = std::make_shared(ids[i], element::u8); + auto nonzero = std::make_shared(idx); + auto input_order = v0::Constant::create(element::i64, Shape{2}, {1, 0}); + auto masked_id = std::make_shared(nonzero, input_order); + masked_indicies.push_back(masked_id); + is_masked_bool.push_back(true); + } else { + masked_indicies.push_back(ids[i]); + is_masked_bool.push_back(false); + } + advanced_ids.push_back(i); + } + + // all indicies prim::Constant(None), return input as is + if (advanced_ids.size() == 0) { + copy_runtime_info({index_op, input_node}, input_node); + replace_node(index_op, input_node); + return true; + } + // perform gather for single element case + if (advanced_ids.size() == 1) { + auto index = masked_indicies[advanced_ids[0]]; + index = std::make_shared(index, element::i64); + if (is_masked_bool[advanced_ids[0]]) { + auto gather = std::make_shared(input_node, index); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + auto dim = v0::Constant::create(element::i64, Shape{}, {advanced_ids[0]}); + auto gather = std::make_shared(input_node, index, dim); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + auto adv_idx_count = advanced_ids.size(); + auto rank = input_node->get_input_partial_shape(0).rank(); + if (rank.is_dynamic()) { + FRONT_END_CHECK_IMPLEMENTED(false, "indexing for tensor with dynamic rank is not implemented "); + } + auto input_shape = std::make_shared(input_node); + auto zero = v0::Constant::create(element::i64, Shape{}, {0}); + auto input_dims = std::make_shared(input_shape, zero, rank.get_length()); + std::vector non_used_dims; + for (auto i = 0; i < rank.get_length(); i++) { + if (std::find(advanced_ids.begin(), advanced_ids.end(), i) == advanced_ids.end()) { + non_used_dims.push_back(i); + } + } + std::vector permutation_dims; + permutation_dims.insert(permutation_dims.end(), advanced_ids.begin(), advanced_ids.end()); + permutation_dims.insert(permutation_dims.end(), non_used_dims.begin(), non_used_dims.end()); + auto transpose_dims = v0::Constant::create(element::i64, Shape{permutation_dims.size()}, permutation_dims); + auto transposed_input = std::make_shared(input_node, transpose_dims); + auto flatten_input = flatten(transposed_input, adv_idx_count); + auto cum_adv_index = masked_indicies[advanced_ids[adv_idx_count - 1]]; + auto multiplier = input_dims->output(advanced_ids[adv_idx_count - 1]); + for (int i = static_cast(adv_idx_count) - 2; i > 0; i--) { + auto adv_index = std::make_shared(masked_indicies[i], multiplier); + cum_adv_index = std::make_shared(cum_adv_index, adv_index); + auto input_id = advanced_ids[i]; + multiplier = std::make_shared(multiplier, input_dims->output(input_id)); + } + std::shared_ptr 
gather = std::make_shared(flatten_input, cum_adv_index, zero); + OutputVector concat_dims; + // check if all advanced indices are consecutive. + std::vector consequence_dims; + auto cum_adv_index_shape_tensor = std::make_shared(cum_adv_index); + for (size_t i = advanced_ids[0]; i <= advanced_ids[advanced_ids.size() - 1]; i++) { + consequence_dims.push_back(i); + } + // unfold regular index axes + if (advanced_ids == consequence_dims) { + OutputVector folded_adv_idx_shape_vector; + auto minus_one = v0::Constant::create(element::i64, Shape{1}, {-1}); + folded_adv_idx_shape_vector.push_back(minus_one); + for (auto i : non_used_dims) { + folded_adv_idx_shape_vector.push_back(input_dims->output(i)); + } + auto folded_adv_idx_shape = std::make_shared(folded_adv_idx_shape_vector, 0); + gather = std::make_shared(gather, folded_adv_idx_shape, false); + std::vector adv_idx_permute; + for (size_t i = 1; i < advanced_ids[0] + 1; i++) { + adv_idx_permute.push_back(i); + } + adv_idx_permute.push_back(0); + for (size_t i = advanced_ids[0] + 1; i < (rank.get_length() - adv_idx_count + 1); i++) { + adv_idx_permute.push_back(i); + } + // Transpose folded advanced indexed axis to its original location. + auto permute_indicies = + v0::Constant::create(element::i64, Shape{adv_idx_permute.size()}, adv_idx_permute); + gather = std::make_shared(gather, permute_indicies); + // unfold advanced index axes + for (size_t i = 0; i <= advanced_ids[0]; i++) { + concat_dims.push_back(input_dims->output(i)); + } + concat_dims.push_back(cum_adv_index_shape_tensor); + for (auto i : non_used_dims) { + if (i < advanced_ids[i]) { + continue; + } + concat_dims.push_back(input_dims->output(i)); + } + + } else { + concat_dims.push_back(cum_adv_index_shape_tensor); + for (auto i : non_used_dims) { + concat_dims.push_back(input_dims->output(i)); + } + } + auto final_shape = std::make_shared(concat_dims, 0); + gather = std::make_shared(gather, final_shape, false); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + + } else { + auto const_input = cast_fw_node(indicies, "prim::Constant"); + + if (const_input) { + // index is None, stay input as is + const auto& attrs = const_input->get_attrs(); + if (attrs.find("none_value") != attrs.end()) { + copy_runtime_info({index_op, input_node, indicies}, input_node); + replace_node(index_op, input_node); + return true; + } + } + auto index_dtype = indicies->get_output_element_type(0); + if (index_dtype == element::boolean || index_dtype == element::u8) { + auto nonzero = std::make_shared(indicies); + auto input_order = v0::Constant::create(element::i64, Shape{2}, {1, 0}); + auto masked_id = std::make_shared(nonzero, input_order); + auto gather = std::make_shared(input_node, masked_id); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + if (index_dtype != element::i32 && index_dtype != element::i64) { + indicies = std::make_shared(indicies, element::i64); + } + auto dim = v0::Constant::create(element::i64, Shape{}, {0}); + auto gather = std::make_shared(input_node, indicies, dim); + copy_runtime_info({index_op, input_node, indicies}, gather); + replace_node(index_op, gather); + return true; + } + return false; + }; + + auto m = std::make_shared(index_op, "ov::frontend::pytorch::pass::AtenIndexToSelect"); + this->register_matcher(m, callback); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git 
a/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp b/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp new file mode 100644 index 00000000000000..84f6133253aea6 --- /dev/null +++ b/src/frontends/pytorch/src/transforms/aten_index_replacer.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/frontend/pytorch/visibility.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +// This transformation replaces pattern prim::ListConstruct->aten::index +class PYTORCH_API AtenIndexToSelect : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenIndexToSelect"); + AtenIndexToSelect(); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/transforms/einsum_list_construct.cpp b/src/frontends/pytorch/src/transforms/einsum_list_construct.cpp new file mode 100644 index 00000000000000..96881ebcbb25e0 --- /dev/null +++ b/src/frontends/pytorch/src/transforms/einsum_list_construct.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "einsum_list_construct.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/einsum.hpp" +#include "openvino/op/util/framework_node.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "utils.hpp" + +using namespace ov::pass::pattern; + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +using namespace ov::pass; +using namespace ov::op; + +AtenEinsumListConstructReplacer::AtenEinsumListConstructReplacer() { + auto einsum_op = pattern::wrap_type(); + ov::matcher_pass_callback callback = [](pattern::Matcher& m) { + auto einsum_op = cast_fw_node(m.get_match_root(), "aten::einsum"); + if (!einsum_op) { + return false; + } + auto equation_input = einsum_op->input_value(0).get_node_shared_ptr(); + auto tensor_list = einsum_op->input_value(1).get_node_shared_ptr(); + std::string equation; + // equation should be string constant + if (const auto& fw_node_mode = cast_fw_node(equation_input, "prim::Constant")) { + const auto& attrs = fw_node_mode->get_attrs(); + if (attrs.find("string_value") != attrs.end()) { + equation = attrs.at("string_value"); + } + } else { + return false; + } + // Check if ListConstruct is an input + if (auto list_construct_node = cast_fw_node(tensor_list, "prim::ListConstruct")) { + const auto& list_inputs = list_construct_node->input_values(); + OutputVector node_vector; + // Iterate over values in ListConstruct + for (const auto& list_input : list_inputs) { + node_vector.push_back(list_input); + } + + auto einsum = std::make_shared(node_vector, equation); + copy_runtime_info({einsum_op, equation_input, tensor_list}, einsum); + replace_node(einsum_op, einsum); + return true; + } + return false; + }; + + auto m = + std::make_shared(einsum_op, "ov::frontend::pytorch::pass::AtenEinsumListConstructReplacer"); + this->register_matcher(m, callback); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp b/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp new file mode 100644 index 
00000000000000..af2ac9b5301129 --- /dev/null +++ b/src/frontends/pytorch/src/transforms/einsum_list_construct.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace pass { + +class AtenEinsumListConstructReplacer : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ov::frontend::pytorch::pass::AtenEinsumListConstructReplacer"); + AtenEinsumListConstructReplacer(); +}; + +} // namespace pass +} // namespace pytorch +} // namespace frontend +} // namespace ov \ No newline at end of file diff --git a/src/frontends/tensorflow/src/frontend.cpp b/src/frontends/tensorflow/src/frontend.cpp index 0a8fd6abaa20e9..1a7a6c2ae887ac 100644 --- a/src/frontends/tensorflow/src/frontend.cpp +++ b/src/frontends/tensorflow/src/frontend.cpp @@ -58,11 +58,12 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { if (variants.size() != 1) return false; - // Validating first path, it must contain a model if (variants[0].is()) { - std::string suffix = ".pb"; std::string model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix.c_str())) { + if (ov::util::ends_with(model_path, ".pb") && GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format + // for automatic deduction of the frontend to convert the model + // we have more strict rule that is to have `.pb` extension in the path return true; } } @@ -70,12 +71,16 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { else if (variants[0].is()) { std::wstring suffix = L".pb"; std::wstring model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix)) { + if (ov::util::ends_with(model_path, suffix) && GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format with a path in Unicode + // for automatic deduction of the frontend to convert the model + // we have more strict rule that is to have `.pb` extension in the path return true; } } #endif else if (variants[0].is()) { + // this is used for OpenVINO with TensorFlow Integration return true; } return false; @@ -83,33 +88,36 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { ov::frontend::InputModel::Ptr FrontEnd::load_impl(const std::vector& variants) const { // TODO: Support other TensorFlow formats: SavedModel, .meta, checkpoint, pbtxt - if (variants.size() == 1) { - // a case when binary protobuf format is provided - if (variants[0].is()) { - std::string suffix = ".pb"; - std::string model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix.c_str())) { - return std::make_shared( - std::make_shared<::ov::frontend::tensorflow::GraphIteratorProto>(model_path), - m_telemetry); - } + FRONT_END_GENERAL_CHECK(variants.size() == 1, + "[TensorFlow Frontend] Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."); + + if (variants[0].is()) { + auto model_path = variants[0].as(); + if (GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format + return std::make_shared(std::make_shared(model_path), m_telemetry); } + } #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) - else if (variants[0].is()) { - std::wstring suffix = L".pb"; - std::wstring model_path = variants[0].as(); - if (ov::util::ends_with(model_path, suffix)) { - return std::make_shared( - 
std::make_shared<::ov::frontend::tensorflow::GraphIteratorProto>(model_path), - m_telemetry); - } + else if (variants[0].is()) { + std::wstring model_path = variants[0].as(); + if (GraphIteratorProto::is_supported(model_path)) { + // handle binary protobuf format with a path in Unicode + return std::make_shared(std::make_shared(model_path), m_telemetry); } + } #endif - else if (variants[0].is()) { - auto graph_iterator = variants[0].as(); - return std::make_shared(graph_iterator, m_telemetry); - } + else if (variants[0].is()) { + // this is used for OpenVINO with TensorFlow Integration + auto graph_iterator = variants[0].as(); + return std::make_shared(graph_iterator, m_telemetry); } + + FRONT_END_GENERAL_CHECK(false, + "[TensorFlow Frontend] Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."); + return nullptr; } diff --git a/src/frontends/tensorflow/src/graph_iterator_proto.hpp b/src/frontends/tensorflow/src/graph_iterator_proto.hpp index c2b08ebe9de923..1fa836e3b036e1 100644 --- a/src/frontends/tensorflow/src/graph_iterator_proto.hpp +++ b/src/frontends/tensorflow/src/graph_iterator_proto.hpp @@ -88,29 +88,40 @@ class GraphIteratorProto : public GraphIterator { } } - /// Set iterator to the start position + /// \brief Check if the input file is supported + template + static bool is_supported(const std::basic_string& path) { + std::ifstream pb_stream(path, std::ios::in | std::ifstream::binary); + auto graph_def = std::make_shared<::tensorflow::GraphDef>(); + return pb_stream && pb_stream.is_open() && graph_def->ParsePartialFromIstream(&pb_stream); + } + + /// \brief Set iterator to the start position void reset() override { node_index = 0; } + /// \brief Return a number of nodes in the graph size_t size() const override { return m_decoders.size(); } - /// Moves to the next node in the graph + /// \brief Move to the next node in the graph void next() override { node_index++; } + /// \brief Check if the graph is fully traversed bool is_end() const override { return node_index >= m_decoders.size(); } - /// Return NodeContext for the current node that iterator points to + /// \brief Return NodeContext for the current node that iterator points to std::shared_ptr get_decoder() const override { return m_decoders[node_index]; } + /// \brief Get GraphIterator for library funnction by name std::shared_ptr get_body_graph_iterator(const std::string& func_name) const override { if (m_library_map.count(func_name)) { auto func_ind = m_library_map.at(func_name); @@ -127,10 +138,12 @@ class GraphIteratorProto : public GraphIterator { return nullptr; } + /// \brief Get input names in the original order. Used for the library functions std::vector get_input_names() const override { return m_input_names; } + /// \brief Get output names in the original order. 
Used for the library functions std::vector get_output_names() const override { return m_output_names; } diff --git a/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp b/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp index aed41eea03e94c..fa5c819b88b386 100644 --- a/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp +++ b/src/frontends/tensorflow_common/src/op/fake_quant_min_max_vars.cpp @@ -18,41 +18,44 @@ OutputVector translate_fake_quant_op(const NodeContext& node) { auto min = node.get_input(1); auto max = node.get_input(2); - auto narrow_range = node.get_attribute("narrow_range"); - auto num_bits = node.get_attribute("num_bits"); + // retrieve attributes + auto narrow_range = node.get_attribute("narrow_range", false); + auto num_bits = node.get_attribute("num_bits", 8); - size_t levels = static_cast(std::pow(2, num_bits) - int(narrow_range)); - auto min_less_max = make_shared(min, max); - auto minimum = make_shared(min_less_max, max, min); + size_t levels = static_cast(pow(2, num_bits)); + levels = narrow_range ? levels - 1 : levels; - auto zero = make_shared(min.get_element_type(), Shape{}, std::vector({0})); + // compute real min and max values + Output minimum = make_shared(min, max); + Output maximum = make_shared(min, max); + // adjust min and max so that min <= 0 + auto zero = make_shared(min.get_element_type(), Shape{}, 0); auto min_greater_zero = make_shared(minimum, zero); - auto max_minus_min = make_shared(maximum, minimum); + Output max_minus_min = make_shared(maximum, minimum); minimum = make_shared(min_greater_zero, max_minus_min, maximum); + // adjust min and max so that 0 <= max auto max_less_zero = make_shared(maximum, zero); auto min_minus_max = make_shared(minimum, maximum); minimum = make_shared(max_less_zero, zero, maximum); - auto float_range = make_shared(maximum, minimum); - auto quant_min_value = int(narrow_range); - auto quant_max_value = std::pow(2, num_bits) - 1; - auto value = static_cast(quant_max_value - quant_min_value); - auto int_range = make_shared(element::f32, Shape{}, std::vector({value})); - auto scale = make_shared(float_range, int_range); + // adjust min and max so that scale = (max - min) / (2^num_bits - 1), + // min_adj = scale * round(min / scale) and max_adj = max + min_adj - min + max_minus_min = make_shared(maximum, minimum); + auto const_levels = make_shared(element::f32, Shape{}, static_cast(levels - 1)); + auto scale = make_shared(max_minus_min, const_levels); auto descaled_min = make_shared(minimum, scale); auto rounded_descaled_min = make_shared(descaled_min, Round::RoundMode::HALF_TO_EVEN); auto min_adj = make_shared(scale, rounded_descaled_min); auto adjustment = make_shared(min_adj, minimum); auto max_adj = make_shared(maximum, adjustment); - auto res = make_shared(inputs, min_adj, max_adj, min_adj, max_adj, levels); - set_node_name(node.get_name(), res); - return {res}; + auto fake_quantize = make_shared(inputs, min_adj, max_adj, min_adj, max_adj, levels); + set_node_name(node.get_name(), fake_quantize); + return {fake_quantize}; } } // namespace op } // namespace tensorflow diff --git a/src/frontends/tensorflow_common/src/op/identity.cpp b/src/frontends/tensorflow_common/src/op/identity.cpp index 4dbc9e285a892b..7bd6d7735e37fb 100644 --- a/src/frontends/tensorflow_common/src/op/identity.cpp +++ b/src/frontends/tensorflow_common/src/op/identity.cpp @@ -14,13 +14,12 @@ namespace tensorflow { namespace op { OutputVector translate_identity_op(const NodeContext& node) { + vector 
supported_ops = {"Identity", "PreventGradient", "Snapshot", "StopGradient"}; + default_op_checks(node, 1, supported_ops); auto input = node.get_input(0); - // since the input node can have several outputs, and identity have only one input, - // we cannot use set_node_name(..) helper, we have to set names for output connected - // to this identity only. - // Node_1 -> Node_2 - // -(identity name) -> Identity + // set only tensor names + // no need to change node name since Identity node is skipped set_out_name(node.get_name(), input); set_out_name(node.get_name() + ":" + "0", input); return {input}; diff --git a/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp b/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp index ad94a8f03cda13..b849c06f2594c6 100644 --- a/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp +++ b/src/frontends/tensorflow_lite/src/op/op_translation_utils.cpp @@ -78,13 +78,21 @@ void get_activation(ov::OutputVector& output, output = ov::frontend::tensorflow::op::translate_relu_6_op(context); } else if (activation == "TANH") { output = ov::frontend::tensorflow::op::translate_unary_op(context); + } else if (activation == "RELU_N1_TO_1") { + auto clamp = std::make_shared(output[0], -1.0f, 1.0f); + clamp->set_friendly_name(context.get_name()); + output = clamp->outputs(); + } else if (activation == "SIGN_BIT") { + auto zero = std::make_shared(opset10::Constant::create(element::i32, {}, {0}), output[0]); + auto less = std::make_shared(output[0], zero); + less->set_friendly_name(context.get_name()); + output = less->outputs(); } else { - // TODO: Fused activation to support: - // RELU_N1_TO_1 = 2, - // SIGN_BIT = 5, - if (activation != "NONE") { - FRONT_END_THROW("Unknown Activation fused to " + node.get_decoder()->get_op_type() + ": " + activation); - } + FRONT_END_GENERAL_CHECK(activation == "NONE", + "Unknown Activation fused to ", + node.get_decoder()->get_op_type(), + ": ", + activation); } del_output_names(output); } diff --git a/src/inference/CMakeLists.txt b/src/inference/CMakeLists.txt index 03ded6608790f8..f3f436e57d50e1 100644 --- a/src/inference/CMakeLists.txt +++ b/src/inference/CMakeLists.txt @@ -14,6 +14,7 @@ file (GLOB LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/dev/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/dev/preprocessing/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/dev/threading/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/threading/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp_interfaces/interface/*.cpp diff --git a/src/inference/dev_api/ie_system_conf.h b/src/inference/dev_api/ie_system_conf.h index 17f1781c13ffab..408c626accf436 100644 --- a/src/inference/dev_api/ie_system_conf.h +++ b/src/inference/dev_api/ie_system_conf.h @@ -12,7 +12,7 @@ #include #include -#include "ie_api.h" +#include "openvino/runtime/system_conf.hpp" namespace InferenceEngine { @@ -23,7 +23,9 @@ namespace InferenceEngine { * @param[in] includeOMPNumThreads Indicates if the omp number threads is included * @return `True` if any OpenMP environment variable is defined, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) checkOpenMpEnvVars(bool includeOMPNumThreads = true); +inline bool checkOpenMpEnvVars(bool includeOMPNumThreads = true) { + return ov::check_open_mp_env_vars(includeOMPNumThreads); +} /** * @brief Returns available CPU NUMA nodes (on Linux, and Windows [only with TBB], single node is assumed on all @@ -31,7 +33,9 @@ INFERENCE_ENGINE_API_CPP(bool) checkOpenMpEnvVars(bool 
includeOMPNumThreads = tr * @ingroup ie_dev_api_system_conf * @return NUMA nodes */ -INFERENCE_ENGINE_API_CPP(std::vector) getAvailableNUMANodes(); +inline std::vector getAvailableNUMANodes() { + return ov::get_available_numa_nodes(); +} /** * @brief Returns available CPU cores types (on Linux, and Windows) and ONLY with TBB, single core type is assumed @@ -39,7 +43,9 @@ INFERENCE_ENGINE_API_CPP(std::vector) getAvailableNUMANodes(); * @ingroup ie_dev_api_system_conf * @return Vector of core types */ -INFERENCE_ENGINE_API_CPP(std::vector) getAvailableCoresTypes(); +inline std::vector getAvailableCoresTypes() { + return ov::get_available_cores_types(); +} /** * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance @@ -50,7 +56,9 @@ INFERENCE_ENGINE_API_CPP(std::vector) getAvailableCoresTypes(); * @param[in] bigCoresOnly Additionally limits the number of reported cores to the 'Big' cores only. * @return Number of physical CPU cores. */ -INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(bool bigCoresOnly = false); +inline int getNumberOfCPUCores(bool bigCoresOnly = false) { + return ov::get_number_of_cpu_cores(bigCoresOnly); +} /** * @brief Returns number of CPU logical cores on Linux/Windows (on other OSes it simply relies on the original @@ -60,80 +68,81 @@ INFERENCE_ENGINE_API_CPP(int) getNumberOfCPUCores(bool bigCoresOnly = false); * @param[in] bigCoresOnly Additionally limits the number of reported cores to the 'Big' cores only. * @return Number of logical CPU cores. */ -INFERENCE_ENGINE_API_CPP(int) getNumberOfLogicalCPUCores(bool bigCoresOnly = false); +inline int getNumberOfLogicalCPUCores(bool bigCoresOnly = false) { + return ov::get_number_of_logical_cpu_cores(bigCoresOnly); +} /** * @brief Checks whether CPU supports SSE 4.2 capability * @ingroup ie_dev_api_system_conf * @return `True` is SSE 4.2 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_sse42(); +using ov::with_cpu_x86_sse42; /** * @brief Checks whether CPU supports AVX capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx(); +using ov::with_cpu_x86_avx; /** * @brief Checks whether CPU supports AVX2 capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX2 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx2(); +using ov::with_cpu_x86_avx2; /** * @brief Checks whether CPU supports AVX 512 capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX512F (foundation) instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512f(); +using ov::with_cpu_x86_avx512f; /** * @brief Checks whether CPU supports AVX 512 capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX512F, AVX512BW, AVX512DQ instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core(); +using ov::with_cpu_x86_avx512_core; /** * @brief Checks whether CPU supports AVX 512 VNNI capability * @ingroup ie_dev_api_system_conf * @return `True` is AVX512F, AVX512BW, AVX512DQ, AVX512_VNNI instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_vnni(); +using ov::with_cpu_x86_avx512_core_vnni; /** * @brief Checks whether CPU supports BFloat16 capability * @ingroup ie_dev_api_system_conf * @return `True` is tAVX512_BF16 instructions are available, `false` 
otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_bfloat16(); +using ov::with_cpu_x86_bfloat16; /** * @brief Checks whether CPU supports AMX int8 capability * @ingroup ie_dev_api_system_conf * @return `True` is tAMX_INT8 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx_int8(); +using ov::with_cpu_x86_avx512_core_amx_int8; /** * @brief Checks whether CPU supports AMX bf16 capability * @ingroup ie_dev_api_system_conf * @return `True` is tAMX_BF16 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx_bf16(); +using ov::with_cpu_x86_avx512_core_amx_bf16; /** * @brief Checks whether CPU supports AMX capability * @ingroup ie_dev_api_system_conf * @return `True` is tAMX_INT8 or tAMX_BF16 instructions are available, `false` otherwise */ -INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx(); +using ov::with_cpu_x86_avx512_core_amx; /** - * @enum column_of_processor_type_table * @brief This enum contains defination of each columns in processor type table which bases on cpu core types. Will * extend to support other CPU core type like ARM. * @@ -150,16 +159,9 @@ INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core_amx(); * ALL_PROC | MAIN_CORE_PROC | EFFICIENT_CORE_PROC | HYPER_THREADING_PROC * 32 8 16 8 // Total number of one socket */ -typedef enum { - ALL_PROC = 0, //!< All processors, regardless of backend cpu - MAIN_CORE_PROC = 1, //!< Processor based on physical core of Intel Performance-cores - EFFICIENT_CORE_PROC = 2, //!< Processor based on Intel Efficient-cores - HYPER_THREADING_PROC = 3, //!< Processor based on logical core of Intel Performance-cores - PROC_TYPE_TABLE_SIZE = 4 //!< Size of processor type table -} column_of_processor_type_table; +using ov::ColumnOfProcessorTypeTable; /** - * @enum column_of_cpu_mapping_table * @brief This enum contains defination of each columns in CPU mapping table which use processor id as index. * * GROUP_ID is generated according to the following rules. @@ -181,14 +183,6 @@ typedef enum { * 6 0 4 2 2 0 * 7 0 5 2 2 0 */ -typedef enum { - CPU_MAP_PROCESSOR_ID = 0, //!< column for processor id of the processor - CPU_MAP_SOCKET_ID = 1, //!< column for socket id of the processor - CPU_MAP_CORE_ID = 2, //!< column for hardware core id of the processor - CPU_MAP_CORE_TYPE = 3, //!< column for CPU core type corresponding to the processor - CPU_MAP_GROUP_ID = 4, //!< column for group id to the processor. Processors in one group have dependency. 
- CPU_MAP_USED_FLAG = 5, //!< column for resource management of the processor - CPU_MAP_TABLE_SIZE = 6 //!< Size of CPU mapping table -} column_of_cpu_mapping_table; +using ov::ColumnOfCPUMappingTable; } // namespace InferenceEngine diff --git a/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp b/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp index 687b05030cd566..628c2c651f85a9 100644 --- a/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp +++ b/src/inference/dev_api/openvino/runtime/iasync_infer_request.hpp @@ -17,7 +17,7 @@ #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/profiling_info.hpp" #include "openvino/runtime/tensor.hpp" -#include "threading/ie_itask_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" namespace ov { @@ -37,8 +37,8 @@ namespace ov { class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { public: IAsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor); + const std::shared_ptr& task_executor, + const std::shared_ptr& callback_executor); ~IAsyncInferRequest(); /** @@ -153,7 +153,7 @@ class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { const std::vector>& get_outputs() const override; protected: - using Stage = std::pair; + using Stage = std::pair, ov::threading::Task>; /** * @brief Pipeline is vector of stages */ @@ -212,11 +212,11 @@ class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { void run_first_stage(const Pipeline::iterator itBeginStage, const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor = {}); + const std::shared_ptr callbackExecutor = {}); - InferenceEngine::Task make_next_stage_task(const Pipeline::iterator itStage, - const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor); + ov::threading::Task make_next_stage_task(const Pipeline::iterator itStage, + const Pipeline::iterator itEndStage, + const std::shared_ptr callbackExecutor); template void infer_impl(const F& f) { @@ -264,10 +264,10 @@ class OPENVINO_RUNTIME_API IAsyncInferRequest : public IInferRequest { std::shared_ptr m_sync_request; - InferenceEngine::ITaskExecutor::Ptr m_request_executor; //!< Used to run inference CPU tasks. - InferenceEngine::ITaskExecutor::Ptr + std::shared_ptr m_request_executor; //!< Used to run inference CPU tasks. 
+ std::shared_ptr m_callback_executor; //!< Used to run post inference callback in asynchronous pipline - InferenceEngine::ITaskExecutor::Ptr + std::shared_ptr m_sync_callback_executor; //!< Used to run post inference callback in synchronous pipline mutable std::mutex m_mutex; std::function m_callback; diff --git a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp index 44c701c1d42be8..c95feba6cc1042 100644 --- a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp +++ b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp @@ -17,8 +17,8 @@ #include "openvino/runtime/common.hpp" #include "openvino/runtime/isync_infer_request.hpp" #include "openvino/runtime/remote_context.hpp" -#include "threading/ie_cpu_streams_executor.hpp" -#include "threading/ie_itask_executor.hpp" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" namespace InferenceEngine { class ICompiledModelWrapper; @@ -47,14 +47,13 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< * * @param callback_executor Callback executor (CPUStreamsExecutor by default) */ - ICompiledModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor = - std::make_shared(InferenceEngine::IStreamsExecutor::Config{ - "Default"}), - const InferenceEngine::ITaskExecutor::Ptr& callback_executor = - std::make_shared(InferenceEngine::IStreamsExecutor::Config{ - "Callback"})); + ICompiledModel( + const std::shared_ptr& model, + const std::shared_ptr& plugin, + const std::shared_ptr& task_executor = + std::make_shared(ov::threading::IStreamsExecutor::Config{"Default"}), + const std::shared_ptr& callback_executor = + std::make_shared(ov::threading::IStreamsExecutor::Config{"Callback"})); /** * @brief Gets all outputs from compiled model @@ -119,8 +118,8 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< std::vector> m_inputs; std::vector> m_outputs; - InferenceEngine::ITaskExecutor::Ptr m_task_executor = nullptr; //!< Holds a task executor - InferenceEngine::ITaskExecutor::Ptr m_callback_executor = nullptr; //!< Holds a callback executor + std::shared_ptr m_task_executor = nullptr; //!< Holds a task executor + std::shared_ptr m_callback_executor = nullptr; //!< Holds a callback executor friend ov::CoreImpl; friend ov::IExecutableNetworkWrapper; @@ -146,7 +145,7 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< /** * @brief Default implementation of create async inter request method * - * @tparam AsyncInferRequestType Async infer request type. InferenceEngine::AsyncInferRequestThreadSafeDefault by + * @tparam AsyncInferRequestType Async infer request type. 
ov::IAsyncInferRequest by * default * * @return Asynchronous infer request @@ -163,8 +162,8 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< * @return OpenVINO Plugin interface */ const std::shared_ptr& get_plugin() const; - const InferenceEngine::ITaskExecutor::Ptr get_task_executor() const; - const InferenceEngine::ITaskExecutor::Ptr get_callback_executor() const; + const std::shared_ptr get_task_executor() const; + const std::shared_ptr get_callback_executor() const; }; } // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/iplugin.hpp b/src/inference/dev_api/openvino/runtime/iplugin.hpp index 47f576b46bfd52..e2a4b4110d3514 100644 --- a/src/inference/dev_api/openvino/runtime/iplugin.hpp +++ b/src/inference/dev_api/openvino/runtime/iplugin.hpp @@ -19,7 +19,7 @@ #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/icore.hpp" #include "openvino/runtime/remote_context.hpp" -#include "threading/ie_executor_manager.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" namespace InferenceEngine { @@ -188,7 +188,7 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this& get_executor_manager() const; + const std::shared_ptr& get_executor_manager() const; ~IPlugin() = default; @@ -198,11 +198,11 @@ class OPENVINO_RUNTIME_API IPlugin : public std::enable_shared_from_this m_core; //!< A pointer to ICore interface - std::shared_ptr m_executor_manager; //!< A tasks execution manager - ov::Version m_version; //!< Member contains plugin version - bool m_is_new_api; //!< A flag which shows used API + std::string m_plugin_name; //!< A device name that plugins enables + std::weak_ptr m_core; //!< A pointer to ICore interface + std::shared_ptr m_executor_manager; //!< A tasks execution manager + ov::Version m_version; //!< Member contains plugin version + bool m_is_new_api; //!< A flag which shows used API }; } // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp b/src/inference/dev_api/openvino/runtime/system_conf.hpp new file mode 100644 index 00000000000000..216d059ed357d2 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp @@ -0,0 +1,193 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief Abstraction over platform specific implementations + * @file openvino/runtime/system_conf.hpp + */ + +#pragma once + +#include + +#include "openvino/runtime/common.hpp" + +namespace ov { + +/** + * @brief Checks whether OpenMP environment variables are defined + * @ingroup ov_dev_api_system_conf + * + * @param[in] include_omp_num_threads Indicates if the omp number threads is included + * @return `True` if any OpenMP environment variable is defined, `false` otherwise + */ +OPENVINO_RUNTIME_API bool check_open_mp_env_vars(bool include_omp_num_threads = true); + +/** + * @brief Returns available CPU NUMA nodes (on Linux, and Windows [only with TBB], single node is assumed on all + * other OSes) + * @ingroup ov_dev_api_system_conf + * @return NUMA nodes + */ +OPENVINO_RUNTIME_API std::vector get_available_numa_nodes(); + +/** + * @brief Returns available CPU cores types (on Linux, and Windows) and ONLY with TBB, single core type is assumed + * otherwise + * @ingroup ov_dev_api_system_conf + * @return Vector of core types + */ +OPENVINO_RUNTIME_API std::vector get_available_cores_types(); + +/** + * @brief Returns number of CPU physical cores on Linux/Windows (which is considered to be more performance + * 
friendly for servers) (on other OSes it simply relies on the original parallel API of choice, which usually uses the + * logical cores). Call the function with 'false' to get the number of physical cores of all types; call it with 'true' to get the number of physical + * 'Big' cores; the number of 'Little' cores = 'all' - 'Big'. + * @ingroup ov_dev_api_system_conf + * @param[in] big_cores_only Additionally limits the number of reported cores to the 'Big' cores only. + * @return Number of physical CPU cores. + */ +OPENVINO_RUNTIME_API int get_number_of_cpu_cores(bool big_cores_only = false); + +/** + * @brief Returns number of CPU logical cores on Linux/Windows (on other OSes it simply relies on the original + * parallel API of choice, which uses the 'all' logical cores). Call the function with 'false' to get the number of logical cores of + * all types; call it with 'true' to get the number of logical 'Big' cores; the number of 'Little' cores = 'all' - 'Big'. + * @ingroup ov_dev_api_system_conf + * @param[in] big_cores_only Additionally limits the number of reported cores to the 'Big' cores only. + * @return Number of logical CPU cores. + */ +OPENVINO_RUNTIME_API int get_number_of_logical_cpu_cores(bool big_cores_only = false); + +/** + * @brief Checks whether CPU supports SSE 4.2 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if SSE 4.2 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_sse42(); + +/** + * @brief Checks whether CPU supports AVX capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx(); + +/** + * @brief Checks whether CPU supports AVX2 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX2 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx2(); + +/** + * @brief Checks whether CPU supports AVX 512 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX512F (foundation) instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512f(); + +/** + * @brief Checks whether CPU supports AVX 512 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX512F, AVX512BW, AVX512DQ instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core(); + +/** + * @brief Checks whether CPU supports AVX 512 VNNI capability + * @ingroup ov_dev_api_system_conf + * @return `True` if AVX512F, AVX512BW, AVX512DQ, AVX512_VNNI instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_vnni(); + +/** + * @brief Checks whether CPU supports BFloat16 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAVX512_BF16 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_bfloat16(); + +/** + * @brief Checks whether CPU supports AMX int8 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAMX_INT8 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_amx_int8(); + +/** + * @brief Checks whether CPU supports AMX bf16 capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAMX_BF16 instructions are available, `false` otherwise + */ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_amx_bf16(); + +/** + * @brief Checks whether CPU supports AMX capability + * @ingroup ov_dev_api_system_conf + * @return `True` if tAMX_INT8 or tAMX_BF16 instructions are available, `false` otherwise +
*/ +OPENVINO_RUNTIME_API bool with_cpu_x86_avx512_core_amx(); + +/** + * @enum ColumnOfProcessorTypeTable + * @brief This enum contains the definition of each column in the processor type table, which is based on CPU core types. It will + * be extended to support other CPU core types like ARM. + * + * The following are two examples of the processor type table. + * 1. Processor table of two socket CPUs XEON server + * + * ALL_PROC | MAIN_CORE_PROC | EFFICIENT_CORE_PROC | HYPER_THREADING_PROC + * 96 48 0 48 // Total number of two sockets + * 48 24 0 24 // Number of socket one + * 48 24 0 24 // Number of socket two + * + * 2. Processor table of one socket CPU desktop + * + * ALL_PROC | MAIN_CORE_PROC | EFFICIENT_CORE_PROC | HYPER_THREADING_PROC + * 32 8 16 8 // Total number of one socket + */ +enum ColumnOfProcessorTypeTable { + ALL_PROC = 0, //!< All processors, regardless of backend cpu + MAIN_CORE_PROC = 1, //!< Processor based on physical core of Intel Performance-cores + EFFICIENT_CORE_PROC = 2, //!< Processor based on Intel Efficient-cores + HYPER_THREADING_PROC = 3, //!< Processor based on logical core of Intel Performance-cores + PROC_TYPE_TABLE_SIZE = 4 //!< Size of processor type table +}; + +/** + * @enum ColumnOfCPUMappingTable + * @brief This enum contains the definition of each column in the CPU mapping table, which uses the processor id as the index. + * + * GROUP_ID is generated according to the following rules. + * 1. If one MAIN_CORE_PROC and one HYPER_THREADING_PROC are based on same Performance-cores, they are in one group. + * 2. If some EFFICIENT_CORE_PROC share one L2 cache, they are in one group. + * 3. There are no duplicate group IDs in the system. + * + * The following is an example of the CPU mapping table. + * 1. Four processors of two Pcore + * 2. Four processors of four Ecores shared L2 cache + * + * PROCESSOR_ID | SOCKET_ID | CORE_ID | CORE_TYPE | GROUP_ID | Used + * 0 0 0 3 0 0 + * 1 0 0 1 0 0 + * 2 0 1 3 1 0 + * 3 0 1 1 1 0 + * 4 0 2 2 2 0 + * 5 0 3 2 2 0 + * 6 0 4 2 2 0 + * 7 0 5 2 2 0 + */ +enum ColumnOfCPUMappingTable { + CPU_MAP_PROCESSOR_ID = 0, //!< column for processor id of the processor + CPU_MAP_SOCKET_ID = 1, //!< column for socket id of the processor + CPU_MAP_CORE_ID = 2, //!< column for hardware core id of the processor + CPU_MAP_CORE_TYPE = 3, //!< column for CPU core type corresponding to the processor + CPU_MAP_GROUP_ID = 4, //!< column for group id to the processor. Processors in one group have dependency. + CPU_MAP_USED_FLAG = 5, //!< column for resource management of the processor + CPU_MAP_TABLE_SIZE = 6 //!< Size of CPU mapping table +}; + +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp new file mode 100644 index 00000000000000..0180b9a475df98 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/cpu_streams_executor.hpp + * @brief A header file for OpenVINO CPU-Streams-based Executor implementation. + */ + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/threading/istreams_executor.hpp" + +namespace ov { +namespace threading { + +/** + * @class CPUStreamsExecutor + * @ingroup ov_dev_api_threading + * @brief CPU Streams executor implementation.
The executor splits the CPU into groups of threads, + * that can be pinned to cores or NUMA nodes. + * It uses custom threads to pull tasks from single queue. + */ +class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor { +public: + /** + * @brief Constructor + * @param config Stream executor parameters + */ + explicit CPUStreamsExecutor(const Config& config); + + /** + * @brief A class destructor + */ + ~CPUStreamsExecutor() override; + + void run(Task task) override; + + void execute(Task task) override; + + int get_stream_id() override; + + int get_numa_node_id() override; + +private: + struct Impl; + std::unique_ptr _impl; +}; + +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/executor_manager.hpp b/src/inference/dev_api/openvino/runtime/threading/executor_manager.hpp new file mode 100644 index 00000000000000..6e7735a6906056 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/executor_manager.hpp @@ -0,0 +1,80 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief OpenVINO Runtime Executor Manager + * @file openvino/runtime/threading/executor_manager.hpp + */ + +#pragma once + +#include "openvino/runtime/common.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" +#include "threading/ie_istreams_executor.hpp" + +namespace ov { + +namespace threading { + +/** + * @interface ExecutorManager + * @brief Interface for tasks execution manager. + * This is global point for getting task executor objects by string id. + * It's necessary in multiple asynchronous requests for having unique executors to avoid oversubscription. + * E.g. There 2 task executors for CPU device: one - in FPGA, another - in OneDNN. Parallel execution both of them leads + * to not optimal CPU usage. More efficient to run the corresponding tasks one by one via single executor. 
+ * @ingroup ov_dev_api_threading + */ +class OPENVINO_RUNTIME_API ExecutorManager { +public: + /** + * @brief Returns an executor by its unique identifier + * @param id A unique identifier of the device (usually a string representation of TargetDevice) + * @return A shared pointer to an existing or newly created ITaskExecutor + */ + virtual std::shared_ptr get_executor(const std::string& id) = 0; + + /** + * @brief Returns an idle CPU streams executor + * + * @param config Streams executor config + * + * @return A shared pointer to an existing idle or newly created streams executor + */ + virtual std::shared_ptr get_idle_cpu_streams_executor( + const ov::threading::IStreamsExecutor::Config& config) = 0; + + /** + * @brief Configures the executor manager + * + * @param properties A map with the configuration + */ + virtual void set_property(const ov::AnyMap& properties) = 0; + /** + * @brief Returns a configuration value + * + * @param name The property name + * + * @return The property value + */ + virtual ov::Any get_property(const std::string& name) const = 0; + + /** + * @cond + */ + virtual size_t get_executors_number() const = 0; + + virtual size_t get_idle_cpu_streams_executors_number() const = 0; + + virtual void clear(const std::string& id = {}) = 0; + /** + * @endcond + */ + virtual ~ExecutorManager() = default; +}; + +OPENVINO_API std::shared_ptr executor_manager(); +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp new file mode 100644 index 00000000000000..aead0f07cc1418 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp @@ -0,0 +1,168 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/istreams_executor.hpp + * @brief A header file for the OpenVINO Streams-based Executor interface + */ + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/common.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" + +namespace ov { +namespace threading { + +/** + * @interface IStreamsExecutor + * @ingroup ov_dev_api_threading + * @brief Interface for Streams Task Executor. This executor groups worker threads into so-called `streams`. + * @par CPU + * The executor executes all parallel tasks using threads from one stream. + * With proper pinning settings it should reduce cache misses for memory-bound workloads.
+ * @par NUMA + * On NUMA hosts GetNumaNodeId() method can be used to define the NUMA node of current stream + */ +class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { +public: + /** + * @brief Defines inference thread binding type + */ + enum ThreadBindingType : std::uint8_t { + NONE, //!< Don't bind the inference threads + CORES, //!< Bind inference threads to the CPU cores (round-robin) + // the following modes are implemented only for the TBB code-path: + NUMA, //!< Bind to the NUMA nodes (default mode for the non-hybrid CPUs on the Win/MacOS, where the 'CORES' is + //!< not implemeneted) + HYBRID_AWARE //!< Let the runtime bind the inference threads depending on the cores type (default mode for the + //!< hybrid CPUs) + }; + + /** + * @brief Defines IStreamsExecutor configuration + */ + struct OPENVINO_RUNTIME_API Config { + /** + * @brief Sets configuration + * @param properties map of properties + */ + void set_property(const ov::AnyMap& properties); + + /** + * @brief Sets configuration + * @param key property name + * @param value property value + */ + void set_property(const std::string& key, const ov::Any& value); + + /** + * @brief Return configuration value + * @param key configuration key + * @return configuration value wrapped into ov::Any + */ + ov::Any get_property(const std::string& key) const; + + /** + * @brief Create appropriate multithreaded configuration + * filing unconfigured values from initial configuration using hardware properties + * @param initial Inital configuration + * @param fp_intesive additional hint for the the (Hybrid) core-types selection logic + * whether the executor should be configured for floating point intensive work (as opposite to int8 + * intensive) + * @return configured values + */ + static Config make_default_multi_threaded(const Config& initial, const bool fp_intesive = true); + static int get_default_num_streams( + const bool enable_hyper_thread = true); // no network specifics considered (only CPU's caps); + static int get_hybrid_num_streams(std::map& config, const int stream_mode); + static void update_hybrid_custom_threads(Config& config); + + std::string _name; //!< Used by `ITT` to name executor threads + int _streams = 1; //!< Number of streams. + int _threadsPerStream = 0; //!< Number of threads per stream that executes `ie_parallel` calls + ThreadBindingType _threadBindingType = ThreadBindingType::NONE; //!< Thread binding to hardware resource type. + //!< No binding by default + int _threadBindingStep = 1; //!< In case of @ref CORES binding offset type + //!< thread binded to cores with defined step + int _threadBindingOffset = 0; //!< In case of @ref CORES binding offset type thread binded to cores + //!< starting from offset + int _threads = 0; //!< Number of threads distributed between streams. + //!< Reserved. Should not be used. 
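+        // The fields below describe the hybrid-CPU (Performance-core / Efficient-core) stream partitioning;
+        // they are primarily consumed by the ThreadBindingType::HYBRID_AWARE code path of CPUStreamsExecutor.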
+ int _big_core_streams = 0; //!< Number of streams in Performance-core(big core) + int _small_core_streams = 0; //!< Number of streams in Efficient-core(small core) + int _threads_per_stream_big = 0; //!< Threads per stream in big cores + int _threads_per_stream_small = 0; //!< Threads per stream in small cores + int _small_core_offset = 0; //!< Calculate small core start offset when binding cpu cores + bool _enable_hyper_thread = true; //!< enable hyper thread + enum StreamMode { DEFAULT, AGGRESSIVE, LESSAGGRESSIVE }; + enum PreferredCoreType { + ANY, + LITTLE, + BIG, + ROUND_ROBIN // used w/multiple streams to populate the Big cores first, then the Little, then wrap around + // (for large #streams) + } _threadPreferredCoreType = + PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB to affinitize + + /** + * @brief A constructor with arguments + * + * @param[in] name The executor name + * @param[in] streams @copybrief Config::_streams + * @param[in] threadsPerStream @copybrief Config::_threadsPerStream + * @param[in] threadBindingType @copybrief Config::_threadBindingType + * @param[in] threadBindingStep @copybrief Config::_threadBindingStep + * @param[in] threadBindingOffset @copybrief Config::_threadBindingOffset + * @param[in] threads @copybrief Config::_threads + * @param[in] threadPreferBigCores @copybrief Config::_threadPreferBigCores + */ + Config(std::string name = "StreamsExecutor", + int streams = 1, + int threadsPerStream = 0, + ThreadBindingType threadBindingType = ThreadBindingType::NONE, + int threadBindingStep = 1, + int threadBindingOffset = 0, + int threads = 0, + PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) + : _name{name}, + _streams{streams}, + _threadsPerStream{threadsPerStream}, + _threadBindingType{threadBindingType}, + _threadBindingStep{threadBindingStep}, + _threadBindingOffset{threadBindingOffset}, + _threads{threads}, + _threadPreferredCoreType(threadPreferredCoreType) {} + }; + + /** + * @brief A virtual destructor + */ + ~IStreamsExecutor() override; + + /** + * @brief Return the index of current stream + * @return An index of current stream. Or throw exceptions if called not from stream thread + */ + virtual int get_stream_id() = 0; + + /** + * @brief Return the id of current NUMA Node + * @return `ID` of current NUMA Node, or throws exceptions if called not from stream thread + */ + virtual int get_numa_node_id() = 0; + + /** + * @brief Execute the task in the current thread using streams executor configuration and constraints + * @param task A task to start + */ + virtual void execute(Task task) = 0; +}; + +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/itask_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/itask_executor.hpp new file mode 100644 index 00000000000000..3cb42e3200bb0c --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/itask_executor.hpp @@ -0,0 +1,76 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/task_executor.hpp + * @brief A header file for OpenVINO Task Executor Interface + */ + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/common.hpp" + +namespace ov { +namespace threading { + +/** + * @brief OpenVINO Task Executor can use any copyable callable without parameters and output as a task. 
+ * It will be wrapped into a std::function object + * @ingroup ov_dev_api_threading + */ +using Task = std::function; + +/** +* @interface ITaskExecutor +* @ingroup ov_dev_api_threading +* @brief Interface for Task Executor. +* OpenVINO uses the `ov::ITaskExecutor` interface to run all asynchronous internal tasks. +* Different implementations of task executors can be used for different purposes: +* - To improve cache locality of memory-bound CPU tasks some executors can limit a task's affinity and maximum +concurrency. +* - An executor with one worker thread can be used to serialize access to an acceleration device. +* - An immediate task executor can be used to satisfy the `ov::ITaskExecutor` interface restrictions but +run tasks in the current thread. +* @note Implementations should guarantee thread safety of all methods +* @section Synchronization +* It is the `ov::ITaskExecutor` user's responsibility to wait for task execution completion. +* The `c++11` standard way to wait for task completion is to use `std::packaged_task` or `std::promise` with +`std::future`. +* Here is an example of how to use `std::promise` to wait for task completion and process the task's exceptions: + * @snippet example_itask_executor.cpp itask_executor:define_pipeline + */ +class OPENVINO_RUNTIME_API ITaskExecutor { +public: + /** + * @brief Destroys the object. + */ + virtual ~ITaskExecutor() = default; + + /** + * @brief Execute ov::Task inside task executor context + * @param task A task to start + */ + virtual void run(Task task) = 0; + + /** + * @brief Executes all of the tasks and waits for their completion. + * The default run_and_wait() method implementation uses the run() pure virtual method + * and higher level synchronization primitives from the STL. + * Each task is wrapped into std::packaged_task, which returns a std::future. + * std::packaged_task will call the task and signal to std::future that the task is finished + * or that an exception was thrown from the task. + * Then std::future is used to wait for task execution completion and + * to extract the task's exception. + * @note run_and_wait() does not copy or capture tasks! + * @param tasks A vector of tasks to execute + */ + virtual void run_and_wait(const std::vector& tasks); +}; + +} // namespace threading +} // namespace ov diff --git a/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp b/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp index f4b31d95fc8cb0..12c2232a572e5d 100644 --- a/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp +++ b/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp @@ -33,7 +33,7 @@ class INFERENCE_ENGINE_API_CLASS(CPUStreamsExecutor) : public IStreamsExecutor { * @brief Constructor * @param config Stream executor parameters */ - explicit CPUStreamsExecutor(const Config& config = {}); + explicit CPUStreamsExecutor(const InferenceEngine::IStreamsExecutor::Config& config = {}); /** * @brief A class destructor diff --git a/src/inference/dev_api/threading/ie_executor_manager.hpp b/src/inference/dev_api/threading/ie_executor_manager.hpp index 2504884d071d95..ef789c82c48234 100644 --- a/src/inference/dev_api/threading/ie_executor_manager.hpp +++ b/src/inference/dev_api/threading/ie_executor_manager.hpp @@ -18,8 +18,18 @@ #include "threading/ie_istreams_executor.hpp" #include "threading/ie_itask_executor.hpp" +namespace ov { +namespace threading { + +class ExecutorManager; + +} +} // namespace ov + namespace InferenceEngine { +class IPluginWrapper; + /** * @interface ExecutorManager * @brief Interface for tasks execution manager.
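As an illustration of the ov::threading::ITaskExecutor interface added above, a minimal "immediate" executor that runs every task inline in the calling thread could be sketched as follows. This is not part of the patch and the class name is hypothetical; the patch itself reworks a similar ImmediateStreamsExecutor in iasync_infer_request.cpp to derive from the new interface.

// Minimal sketch (not from this patch): an executor that satisfies ov::threading::ITaskExecutor
// by running every task synchronously in the calling thread.
#include "openvino/runtime/threading/itask_executor.hpp"

class InlineTaskExecutor : public ov::threading::ITaskExecutor {
public:
    // run() is the only pure virtual method; run_and_wait() has a default implementation.
    void run(ov::threading::Task task) override {
        task();  // execute inline, no queuing and no extra threads
    }
};

With zero streams configured, CPUStreamsExecutor::run() later in this patch falls back to a roughly comparable deferred, in-thread execution path.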
@@ -76,8 +86,15 @@ class INFERENCE_ENGINE_API_CLASS(ExecutorManager) { */ virtual void setTbbFlag(bool flag) = 0; virtual bool getTbbFlag() = 0; + +private: + virtual std::shared_ptr get_ov_manager() const = 0; + friend class IPluginWrapper; }; INFERENCE_ENGINE_API_CPP(ExecutorManager::Ptr) executorManager(); +std::shared_ptr create_old_manager( + const std::shared_ptr& manager); + } // namespace InferenceEngine diff --git a/src/inference/dev_api/threading/ie_istreams_executor.hpp b/src/inference/dev_api/threading/ie_istreams_executor.hpp index efecaf606faa32..bb2bbeca0b70d2 100644 --- a/src/inference/dev_api/threading/ie_istreams_executor.hpp +++ b/src/inference/dev_api/threading/ie_istreams_executor.hpp @@ -14,6 +14,7 @@ #include #include "ie_parameter.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" #include "threading/ie_itask_executor.hpp" namespace InferenceEngine { @@ -28,30 +29,17 @@ namespace InferenceEngine { * @par NUMA * On NUMA hosts GetNumaNodeId() method can be used to define the NUMA node of current stream */ -class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { +class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, public ov::threading::IStreamsExecutor { public: /** * A shared pointer to IStreamsExecutor interface */ using Ptr = std::shared_ptr; - /** - * @brief Defines inference thread binding type - */ - enum ThreadBindingType : std::uint8_t { - NONE, //!< Don't bind the inference threads - CORES, //!< Bind inference threads to the CPU cores (round-robin) - // the following modes are implemented only for the TBB code-path: - NUMA, //!< Bind to the NUMA nodes (default mode for the non-hybrid CPUs on the Win/MacOS, where the 'CORES' is - //!< not implemeneted) - HYBRID_AWARE //!< Let the runtime bind the inference threads depending on the cores type (default mode for the - //!< hybrid CPUs) - }; - /** * @brief Defines IStreamsExecutor configuration */ - struct INFERENCE_ENGINE_API_CLASS(Config) { + struct INFERENCE_ENGINE_API_CLASS(Config) : public ov::threading::IStreamsExecutor::Config { /** * @brief Supported Configuration keys * @return vector of supported configuration keys @@ -87,33 +75,6 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { static int GetHybridNumStreams(std::map& config, const int stream_mode); static void UpdateHybridCustomThreads(Config& config); - std::string _name; //!< Used by `ITT` to name executor threads - int _streams = 1; //!< Number of streams. - int _threadsPerStream = 0; //!< Number of threads per stream that executes `ie_parallel` calls - ThreadBindingType _threadBindingType = ThreadBindingType::NONE; //!< Thread binding to hardware resource type. - //!< No binding by default - int _threadBindingStep = 1; //!< In case of @ref CORES binding offset type - //!< thread binded to cores with defined step - int _threadBindingOffset = 0; //!< In case of @ref CORES binding offset type thread binded to cores - //!< starting from offset - int _threads = 0; //!< Number of threads distributed between streams. - //!< Reserved. Should not be used. 
- int _big_core_streams = 0; //!< Number of streams in Performance-core(big core) - int _small_core_streams = 0; //!< Number of streams in Efficient-core(small core) - int _threads_per_stream_big = 0; //!< Threads per stream in big cores - int _threads_per_stream_small = 0; //!< Threads per stream in small cores - int _small_core_offset = 0; //!< Calculate small core start offset when binding cpu cores - bool _enable_hyper_thread = true; //!< enable hyper thread - enum StreamMode { DEFAULT, AGGRESSIVE, LESSAGGRESSIVE }; - enum PreferredCoreType { - ANY, - LITTLE, - BIG, - ROUND_ROBIN // used w/multiple streams to populate the Big cores first, then the Little, then wrap around - // (for large #streams) - } _threadPreferredCoreType = - PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB to affinitize - /** * @brief A constructor with arguments * @@ -134,14 +95,17 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { int threadBindingOffset = 0, int threads = 0, PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) - : _name{name}, - _streams{streams}, - _threadsPerStream{threadsPerStream}, - _threadBindingType{threadBindingType}, - _threadBindingStep{threadBindingStep}, - _threadBindingOffset{threadBindingOffset}, - _threads{threads}, - _threadPreferredCoreType(threadPreferredCoreType) {} + : ov::threading::IStreamsExecutor::Config(name, + streams, + threadsPerStream, + threadBindingType, + threadBindingStep, + threadBindingOffset, + threads, + threadPreferredCoreType) {} + + Config(const ov::threading::IStreamsExecutor::Config& config) + : ov::threading::IStreamsExecutor::Config(config) {} }; /** @@ -166,6 +130,18 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor { * @param task A task to start */ virtual void Execute(Task task) = 0; + + int get_stream_id() override { + return GetStreamId(); + } + + int get_numa_node_id() override { + return GetNumaNodeId(); + } + + void execute(Task task) override { + Execute(task); + } }; } // namespace InferenceEngine diff --git a/src/inference/dev_api/threading/ie_itask_executor.hpp b/src/inference/dev_api/threading/ie_itask_executor.hpp index 90557d08f9f92c..1fc2923fca92e7 100644 --- a/src/inference/dev_api/threading/ie_itask_executor.hpp +++ b/src/inference/dev_api/threading/ie_itask_executor.hpp @@ -14,6 +14,7 @@ #include #include "ie_api.h" +#include "openvino/runtime/threading/itask_executor.hpp" namespace InferenceEngine { @@ -22,7 +23,7 @@ namespace InferenceEngine { * It would be wrapped into std::function object * @ingroup ie_dev_api_threading */ -using Task = std::function; +using Task = ov::threading::Task; /** * @interface ITaskExecutor @@ -36,14 +37,13 @@ concurrency. * - Immediate task executor can be used to satisfy `InferenceEngine::ITaskExecutor` interface restrictions but run tasks in current thread. * @note Implementation should guaranty thread safety of all methods -* @section Synchronization * It is `InferenceEngine::ITaskExecutor` user responsibility to wait for task execution completion. * The `c++11` standard way to wait task completion is to use `std::packaged_task` or `std::promise` with `std::future`. 
* Here is an example of how to use `std::promise` to wait task completion and process task's exceptions: * @snippet example_itask_executor.cpp itask_executor:define_pipeline */ -class INFERENCE_ENGINE_API_CLASS(ITaskExecutor) { +class INFERENCE_ENGINE_API_CLASS(ITaskExecutor) : virtual public ov::threading::ITaskExecutor { public: /** * A shared pointer to ITaskExecutor interface @@ -55,12 +55,6 @@ class INFERENCE_ENGINE_API_CLASS(ITaskExecutor) { */ virtual ~ITaskExecutor() = default; - /** - * @brief Execute InferenceEngine::Task inside task executor context - * @param task A task to start - */ - virtual void run(Task task) = 0; - /** * @brief Execute all of the tasks and waits for its completion. * Default runAndWait() method implementation uses run() pure virtual method diff --git a/src/inference/include/openvino/runtime/core.hpp b/src/inference/include/openvino/runtime/core.hpp index a349378e8e32b1..55250ec3c5484f 100644 --- a/src/inference/include/openvino/runtime/core.hpp +++ b/src/inference/include/openvino/runtime/core.hpp @@ -733,6 +733,6 @@ class OPENVINO_RUNTIME_API Core { * You might want to use this function if you are developing a dynamically-loaded library which should clean up all * resources after itself when the library is unloaded. */ -void OPENVINO_RUNTIME_API shutdown(); +OPENVINO_RUNTIME_API void shutdown(); } // namespace ov diff --git a/src/inference/include/openvino/runtime/remote_tensor.hpp b/src/inference/include/openvino/runtime/remote_tensor.hpp index 321a2bcab51fcd..938398a07beecb 100644 --- a/src/inference/include/openvino/runtime/remote_tensor.hpp +++ b/src/inference/include/openvino/runtime/remote_tensor.hpp @@ -44,6 +44,8 @@ class OPENVINO_RUNTIME_API RemoteTensor : public Tensor { template T* data() = delete; + void copy_to(ov::Tensor& dst) const = delete; + /** * @brief Returns a map of device-specific parameters required for low-level * operations with underlying object. 
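The std::promise/std::future waiting pattern referenced by the ITaskExecutor documentation above (the actual snippet lives in example_itask_executor.cpp) can be sketched roughly as follows; the wait_for_task helper and its executor argument are illustrative only and assume any implementation of the new interface.

// Sketch of waiting for a task submitted to an ov::threading::ITaskExecutor and
// propagating the task's exception back to the waiting thread.
#include <future>
#include <memory>

#include "openvino/runtime/threading/itask_executor.hpp"

void wait_for_task(const std::shared_ptr<ov::threading::ITaskExecutor>& executor) {
    std::promise<void> promise;
    std::future<void> future = promise.get_future();
    executor->run([&promise] {
        try {
            // ... do the actual work here ...
            promise.set_value();
        } catch (...) {
            // forward the task's exception to the waiting thread
            promise.set_exception(std::current_exception());
        }
    });
    future.get();  // blocks until the task completes; rethrows the captured exception, if any
}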
diff --git a/src/inference/src/dev/converter_utils.cpp b/src/inference/src/dev/converter_utils.cpp index 88bded8388127a..8e56463094a921 100644 --- a/src/inference/src/dev/converter_utils.cpp +++ b/src/inference/src/dev/converter_utils.cpp @@ -34,8 +34,10 @@ #include "openvino/runtime/profiling_info.hpp" #include "openvino/runtime/remote_context.hpp" #include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" #include "openvino/runtime/variable_state.hpp" #include "so_ptr.hpp" +#include "threading/ie_executor_manager.hpp" #include "transformations/utils/utils.hpp" namespace { @@ -221,7 +223,7 @@ class IInferencePluginWrapper : public InferenceEngine::IInferencePlugin { version.description = ver.description; SetVersion(version); _isNewAPI = plugin->is_new_api(); - _executorManager = plugin->get_executor_manager(); + _executorManager = InferenceEngine::create_old_manager(plugin->get_executor_manager()); } std::string GetName() const noexcept override { return m_plugin->get_device_name(); diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index f31b3df76ffda5..7c87a7c3d9e5aa 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -28,6 +28,7 @@ #include "openvino/pass/manager.hpp" #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/remote_context.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" #include "openvino/util/common_util.hpp" #include "openvino/util/shared_object.hpp" #include "preprocessing/preprocessing.hpp" @@ -57,7 +58,7 @@ void stripDeviceName(std::string& device, const std::string& substr) { ov::CoreImpl::CoreImpl(bool _newAPI) : m_new_api(_newAPI) { add_mutex(""); // Register global mutex - executorManagerPtr = InferenceEngine::executorManager(); + m_executor_manager = ov::threading::executor_manager(); for (const auto& it : ov::get_available_opsets()) { opsetNames.insert(it.first); } @@ -632,7 +633,7 @@ void ov::CoreImpl::set_property(const std::string& device_name, const AnyMap& pr ov::Any ov::CoreImpl::get_property_for_core(const std::string& name) const { if (name == ov::force_tbb_terminate.name()) { - const auto flag = InferenceEngine::executorManager()->getTbbFlag(); + const auto flag = ov::threading::executor_manager()->get_property(name).as(); return decltype(ov::force_tbb_terminate)::value_type(flag); } else if (name == ov::cache_dir.name()) { return ov::Any(coreConfig.get_cache_dir()); @@ -993,7 +994,7 @@ void ov::CoreImpl::CoreConfig::set_and_update(ov::AnyMap& config) { it = config.find(ov::force_tbb_terminate.name()); if (it != config.end()) { auto flag = it->second.as() == CONFIG_VALUE(YES) ? 
true : false; - InferenceEngine::executorManager()->setTbbFlag(flag); + ov::threading::executor_manager()->set_property({{it->first, flag}}); config.erase(it); } diff --git a/src/inference/src/dev/core_impl.hpp b/src/inference/src/dev/core_impl.hpp index 0d74145f2aea6e..7e223202f038fd 100644 --- a/src/inference/src/dev/core_impl.hpp +++ b/src/inference/src/dev/core_impl.hpp @@ -21,7 +21,7 @@ #include "openvino/core/version.hpp" #include "openvino/runtime/common.hpp" #include "openvino/runtime/icompiled_model.hpp" -#include "threading/ie_executor_manager.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" #ifdef OPENVINO_STATIC_LIBRARY # include "ie_plugins.hpp" @@ -162,7 +162,7 @@ class CoreImpl : public InferenceEngine::ICore, public std::enable_shared_from_t } }; - InferenceEngine::ExecutorManager::Ptr executorManagerPtr; + std::shared_ptr m_executor_manager; mutable std::unordered_set opsetNames; // TODO: make extensions to be optional with conditional compilation mutable std::vector extensions; diff --git a/src/inference/src/dev/iasync_infer_request.cpp b/src/inference/src/dev/iasync_infer_request.cpp index 385baba838c9ed..45633fa76166e9 100644 --- a/src/inference/src/dev/iasync_infer_request.cpp +++ b/src/inference/src/dev/iasync_infer_request.cpp @@ -14,13 +14,13 @@ namespace { -struct ImmediateStreamsExecutor : public InferenceEngine::ITaskExecutor { - explicit ImmediateStreamsExecutor(const InferenceEngine::IStreamsExecutor::Ptr& streamsExecutor) +struct ImmediateStreamsExecutor : public ov::threading::ITaskExecutor { + explicit ImmediateStreamsExecutor(const std::shared_ptr& streamsExecutor) : _streamsExecutor{streamsExecutor} {} void run(InferenceEngine::Task task) override { - _streamsExecutor->Execute(std::move(task)); + _streamsExecutor->execute(std::move(task)); } - InferenceEngine::IStreamsExecutor::Ptr _streamsExecutor; + std::shared_ptr _streamsExecutor; }; } // namespace @@ -30,8 +30,8 @@ ov::IAsyncInferRequest::~IAsyncInferRequest() { } ov::IAsyncInferRequest::IAsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor) + const std::shared_ptr& task_executor, + const std::shared_ptr& callback_executor) : m_sync_request(request), m_request_executor(task_executor), m_callback_executor(callback_executor) { @@ -117,7 +117,7 @@ void ov::IAsyncInferRequest::start_async_thread_unsafe() { void ov::IAsyncInferRequest::run_first_stage(const Pipeline::iterator itBeginStage, const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor) { + const std::shared_ptr callbackExecutor) { auto& firstStageExecutor = std::get(*itBeginStage); OPENVINO_ASSERT(nullptr != firstStageExecutor); firstStageExecutor->run(make_next_stage_task(itBeginStage, itEndStage, std::move(callbackExecutor))); @@ -126,9 +126,9 @@ void ov::IAsyncInferRequest::run_first_stage(const Pipeline::iterator itBeginSta InferenceEngine::Task ov::IAsyncInferRequest::make_next_stage_task( const Pipeline::iterator itStage, const Pipeline::iterator itEndStage, - const InferenceEngine::ITaskExecutor::Ptr callbackExecutor) { + const std::shared_ptr callbackExecutor) { return std::bind( - [this, itStage, itEndStage](InferenceEngine::ITaskExecutor::Ptr& callbackExecutor) mutable { + [this, itStage, itEndStage](std::shared_ptr& callbackExecutor) mutable { std::exception_ptr currentException = nullptr; auto& thisStage = *itStage; auto itNextStage = itStage + 1; diff --git 
a/src/inference/src/dev/icompiled_model.cpp b/src/inference/src/dev/icompiled_model.cpp index c3e0796ab754bd..82b94d511d2a83 100644 --- a/src/inference/src/dev/icompiled_model.cpp +++ b/src/inference/src/dev/icompiled_model.cpp @@ -11,8 +11,8 @@ ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor) + const std::shared_ptr& task_executor, + const std::shared_ptr& callback_executor) : m_plugin(plugin), m_task_executor(task_executor), m_callback_executor(callback_executor) { @@ -86,10 +86,10 @@ std::shared_ptr ov::ICompiledModel::create_infer_request const std::shared_ptr& ov::ICompiledModel::get_plugin() const { return m_plugin; } -const InferenceEngine::ITaskExecutor::Ptr ov::ICompiledModel::get_task_executor() const { +const std::shared_ptr ov::ICompiledModel::get_task_executor() const { return m_task_executor; } -const InferenceEngine::ITaskExecutor::Ptr ov::ICompiledModel::get_callback_executor() const { +const std::shared_ptr ov::ICompiledModel::get_callback_executor() const { return m_callback_executor; } diff --git a/src/inference/src/dev/icompiled_model_wrapper.cpp b/src/inference/src/dev/icompiled_model_wrapper.cpp index b0144b2a5fc416..189ab993217f9a 100644 --- a/src/inference/src/dev/icompiled_model_wrapper.cpp +++ b/src/inference/src/dev/icompiled_model_wrapper.cpp @@ -4,9 +4,8 @@ #include "icompiled_model_wrapper.hpp" -#include - #include "dev/converter_utils.hpp" +#include "ie_plugin_config.hpp" InferenceEngine::ICompiledModelWrapper::ICompiledModelWrapper( const std::shared_ptr& model) diff --git a/src/inference/src/dev/iplugin.cpp b/src/inference/src/dev/iplugin.cpp index 73476d21386942..5bed9efb18f92f 100644 --- a/src/inference/src/dev/iplugin.cpp +++ b/src/inference/src/dev/iplugin.cpp @@ -4,7 +4,7 @@ #include "openvino/runtime/iplugin.hpp" -ov::IPlugin::IPlugin() : m_executor_manager(InferenceEngine::executorManager()), m_is_new_api(true) {} +ov::IPlugin::IPlugin() : m_executor_manager(ov::threading::executor_manager()), m_is_new_api(true) {} void ov::IPlugin::set_version(const ov::Version& version) { m_version = version; @@ -42,7 +42,7 @@ bool ov::IPlugin::is_new_api() const { return m_is_new_api; } -const std::shared_ptr& ov::IPlugin::get_executor_manager() const { +const std::shared_ptr& ov::IPlugin::get_executor_manager() const { return m_executor_manager; } diff --git a/src/inference/src/dev/iplugin_wrapper.cpp b/src/inference/src/dev/iplugin_wrapper.cpp index 36207adf48539f..972d4d62bb4a46 100644 --- a/src/inference/src/dev/iplugin_wrapper.cpp +++ b/src/inference/src/dev/iplugin_wrapper.cpp @@ -9,6 +9,7 @@ #include "any_copy.hpp" #include "dev/converter_utils.hpp" #include "ie_icore.hpp" +#include "threading/ie_executor_manager.hpp" namespace InferenceEngine { @@ -20,7 +21,7 @@ IPluginWrapper::IPluginWrapper(const std::shared_ptrGetName(); m_is_new_api = m_old_plugin->IsNewAPI(); m_core = m_old_plugin->GetCore(); - m_executor_manager = m_old_plugin->executorManager(); + m_executor_manager = m_old_plugin->executorManager()->get_ov_manager(); } const std::shared_ptr& IPluginWrapper::update_exec_network( diff --git a/src/inference/src/dev/isync_infer_request.cpp b/src/inference/src/dev/isync_infer_request.cpp index c8aa79a84b971b..26ba98f1180c00 100644 --- a/src/inference/src/dev/isync_infer_request.cpp +++ b/src/inference/src/dev/isync_infer_request.cpp @@ -7,6 +7,7 @@ #include #include 
"cpp_interfaces/plugin_itt.hpp" +#include "ie_blob.h" #include "openvino/core/except.hpp" #include "openvino/core/layout.hpp" #include "openvino/core/parallel.hpp" diff --git a/src/inference/src/dev/threading/cpu_streams_executor.cpp b/src/inference/src/dev/threading/cpu_streams_executor.cpp new file mode 100644 index 00000000000000..ceb72eec87db6f --- /dev/null +++ b/src/inference/src/dev/threading/cpu_streams_executor.cpp @@ -0,0 +1,397 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/threading/cpu_streams_executor.hpp" + +#include +#include +#include +#include +#include +#include + +#include "openvino/itt.hpp" +#include "openvino/runtime/system_conf.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" +#include "threading/ie_parallel_custom_arena.hpp" +#include "threading/ie_thread_affinity.hpp" +#include "threading/ie_thread_local.hpp" + +namespace ov { +namespace threading { +struct CPUStreamsExecutor::Impl { + struct Stream { +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + struct Observer : public custom::task_scheduler_observer { + InferenceEngine::CpuSet _mask; + int _ncpus = 0; + int _threadBindingStep = 0; + int _offset = 0; + int _cpuIdxOffset = 0; + Observer(custom::task_arena& arena, + InferenceEngine::CpuSet mask, + int ncpus, + const int streamId, + const int threadsPerStream, + const int threadBindingStep, + const int threadBindingOffset, + const int cpuIdxOffset = 0) + : custom::task_scheduler_observer(arena), + _mask{std::move(mask)}, + _ncpus(ncpus), + _threadBindingStep(threadBindingStep), + _offset{streamId * threadsPerStream + threadBindingOffset}, + _cpuIdxOffset(cpuIdxOffset) {} + void on_scheduler_entry(bool) override { + InferenceEngine::PinThreadToVacantCore(_offset + tbb::this_task_arena::current_thread_index(), + _threadBindingStep, + _ncpus, + _mask, + _cpuIdxOffset); + } + void on_scheduler_exit(bool) override { + PinCurrentThreadByMask(_ncpus, _mask); + } + ~Observer() override = default; + }; +#endif + explicit Stream(Impl* impl) : _impl(impl) { + { + std::lock_guard lock{_impl->_streamIdMutex}; + if (_impl->_streamIdQueue.empty()) { + _streamId = _impl->_streamId++; + } else { + _streamId = _impl->_streamIdQueue.front(); + _impl->_streamIdQueue.pop(); + } + } + _numaNodeId = _impl->_config._streams + ? _impl->_usedNumaNodes.at((_streamId % _impl->_config._streams) / + ((_impl->_config._streams + _impl->_usedNumaNodes.size() - 1) / + _impl->_usedNumaNodes.size())) + : _impl->_usedNumaNodes.at(_streamId % _impl->_usedNumaNodes.size()); +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + const auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic + : _impl->_config._threadsPerStream; + if (ThreadBindingType::HYBRID_AWARE == _impl->_config._threadBindingType) { + if (Config::PreferredCoreType::ROUND_ROBIN != _impl->_config._threadPreferredCoreType) { + if (Config::PreferredCoreType::ANY == _impl->_config._threadPreferredCoreType) { + _taskArena.reset(new custom::task_arena{concurrency}); + } else { + const auto selected_core_type = + Config::PreferredCoreType::BIG == _impl->_config._threadPreferredCoreType + ? 
custom::info::core_types().back() // running on Big cores only + : custom::info::core_types().front(); // running on Little cores only + _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{} + .set_core_type(selected_core_type) + .set_max_concurrency(concurrency)}); + } + } else { + // assigning the stream to the core type in the round-robin fashion + // wrapping around total_streams (i.e. how many streams all different core types can handle + // together). Binding priority: Big core, Logical big core, Small core + const auto total_streams = _impl->total_streams_on_core_types.back().second; + const auto big_core_streams = _impl->total_streams_on_core_types.front().second; + const auto hybrid_core = _impl->total_streams_on_core_types.size() > 1; + const auto phy_core_streams = + _impl->_config._big_core_streams == 0 + ? 0 + : _impl->num_big_core_phys / _impl->_config._threads_per_stream_big; + const auto streamId_wrapped = _streamId % total_streams; + const auto& selected_core_type = + std::find_if( + _impl->total_streams_on_core_types.cbegin(), + _impl->total_streams_on_core_types.cend(), + [streamId_wrapped](const decltype(_impl->total_streams_on_core_types)::value_type& p) { + return p.second > streamId_wrapped; + }) + ->first; + const auto small_core = hybrid_core && selected_core_type == 0; + const auto logic_core = !small_core && streamId_wrapped >= phy_core_streams; + const auto small_core_skip = small_core && _impl->_config._threads_per_stream_small == 3 && + _impl->_config._small_core_streams > 1; + const auto max_concurrency = + small_core ? _impl->_config._threads_per_stream_small : _impl->_config._threads_per_stream_big; + // Special handling of _threads_per_stream_small == 3 + const auto small_core_id = small_core_skip ? 0 : streamId_wrapped - big_core_streams; + const auto stream_id = + hybrid_core + ? (small_core ? small_core_id + : (logic_core ? streamId_wrapped - phy_core_streams : streamId_wrapped)) + : streamId_wrapped; + const auto thread_binding_step = hybrid_core ? (small_core ? _impl->_config._threadBindingStep : 2) + : _impl->_config._threadBindingStep; + // Special handling of _threads_per_stream_small == 3, need to skip 4 (Four cores share one L2 cache + // on the small core), stream_id = 0, cpu_idx_offset cumulative plus 4 + const auto small_core_offset = + small_core_skip ? _impl->_config._small_core_offset + (streamId_wrapped - big_core_streams) * 4 + : _impl->_config._small_core_offset; + const auto cpu_idx_offset = + hybrid_core + // Prevent conflicts with system scheduling, so default cpu id on big core starts from 1 + ? (small_core ? small_core_offset : (logic_core ? 
0 : 1)) + : 0; + + _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{} + .set_core_type(selected_core_type) + .set_max_concurrency(max_concurrency)}); + InferenceEngine::CpuSet processMask; + int ncpus = 0; + std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask(); + if (nullptr != processMask) { + _observer.reset(new Observer{*_taskArena, + std::move(processMask), + ncpus, + stream_id, + max_concurrency, + thread_binding_step, + _impl->_config._threadBindingOffset, + cpu_idx_offset}); + _observer->observe(true); + } + } + } else if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) { + _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{_numaNodeId, concurrency}}); + } else if ((0 != _impl->_config._threadsPerStream) || + (ThreadBindingType::CORES == _impl->_config._threadBindingType)) { + _taskArena.reset(new custom::task_arena{concurrency}); + if (ThreadBindingType::CORES == _impl->_config._threadBindingType) { + InferenceEngine::CpuSet processMask; + int ncpus = 0; + std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask(); + if (nullptr != processMask) { + _observer.reset(new Observer{*_taskArena, + std::move(processMask), + ncpus, + _streamId, + _impl->_config._threadsPerStream, + _impl->_config._threadBindingStep, + _impl->_config._threadBindingOffset}); + _observer->observe(true); + } + } + } +#elif OV_THREAD == OV_THREAD_OMP + omp_set_num_threads(_impl->_config._threadsPerStream); + if (!checkOpenMpEnvVars(false) && (ThreadBindingType::NONE != _impl->_config._threadBindingType)) { + InferenceEngine::CpuSet processMask; + int ncpus = 0; + std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask(); + if (nullptr != processMask) { + parallel_nt(_impl->_config._threadsPerStream, [&](int threadIndex, int threadsPerStream) { + int thrIdx = _streamId * _impl->_config._threadsPerStream + threadIndex + + _impl->_config._threadBindingOffset; + InferenceEngine::PinThreadToVacantCore(thrIdx, + _impl->_config._threadBindingStep, + ncpus, + processMask); + }); + } + } +#elif OV_THREAD == OV_THREAD_SEQ + if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) { + InferenceEngine::PinCurrentThreadToSocket(_numaNodeId); + } else if (ThreadBindingType::CORES == _impl->_config._threadBindingType) { + InferenceEngine::CpuSet processMask; + int ncpus = 0; + std::tie(processMask, ncpus) = InferenceEngine::GetProcessMask(); + if (nullptr != processMask) { + InferenceEngine::PinThreadToVacantCore(_streamId + _impl->_config._threadBindingOffset, + _impl->_config._threadBindingStep, + ncpus, + processMask); + } + } +#endif + } + ~Stream() { + { + std::lock_guard lock{_impl->_streamIdMutex}; + _impl->_streamIdQueue.push(_streamId); + } +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + if (nullptr != _observer) { + _observer->observe(false); + } +#endif + } + + Impl* _impl = nullptr; + int _streamId = 0; + int _numaNodeId = 0; + bool _execute = false; + std::queue _taskQueue; +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + std::unique_ptr _taskArena; + std::unique_ptr _observer; +#endif + }; + + explicit Impl(const Config& config) + : _config{config}, + _streams([this] { + return std::make_shared(this); + }) { + _exectorMgr = executor_manager(); + auto numaNodes = get_available_numa_nodes(); + if (_config._streams != 0) { + std::copy_n(std::begin(numaNodes), + std::min(static_cast(_config._streams), numaNodes.size()), + std::back_inserter(_usedNumaNodes)); + } else { + _usedNumaNodes 
= numaNodes; + } +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + if (ThreadBindingType::HYBRID_AWARE == config._threadBindingType) { + const auto core_types = custom::info::core_types(); + const auto num_core_phys = get_number_of_cpu_cores(); + num_big_core_phys = get_number_of_cpu_cores(true); + const auto num_small_core_phys = num_core_phys - num_big_core_phys; + int sum = 0; + // reversed order, so BIG cores are first + for (auto iter = core_types.rbegin(); iter < core_types.rend(); iter++) { + const auto& type = *iter; + // calculating the #streams per core type + const int num_streams_for_core_type = + type == 0 ? std::max(1, + std::min(config._small_core_streams, + config._threads_per_stream_small == 0 + ? 0 + : num_small_core_phys / config._threads_per_stream_small)) + : std::max(1, + std::min(config._big_core_streams, + config._threads_per_stream_big == 0 + ? 0 + : num_big_core_phys / config._threads_per_stream_big * 2)); + sum += num_streams_for_core_type; + // prefix sum, so the core type for a given stream id will be deduced just as a upper_bound + // (notice that the map keeps the elements in the descending order, so the big cores are populated + // first) + total_streams_on_core_types.push_back({type, sum}); + } + } +#endif + for (auto streamId = 0; streamId < _config._streams; ++streamId) { + _threads.emplace_back([this, streamId] { + openvino::itt::threadName(_config._name + "_" + std::to_string(streamId)); + for (bool stopped = false; !stopped;) { + Task task; + { + std::unique_lock lock(_mutex); + _queueCondVar.wait(lock, [&] { + return !_taskQueue.empty() || (stopped = _isStopped); + }); + if (!_taskQueue.empty()) { + task = std::move(_taskQueue.front()); + _taskQueue.pop(); + } + } + if (task) { + Execute(task, *(_streams.local())); + } + } + }); + } + } + + void Enqueue(Task task) { + { + std::lock_guard lock(_mutex); + _taskQueue.emplace(std::move(task)); + } + _queueCondVar.notify_one(); + } + + void Execute(const Task& task, Stream& stream) { +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + auto& arena = stream._taskArena; + if (nullptr != arena) { + arena->execute(std::move(task)); + } else { + task(); + } +#else + task(); +#endif + } + + void Defer(Task task) { + auto& stream = *(_streams.local()); + stream._taskQueue.push(std::move(task)); + if (!stream._execute) { + stream._execute = true; + try { + while (!stream._taskQueue.empty()) { + Execute(stream._taskQueue.front(), stream); + stream._taskQueue.pop(); + } + } catch (...) 
{ + } + stream._execute = false; + } + } + + Config _config; + std::mutex _streamIdMutex; + int _streamId = 0; + std::queue _streamIdQueue; + std::vector _threads; + std::mutex _mutex; + std::condition_variable _queueCondVar; + std::queue _taskQueue; + bool _isStopped = false; + std::vector _usedNumaNodes; + InferenceEngine::ThreadLocal> _streams; +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + // stream id mapping to the core type + // stored in the reversed order (so the big cores, with the highest core_type_id value, are populated first) + // every entry is the core type and #streams that this AND ALL EARLIER entries can handle (prefix sum) + // (so mapping is actually just an upper_bound: core type is deduced from the entry for which the id < #streams) + using StreamIdToCoreTypes = std::vector>; + StreamIdToCoreTypes total_streams_on_core_types; + int num_big_core_phys; +#endif + std::shared_ptr _exectorMgr; +}; + +int CPUStreamsExecutor::get_stream_id() { + auto stream = _impl->_streams.local(); + return stream->_streamId; +} + +int CPUStreamsExecutor::get_numa_node_id() { + auto stream = _impl->_streams.local(); + return stream->_numaNodeId; +} + +CPUStreamsExecutor::CPUStreamsExecutor(const ov::threading::IStreamsExecutor::Config& config) + : _impl{new Impl{config}} {} + +CPUStreamsExecutor::~CPUStreamsExecutor() { + { + std::lock_guard lock(_impl->_mutex); + _impl->_isStopped = true; + } + _impl->_queueCondVar.notify_all(); + for (auto& thread : _impl->_threads) { + if (thread.joinable()) { + thread.join(); + } + } +} + +void CPUStreamsExecutor::execute(Task task) { + _impl->Defer(std::move(task)); +} + +void CPUStreamsExecutor::run(Task task) { + if (0 == _impl->_config._streams) { + _impl->Defer(std::move(task)); + } else { + _impl->Enqueue(std::move(task)); + } +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/dev/threading/executor_manager.cpp b/src/inference/src/dev/threading/executor_manager.cpp new file mode 100644 index 00000000000000..250217b9104267 --- /dev/null +++ b/src/inference/src/dev/threading/executor_manager.cpp @@ -0,0 +1,210 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/threading/executor_manager.hpp" + +#include "openvino/core/parallel.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "threading/ie_cpu_streams_executor.hpp" +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO +# if (TBB_INTERFACE_VERSION < 12000) +# include +# else +# include +# endif +#endif + +#include +#include +#include +#include + +namespace ov { +namespace threading { +namespace { +class ExecutorManagerImpl : public ExecutorManager { +public: + ~ExecutorManagerImpl(); + std::shared_ptr get_executor(const std::string& id) override; + std::shared_ptr get_idle_cpu_streams_executor( + const ov::threading::IStreamsExecutor::Config& config) override; + size_t get_executors_number() const override; + size_t get_idle_cpu_streams_executors_number() const override; + void clear(const std::string& id = {}) override; + void set_property(const ov::AnyMap& properties) override; + ov::Any get_property(const std::string& name) const override; + +private: + void reset_tbb(); + + std::unordered_map> executors; + std::vector>> + cpuStreamsExecutors; + mutable std::mutex streamExecutorMutex; + mutable std::mutex taskExecutorMutex; + bool tbbTerminateFlag = false; + mutable std::mutex 
global_mutex; + bool tbbThreadsCreated = false; +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO +# if (TBB_INTERFACE_VERSION < 12000) + std::shared_ptr tbbTaskScheduler = nullptr; +# else + std::shared_ptr tbbTaskScheduler = nullptr; +# endif +#endif +}; + +} // namespace + +ExecutorManagerImpl::~ExecutorManagerImpl() { + reset_tbb(); +} + +void ExecutorManagerImpl::set_property(const ov::AnyMap& properties) { + std::lock_guard guard(global_mutex); + for (const auto& it : properties) { + if (it.first == ov::force_tbb_terminate.name()) { + tbbTerminateFlag = it.second.as(); +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + if (tbbTerminateFlag) { + if (!tbbTaskScheduler) { +# if (TBB_INTERFACE_VERSION < 12000) + tbbTaskScheduler = std::make_shared(); +# elif (TBB_INTERFACE_VERSION < 12060) + tbbTaskScheduler = + std::make_shared(oneapi::tbb::task_scheduler_handle::get()); +# else + tbbTaskScheduler = std::make_shared(tbb::attach{}); +# endif + } + } else { + tbbTaskScheduler = nullptr; + } +#endif + } + } +} +ov::Any ExecutorManagerImpl::get_property(const std::string& name) const { + std::lock_guard guard(global_mutex); + if (name == ov::force_tbb_terminate.name()) { + return tbbTerminateFlag; + } + OPENVINO_UNREACHABLE("Property ", name, " is not supported."); +} + +void ExecutorManagerImpl::reset_tbb() { + std::lock_guard guard(global_mutex); + if (tbbTerminateFlag) { +#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + if (tbbTaskScheduler && tbbThreadsCreated) { +# if (TBB_INTERFACE_VERSION < 12000) + tbbTaskScheduler->terminate(); +# else + tbb::finalize(*tbbTaskScheduler, std::nothrow); +# endif + } + tbbThreadsCreated = false; + tbbTaskScheduler = nullptr; +#endif + tbbTerminateFlag = false; + } +} + +std::shared_ptr ExecutorManagerImpl::get_executor(const std::string& id) { + std::lock_guard guard(taskExecutorMutex); + auto foundEntry = executors.find(id); + if (foundEntry == executors.end()) { + auto newExec = std::make_shared(ov::threading::IStreamsExecutor::Config{id}); + tbbThreadsCreated = true; + executors[id] = newExec; + return newExec; + } + return foundEntry->second; +} + +std::shared_ptr ExecutorManagerImpl::get_idle_cpu_streams_executor( + const ov::threading::IStreamsExecutor::Config& config) { + std::lock_guard guard(streamExecutorMutex); + for (const auto& it : cpuStreamsExecutors) { + const auto& executor = it.second; + if (executor.use_count() != 1) + continue; + + const auto& executorConfig = it.first; + if (executorConfig._name == config._name && executorConfig._streams == config._streams && + executorConfig._threadsPerStream == config._threadsPerStream && + executorConfig._threadBindingType == config._threadBindingType && + executorConfig._threadBindingStep == config._threadBindingStep && + executorConfig._threadBindingOffset == config._threadBindingOffset) + if (executorConfig._threadBindingType != ov::threading::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE || + executorConfig._threadPreferredCoreType == config._threadPreferredCoreType) + return executor; + } + auto newExec = std::make_shared(config); + tbbThreadsCreated = true; + cpuStreamsExecutors.emplace_back(std::make_pair(config, newExec)); + return newExec; +} + +size_t ExecutorManagerImpl::get_executors_number() const { + std::lock_guard guard(taskExecutorMutex); + return executors.size(); +} + +size_t ExecutorManagerImpl::get_idle_cpu_streams_executors_number() const { + std::lock_guard guard(streamExecutorMutex); + return 
cpuStreamsExecutors.size(); +} + +void ExecutorManagerImpl::clear(const std::string& id) { + std::lock_guard stream_guard(streamExecutorMutex); + std::lock_guard task_guard(taskExecutorMutex); + if (id.empty()) { + executors.clear(); + cpuStreamsExecutors.clear(); + } else { + executors.erase(id); + cpuStreamsExecutors.erase( + std::remove_if(cpuStreamsExecutors.begin(), + cpuStreamsExecutors.end(), + [&](const std::pair>& it) { + return it.first._name == id; + }), + cpuStreamsExecutors.end()); + } +} + +namespace { + +class ExecutorManagerHolder { + std::mutex _mutex; + std::weak_ptr _manager; + +public: + ExecutorManagerHolder(const ExecutorManagerHolder&) = delete; + ExecutorManagerHolder& operator=(const ExecutorManagerHolder&) = delete; + + ExecutorManagerHolder() = default; + + std::shared_ptr get() { + std::lock_guard lock(_mutex); + auto manager = _manager.lock(); + if (!manager) { + _manager = manager = std::make_shared(); + } + return manager; + } +}; + +} // namespace + +std::shared_ptr executor_manager() { + static ExecutorManagerHolder executorManagerHolder; + return executorManagerHolder.get(); +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/dev/threading/istreams_executor.cpp b/src/inference/src/dev/threading/istreams_executor.cpp new file mode 100644 index 00000000000000..d96163a2739675 --- /dev/null +++ b/src/inference/src/dev/threading/istreams_executor.cpp @@ -0,0 +1,496 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/threading/istreams_executor.hpp" + +#include +#include +#include +#include + +#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" +#include "ie_plugin_config.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/system_conf.hpp" +#include "openvino/util/log.hpp" +#include "threading/ie_parallel_custom_arena.hpp" + +namespace ov { +namespace threading { + +IStreamsExecutor::~IStreamsExecutor() {} + +void IStreamsExecutor::Config::set_property(const std::string& key, const ov::Any& value) { + set_property({{key, value}}); +} + +void IStreamsExecutor::Config::set_property(const ov::AnyMap& property) { + for (const auto& it : property) { + const auto& key = it.first; + const auto value = it.second; + if (key == CONFIG_KEY(CPU_BIND_THREAD)) { + if (value.as() == CONFIG_VALUE(YES) || value.as() == CONFIG_VALUE(NUMA)) { +#if (defined(__APPLE__) || defined(_WIN32)) + _threadBindingType = IStreamsExecutor::ThreadBindingType::NUMA; +#else + _threadBindingType = (value.as() == CONFIG_VALUE(YES)) + ? IStreamsExecutor::ThreadBindingType::CORES + : IStreamsExecutor::ThreadBindingType::NUMA; +#endif + } else if (value.as() == CONFIG_VALUE(HYBRID_AWARE)) { + _threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE; + } else if (value.as() == CONFIG_VALUE(NO)) { + _threadBindingType = IStreamsExecutor::ThreadBindingType::NONE; + } else { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_BIND_THREAD) + << ". 
Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes) / " + "HYBRID_AWARE (let the runtime recognize and use the hybrid cores)"; + } + } else if (key == ov::affinity) { + ov::Affinity affinity; + std::stringstream{value.as()} >> affinity; + switch (affinity) { + case ov::Affinity::NONE: + _threadBindingType = ThreadBindingType::NONE; + break; + case ov::Affinity::CORE: { +#if (defined(__APPLE__) || defined(_WIN32)) + _threadBindingType = ThreadBindingType::NUMA; +#else + _threadBindingType = ThreadBindingType::CORES; +#endif + } break; + case ov::Affinity::NUMA: + _threadBindingType = ThreadBindingType::NUMA; + break; + case ov::Affinity::HYBRID_AWARE: + _threadBindingType = ThreadBindingType::HYBRID_AWARE; + break; + default: + OPENVINO_UNREACHABLE("Unsupported affinity type"); + } + } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { + if (value.as() == CONFIG_VALUE(CPU_THROUGHPUT_NUMA)) { + _streams = static_cast(get_available_numa_nodes().size()); + } else if (value.as() == CONFIG_VALUE(CPU_THROUGHPUT_AUTO)) { + // bare minimum of streams (that evenly divides available number of cores) + _streams = get_default_num_streams(); + } else { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS) + << ". Expected only positive numbers (#streams) or " + << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS) + << ". Expected only positive numbers (#streams)"; + } + _streams = val_i; + } + } else if (key == ov::num_streams) { + auto streams = value.as(); + if (streams == ov::streams::NUMA) { + _streams = static_cast(get_available_numa_nodes().size()); + } else if (streams == ov::streams::AUTO) { + // bare minimum of streams (that evenly divides available number of cores) + _streams = get_default_num_streams(); + } else if (streams.num >= 0) { + _streams = streams.num; + } else { + OPENVINO_UNREACHABLE("Wrong value for property key ", + ov::num_streams.name(), + ". Expected non negative numbers (#streams) or ", + "ov::streams::NUMA|ov::streams::AUTO, Got: ", + streams); + } + } else if (key == CONFIG_KEY(CPU_THREADS_NUM) || key == ov::inference_num_threads) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM) + << ". Expected only positive numbers (#threads)"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM) + << ". Expected only positive numbers (#threads)"; + } + _threads = val_i; + } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM) + << ". Expected only non negative numbers (#threads)"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM) + << ". Expected only non negative numbers (#threads)"; + } + _threadsPerStream = val_i; + } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS) + << ". 
Expected only non negative numbers (#streams)"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS) + << ". Expected only non negative numbers (#streams)"; + } + _big_core_streams = val_i; + } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS) + << ". Expected only non negative numbers (#streams)"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS) + << ". Expected only non negative numbers (#streams)"; + } + _small_core_streams = val_i; + } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG) + << ". Expected only non negative numbers (#threads)"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG) + << ". Expected only non negative numbers (#threads)"; + } + _threads_per_stream_big = val_i; + } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL) + << ". Expected only non negative numbers (#threads)"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL) + << ". Expected only non negative numbers (#threads)"; + } + _threads_per_stream_small = val_i; + } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) { + int val_i; + try { + val_i = value.as(); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET) + << ". Expected only non negative numbers"; + } + if (val_i < 0) { + IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET) + << ". 
Expected only non negative numbers"; + } + _small_core_offset = val_i; + } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) { + if (value.as() == CONFIG_VALUE(YES)) { + _enable_hyper_thread = true; + } else if (value.as() == CONFIG_VALUE(NO)) { + _enable_hyper_thread = false; + } else { + OPENVINO_UNREACHABLE("Unsupported enable hyper thread type"); + } + } else { + IE_THROW() << "Wrong value for property key " << key; + } + } +} + +ov::Any IStreamsExecutor::Config::get_property(const std::string& key) const { + if (key == ov::supported_properties) { + std::vector properties{ + CONFIG_KEY(CPU_THROUGHPUT_STREAMS), + CONFIG_KEY(CPU_BIND_THREAD), + CONFIG_KEY(CPU_THREADS_NUM), + CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM), + CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS), + CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS), + CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG), + CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL), + CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET), + CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD), + ov::num_streams.name(), + ov::inference_num_threads.name(), + ov::affinity.name(), + }; + return properties; + } else if (key == ov::affinity) { + switch (_threadBindingType) { + case IStreamsExecutor::ThreadBindingType::NONE: + return ov::Affinity::NONE; + case IStreamsExecutor::ThreadBindingType::CORES: + return ov::Affinity::CORE; + case IStreamsExecutor::ThreadBindingType::NUMA: + return ov::Affinity::NUMA; + case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: + return ov::Affinity::HYBRID_AWARE; + } + } else if (key == CONFIG_KEY(CPU_BIND_THREAD)) { + switch (_threadBindingType) { + case IStreamsExecutor::ThreadBindingType::NONE: + return {CONFIG_VALUE(NO)}; + case IStreamsExecutor::ThreadBindingType::CORES: + return {CONFIG_VALUE(YES)}; + case IStreamsExecutor::ThreadBindingType::NUMA: + return {CONFIG_VALUE(NUMA)}; + case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: + return {CONFIG_VALUE(HYBRID_AWARE)}; + } + } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { + return {std::to_string(_streams)}; + } else if (key == ov::num_streams) { + return decltype(ov::num_streams)::value_type{_streams}; + } else if (key == CONFIG_KEY(CPU_THREADS_NUM)) { + return {std::to_string(_threads)}; + } else if (key == ov::inference_num_threads) { + return decltype(ov::inference_num_threads)::value_type{_threads}; + } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { + return {std::to_string(_threadsPerStream)}; + } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) { + return {std::to_string(_big_core_streams)}; + } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) { + return {std::to_string(_small_core_streams)}; + } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) { + return {std::to_string(_threads_per_stream_big)}; + } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) { + return {std::to_string(_threads_per_stream_small)}; + } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) { + return {std::to_string(_small_core_offset)}; + } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) { + return {_enable_hyper_thread ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO)}; + } else { + OPENVINO_UNREACHABLE("Wrong value for property key ", key); + } + return {}; +} + +int IStreamsExecutor::Config::get_default_num_streams(const bool enable_hyper_thread) { + const int sockets = static_cast(get_available_numa_nodes().size()); + // bare minimum of streams (that evenly divides available number of core) + const int num_cores = sockets == 1 ? (enable_hyper_thread ? 
parallel_get_max_threads() : get_number_of_cpu_cores()) + : get_number_of_cpu_cores(); + if (0 == num_cores % 4) + return std::max(4, num_cores / 4); + else if (0 == num_cores % 5) + return std::max(5, num_cores / 5); + else if (0 == num_cores % 3) + return std::max(3, num_cores / 3); + else // if the user disables some cores (e.g. in BIOS), we may get an odd #cores that is not easy to divide + return 1; +} + +int IStreamsExecutor::Config::get_hybrid_num_streams(std::map<std::string, std::string>& config, + const int stream_mode) { + const int num_cores = parallel_get_max_threads(); + const int num_cores_phy = get_number_of_cpu_cores(); + const int num_big_cores_phy = get_number_of_cpu_cores(true); + const int num_small_cores = num_cores_phy - num_big_cores_phy; + const int num_big_cores = num_cores > num_cores_phy ? num_big_cores_phy * 2 : num_big_cores_phy; + int big_core_streams = 0; + int small_core_streams = 0; + int threads_per_stream_big = 0; + int threads_per_stream_small = 0; + + if (stream_mode == DEFAULT) { + // bare minimum of streams (that evenly divides the available number of cores) + if (0 == num_big_cores_phy % 4) { + threads_per_stream_big = 4; + } else if (0 == num_big_cores_phy % 5) { + threads_per_stream_big = 5; + } else if (0 == num_big_cores_phy % 3) { + threads_per_stream_big = 3; + } else { // if the user disables some cores (e.g. in BIOS), we may get an odd #cores that is not easy to divide + threads_per_stream_big = num_big_cores_phy; + } + + big_core_streams = num_big_cores / threads_per_stream_big; + threads_per_stream_small = threads_per_stream_big; + if (num_small_cores == 0) { + threads_per_stream_small = 0; + } else if (num_small_cores < threads_per_stream_small) { + small_core_streams = 1; + threads_per_stream_small = num_small_cores; + threads_per_stream_big = threads_per_stream_small; + // Balance the work across physical and logical cores: the number of threads on the physical and + // logical cores should be equal + big_core_streams = num_big_cores_phy / threads_per_stream_big * 2; + } else { + small_core_streams = num_small_cores / threads_per_stream_small; + } + } else if (stream_mode == AGGRESSIVE) { + big_core_streams = num_big_cores; + small_core_streams = num_small_cores; + threads_per_stream_big = num_big_cores / big_core_streams; + threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / small_core_streams; + } else if (stream_mode == LESSAGGRESSIVE) { + big_core_streams = num_big_cores / 2; + small_core_streams = num_small_cores / 2; + threads_per_stream_big = num_big_cores / big_core_streams; + threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / small_core_streams; + } else { + IE_THROW() << "Wrong stream mode to get num of streams: " << stream_mode; + } + config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(big_core_streams); + config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(small_core_streams); + config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(threads_per_stream_big); + config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(threads_per_stream_small); + // This is the default setting for CPUs where the P-cores are enumerated before the E-cores. + config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(num_small_cores == 0 ? 
0 : num_big_cores); + return big_core_streams + small_core_streams; +} + +void IStreamsExecutor::Config::update_hybrid_custom_threads(Config& config) { + const auto num_cores = parallel_get_max_threads(); + const auto num_cores_phys = get_number_of_cpu_cores(); + const auto num_big_cores_phys = get_number_of_cpu_cores(true); + const auto num_big_cores = num_cores > num_cores_phys ? num_big_cores_phys * 2 : num_big_cores_phys; + const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys; + const auto threads = config._threads ? config._threads : num_cores; + const auto streams = config._streams > 0 ? config._streams : 1; + + config._small_core_offset = num_big_cores; + int threads_per_stream = std::max(1, threads / streams); + + if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) { + config._big_core_streams = streams; + config._threads_per_stream_big = threads_per_stream; + config._small_core_streams = 0; + config._threads_per_stream_small = 0; + } else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) { + config._big_core_streams = 0; + config._threads_per_stream_big = 0; + config._small_core_streams = streams; + config._threads_per_stream_small = threads_per_stream; + } else { + const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream); + const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream); + + threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small); + while (threads_per_stream > 1) { + const int base_big_streams = num_big_cores_phys / threads_per_stream; + const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0; + if (base_big_streams + base_small_streams >= streams) { + config._big_core_streams = base_big_streams; + config._small_core_streams = streams - base_big_streams; + break; + } else if (base_big_streams * 2 + base_small_streams >= streams) { + config._big_core_streams = streams - base_small_streams; + config._small_core_streams = base_small_streams; + break; + } else { + threads_per_stream = threads_per_stream > 1 ? 
threads_per_stream - 1 : 1; + } + } + + if (threads_per_stream == 1) { + const int stream_loops = streams / num_cores; + const int remain_streams = streams - stream_loops * num_cores; + if (num_big_cores_phys >= remain_streams) { + config._big_core_streams = remain_streams + num_big_cores * stream_loops; + config._small_core_streams = num_small_cores_phys * stream_loops; + } else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) { + config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops; + config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops; + } else { + config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops; + config._small_core_streams = num_small_cores_phys * (stream_loops + 1); + } + } + + config._threads_per_stream_big = threads_per_stream; + config._threads_per_stream_small = threads_per_stream; + } +} + +IStreamsExecutor::Config IStreamsExecutor::Config::make_default_multi_threaded(const IStreamsExecutor::Config& initial, + const bool fp_intesive) { + const auto envThreads = parallel_get_env_threads(); + const auto& numaNodes = get_available_numa_nodes(); + const int numaNodesNum = static_cast(numaNodes.size()); + auto streamExecutorConfig = initial; + const bool bLatencyCase = streamExecutorConfig._streams <= numaNodesNum; + + // by default, do not use the hyper-threading (to minimize threads synch overheads) + int num_cores_default = get_number_of_cpu_cores(); +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + // additional latency-case logic for hybrid processors: + if (ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) { + const auto core_types = custom::info::core_types(); + const auto num_little_cores = + custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.front())); + const auto num_big_cores_phys = get_number_of_cpu_cores(true); + const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; + const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; + // by default the latency case uses (faster) Big cores only, depending on the compute ratio + const bool bLatencyCaseBigOnly = + num_big_cores_phys > (num_little_cores / (fp_intesive ? fp32_threshold : int8_threshold)); + // selecting the preferred core type + streamExecutorConfig._threadPreferredCoreType = + bLatencyCase ? (bLatencyCaseBigOnly ? IStreamsExecutor::Config::PreferredCoreType::BIG + : IStreamsExecutor::Config::PreferredCoreType::ANY) + : IStreamsExecutor::Config::PreferredCoreType::ROUND_ROBIN; + // additionally selecting the #cores to use in the "Big-only" case + if (bLatencyCaseBigOnly) { + const int hyper_threading_threshold = + 2; // min #cores, for which the hyper-threading becomes useful for the latency case + const auto num_big_cores = + custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back())); + num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? 
num_big_cores : num_big_cores_phys; + } + // if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here + if (!bLatencyCase && (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads)) { + update_hybrid_custom_threads(streamExecutorConfig); + } + OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "(" + << streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams + + streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams + << ") -- PCore: " << streamExecutorConfig._big_core_streams << "(" + << streamExecutorConfig._threads_per_stream_big + << ") ECore: " << streamExecutorConfig._small_core_streams << "(" + << streamExecutorConfig._threads_per_stream_small << ")"; + } +#endif + const auto hwCores = + !bLatencyCase && numaNodesNum == 1 + // throughput case on a single-NUMA node machine uses all available cores + ? (streamExecutorConfig._enable_hyper_thread ? parallel_get_max_threads() : num_cores_default) + // in the rest of cases: + // multi-node machine + // or + // latency case, single-node yet hybrid case that uses + // all core types + // or + // big-cores only, but the #cores is "enough" (pls see the logic above) + // it is usually beneficial not to use the hyper-threading (which is default) + : num_cores_default; + const auto threads = + streamExecutorConfig._threads ? streamExecutorConfig._threads : (envThreads ? envThreads : hwCores); + streamExecutorConfig._threadsPerStream = + streamExecutorConfig._streams ? std::max(1, threads / streamExecutorConfig._streams) : threads; + streamExecutorConfig._threads = + (!bLatencyCase && ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) + ? streamExecutorConfig._big_core_streams * streamExecutorConfig._threads_per_stream_big + + streamExecutorConfig._small_core_streams * streamExecutorConfig._threads_per_stream_small + : streamExecutorConfig._threadsPerStream * streamExecutorConfig._streams; + return streamExecutorConfig; +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/dev/threading/itask_executor.cpp b/src/inference/src/dev/threading/itask_executor.cpp new file mode 100644 index 00000000000000..7701df3d2b4113 --- /dev/null +++ b/src/inference/src/dev/threading/itask_executor.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/threading/itask_executor.hpp" + +#include +#include +#include +#include + +namespace ov { +namespace threading { + +void ITaskExecutor::run_and_wait(const std::vector& tasks) { + std::vector> packagedTasks; + std::vector> futures; + for (std::size_t i = 0; i < tasks.size(); ++i) { + packagedTasks.emplace_back([&tasks, i] { + tasks[i](); + }); + futures.emplace_back(packagedTasks.back().get_future()); + } + for (std::size_t i = 0; i < tasks.size(); ++i) { + run([&packagedTasks, i] { + packagedTasks[i](); + }); + } + // std::future::get will rethrow exception from task. + // We should wait all tasks before any exception is thrown. 
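+    // (Illustrative only - a minimal, hedged usage sketch with a hypothetical executor instance and task names:
+    //      std::shared_ptr<ov::threading::ITaskExecutor> executor = ...;
+    //      executor->run_and_wait({[] { preprocess(); }, [] { infer(); }});
+    //  if preprocess() throws, the exception surfaces from get() below only after infer() has also finished.)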
+ // So wait() and get() for each future moved to separate loops + for (auto&& future : futures) { + future.wait(); + } + for (auto&& future : futures) { + future.get(); + } +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp index d822b631e9c92e..ec56b4897d5fcb 100644 --- a/src/inference/src/os/lin/lin_system_conf.cpp +++ b/src/inference/src/os/lin/lin_system_conf.cpp @@ -18,7 +18,7 @@ #include "streams_executor.hpp" #include "threading/ie_parallel_custom_arena.hpp" -namespace InferenceEngine { +namespace ov { struct CPU { int _processors = 0; @@ -243,13 +243,13 @@ void parse_processor_info_linux(const int _processors, }; #if !((IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { +std::vector get_available_numa_nodes() { std::vector nodes((0 == cpu._sockets) ? 1 : cpu._sockets); std::iota(std::begin(nodes), std::end(nodes), 0); return nodes; } #endif -int getNumberOfCPUCores(bool bigCoresOnly) { +int get_number_of_cpu_cores(bool bigCoresOnly) { unsigned numberOfProcessors = cpu._processors; unsigned totalNumberOfCpuCores = cpu._cores; IE_ASSERT(totalNumberOfCpuCores != 0); @@ -280,4 +280,4 @@ int getNumberOfCPUCores(bool bigCoresOnly) { return phys_cores; } -} // namespace InferenceEngine +} // namespace ov diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp index e89666edf7ac54..e4d7df0166730a 100644 --- a/src/inference/src/os/win/win_system_conf.cpp +++ b/src/inference/src/os/win/win_system_conf.cpp @@ -3,7 +3,7 @@ // #ifndef NOMINMAX -# define NOMINMAX +# define NOMINMAX #endif #include @@ -11,11 +11,11 @@ #include #include -#include "ie_system_conf.h" +#include "openvino/runtime/system_conf.hpp" #include "streams_executor.hpp" #include "threading/ie_parallel_custom_arena.hpp" -namespace InferenceEngine { +namespace ov { struct CPU { int _processors = 0; @@ -168,7 +168,7 @@ void parse_processor_info_win(const char* base_ptr, } } -int getNumberOfCPUCores(bool bigCoresOnly) { +int get_number_of_cpu_cores(bool bigCoresOnly) { const int fallback_val = parallel_get_max_threads(); DWORD sz = 0; // querying the size of the resulting structure, passing the nullptr for the buffer @@ -178,7 +178,8 @@ int getNumberOfCPUCores(bool bigCoresOnly) { std::unique_ptr ptr(new uint8_t[sz]); if (!GetLogicalProcessorInformationEx(RelationProcessorCore, - reinterpret_cast(ptr.get()), &sz)) + reinterpret_cast(ptr.get()), + &sz)) return fallback_val; int phys_cores = 0; @@ -188,20 +189,21 @@ int getNumberOfCPUCores(bool bigCoresOnly) { phys_cores++; } while (offset < sz); - #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) +#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) auto core_types = custom::info::core_types(); if (bigCoresOnly && core_types.size() > 1) /*Hybrid CPU*/ { - phys_cores = custom::info::default_concurrency(custom::task_arena::constraints{} - .set_core_type(core_types.back()) - .set_max_threads_per_core(1)); + phys_cores = custom::info::default_concurrency( + custom::task_arena::constraints{}.set_core_type(core_types.back()).set_max_threads_per_core(1)); } - #endif +#endif return phys_cores; } #if !(IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) // OMP/SEQ threading on the Windows doesn't support NUMA -std::vector getAvailableNUMANodes() { return {-1}; } +std::vector get_available_numa_nodes() { + return {-1}; +} #endif -} // namespace 
InferenceEngine +} // namespace ov diff --git a/src/inference/src/streams_executor.hpp b/src/inference/src/streams_executor.hpp index 769c4ec73cd034..4bea102dbceb63 100644 --- a/src/inference/src/streams_executor.hpp +++ b/src/inference/src/streams_executor.hpp @@ -11,7 +11,7 @@ #include #include -namespace InferenceEngine { +namespace ov { #ifdef __linux__ /** @@ -55,4 +55,4 @@ void parse_processor_info_win(const char* base_ptr, std::vector>& _cpu_mapping_table); #endif -} // namespace InferenceEngine \ No newline at end of file +} // namespace ov diff --git a/src/inference/src/ie_system_conf.cpp b/src/inference/src/system_conf.cpp similarity index 90% rename from src/inference/src/ie_system_conf.cpp rename to src/inference/src/system_conf.cpp index 761fdda4dd54e3..da212d4a62950c 100644 --- a/src/inference/src/ie_system_conf.cpp +++ b/src/inference/src/system_conf.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ie_system_conf.h" +#include "openvino/runtime/system_conf.hpp" #include #include @@ -15,7 +15,7 @@ #define XBYAK_UNDEF_JNL #include -namespace InferenceEngine { +namespace ov { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) @@ -102,7 +102,7 @@ bool with_cpu_x86_avx512_core_amx() { #endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 -bool checkOpenMpEnvVars(bool includeOMPNumThreads) { +bool check_open_mp_env_vars(bool include_omp_num_threads) { for (auto&& var : {"GOMP_CPU_AFFINITY", "GOMP_DEBUG" "GOMP_RTEMS_THREAD_POOLS", @@ -134,7 +134,7 @@ bool checkOpenMpEnvVars(bool includeOMPNumThreads) { "PHI_KMP_PLACE_THREADS" "PHI_OMP_NUM_THREADS"}) { if (getenv(var)) { - if (0 != strcmp(var, "OMP_NUM_THREADS") || includeOMPNumThreads) + if (0 != strcmp(var, "OMP_NUM_THREADS") || include_omp_num_threads) return true; } } @@ -144,19 +144,19 @@ bool checkOpenMpEnvVars(bool includeOMPNumThreads) { #if defined(__APPLE__) || defined(__EMSCRIPTEN__) // for Linux and Windows the getNumberOfCPUCores (that accounts only for physical cores) implementation is OS-specific // (see cpp files in corresponding folders), for __APPLE__ it is default : -int getNumberOfCPUCores(bool) { +int get_number_of_cpu_cores(bool) { return parallel_get_max_threads(); } # if !((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { +std::vector get_available_numa_nodes() { return {-1}; } # endif -int getNumberOfLogicalCPUCores(bool) { +int get_number_of_logical_cpu_cores(bool) { return parallel_get_max_threads(); } #else -int getNumberOfLogicalCPUCores(bool bigCoresOnly) { +int get_number_of_logical_cpu_cores(bool bigCoresOnly) { int logical_cores = parallel_get_max_threads(); # if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) auto core_types = custom::info::core_types(); @@ -170,18 +170,18 @@ int getNumberOfLogicalCPUCores(bool bigCoresOnly) { #endif #if ((IE_THREAD == IE_THREAD_TBB) || (IE_THREAD == IE_THREAD_TBB_AUTO)) -std::vector getAvailableNUMANodes() { +std::vector get_available_numa_nodes() { return custom::info::numa_nodes(); } // this is impl only with the TBB -std::vector getAvailableCoresTypes() { +std::vector get_available_cores_types() { return custom::info::core_types(); } #else // as the core types support exists only with the TBB, the fallback is same for any other threading API -std::vector getAvailableCoresTypes() { +std::vector get_available_cores_types() { return {-1}; } #endif -} // namespace InferenceEngine +} // namespace ov diff --git 
a/src/inference/src/threading/ie_cpu_streams_executor.cpp b/src/inference/src/threading/ie_cpu_streams_executor.cpp index 2e786599a74bf2..37f690ec473c63 100644 --- a/src/inference/src/threading/ie_cpu_streams_executor.cpp +++ b/src/inference/src/threading/ie_cpu_streams_executor.cpp @@ -194,7 +194,7 @@ struct CPUStreamsExecutor::Impl { } #elif IE_THREAD == IE_THREAD_SEQ if (ThreadBindingType::NUMA == _impl->_config._threadBindingType) { - PinCurrentThreadToSocket(_numaNodeId); + InferenceEngine::PinCurrentThreadToSocket(_numaNodeId); } else if (ThreadBindingType::CORES == _impl->_config._threadBindingType) { CpuSet processMask; int ncpus = 0; @@ -368,7 +368,7 @@ int CPUStreamsExecutor::GetNumaNodeId() { return stream->_numaNodeId; } -CPUStreamsExecutor::CPUStreamsExecutor(const IStreamsExecutor::Config& config) : _impl{new Impl{config}} {} +CPUStreamsExecutor::CPUStreamsExecutor(const Config& config) : _impl{new Impl{config}} {} CPUStreamsExecutor::~CPUStreamsExecutor() { { diff --git a/src/inference/src/threading/ie_executor_manager.cpp b/src/inference/src/threading/ie_executor_manager.cpp index 6e52117976d88d..82a1e126ae5dae 100644 --- a/src/inference/src/threading/ie_executor_manager.cpp +++ b/src/inference/src/threading/ie_executor_manager.cpp @@ -5,7 +5,12 @@ #include "threading/ie_executor_manager.hpp" #include "ie_parallel.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" #include "threading/ie_cpu_streams_executor.hpp" +#include "threading/ie_itask_executor.hpp" #if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO # if (TBB_INTERFACE_VERSION < 12000) # include @@ -23,7 +28,7 @@ namespace InferenceEngine { namespace { class ExecutorManagerImpl : public ExecutorManager { public: - ~ExecutorManagerImpl(); + ExecutorManagerImpl(const std::shared_ptr& manager); ITaskExecutor::Ptr getExecutor(const std::string& id) override; IStreamsExecutor::Ptr getIdleCPUStreamsExecutor(const IStreamsExecutor::Config& config) override; size_t getExecutorsNumber() const override; @@ -33,134 +38,87 @@ class ExecutorManagerImpl : public ExecutorManager { bool getTbbFlag() override; private: - void resetTbb(); - std::unordered_map executors; - std::vector> cpuStreamsExecutors; - mutable std::mutex streamExecutorMutex; - mutable std::mutex taskExecutorMutex; - bool tbbTerminateFlag = false; - mutable std::mutex tbbMutex; - bool tbbThreadsCreated = false; -#if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO -# if (TBB_INTERFACE_VERSION < 12000) - std::shared_ptr tbbTaskScheduler = nullptr; -# else - std::shared_ptr tbbTaskScheduler = nullptr; -# endif -#endif + std::shared_ptr m_manager; + std::shared_ptr get_ov_manager() const override { + return m_manager; + } +}; + +class TaskExecutorWrapper : public ITaskExecutor { + std::shared_ptr m_executor; + +public: + TaskExecutorWrapper(const std::shared_ptr& executor) : m_executor(executor) {} + void run(Task task) override { + m_executor->run(task); + } + + void runAndWait(const std::vector& tasks) override { + m_executor->run_and_wait(tasks); + } +}; + +class StreamsExecutorWrapper : public IStreamsExecutor { + std::shared_ptr m_executor; + +public: + StreamsExecutorWrapper(const std::shared_ptr& executor) : m_executor(executor) {} + void run(Task task) override { + m_executor->run(task); + } + + void runAndWait(const std::vector& tasks) override { + 
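+        // The legacy InferenceEngine camelCase entry points are thin adapters: they forward to the
+        // snake_case API of the wrapped ov::threading executor, so the old and new interfaces share
+        // a single underlying implementation.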
m_executor->run_and_wait(tasks); + } + int GetStreamId() override { + return m_executor->get_stream_id(); + } + + int GetNumaNodeId() override { + return m_executor->get_numa_node_id(); + } + + void Execute(Task task) override { + m_executor->execute(task); + } }; } // namespace -ExecutorManagerImpl::~ExecutorManagerImpl() { - resetTbb(); -} +ExecutorManagerImpl::ExecutorManagerImpl(const std::shared_ptr& manager) + : m_manager(manager) {} void ExecutorManagerImpl::setTbbFlag(bool flag) { - std::lock_guard guard(tbbMutex); - tbbTerminateFlag = flag; -#if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO - if (tbbTerminateFlag) { - if (!tbbTaskScheduler) { -# if (TBB_INTERFACE_VERSION < 12000) - tbbTaskScheduler = std::make_shared(); -# elif (TBB_INTERFACE_VERSION < 12060) - tbbTaskScheduler = - std::make_shared(oneapi::tbb::task_scheduler_handle::get()); -# else - tbbTaskScheduler = std::make_shared(tbb::attach{}); -# endif - } - } else { - tbbTaskScheduler = nullptr; - } -#endif + m_manager->set_property({{ov::force_tbb_terminate.name(), flag}}); } bool ExecutorManagerImpl::getTbbFlag() { - std::lock_guard guard(tbbMutex); - return tbbTerminateFlag; -} - -void ExecutorManagerImpl::resetTbb() { - std::lock_guard guard(tbbMutex); - if (tbbTerminateFlag) { -#if IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO - if (tbbTaskScheduler && tbbThreadsCreated) { -# if (TBB_INTERFACE_VERSION < 12000) - tbbTaskScheduler->terminate(); -# else - tbb::finalize(*tbbTaskScheduler, std::nothrow); -# endif - } - tbbThreadsCreated = false; - tbbTaskScheduler = nullptr; -#endif - tbbTerminateFlag = false; - } + return m_manager->get_property(ov::force_tbb_terminate.name()).as(); } ITaskExecutor::Ptr ExecutorManagerImpl::getExecutor(const std::string& id) { - std::lock_guard guard(taskExecutorMutex); - auto foundEntry = executors.find(id); - if (foundEntry == executors.end()) { - auto newExec = std::make_shared(IStreamsExecutor::Config{id}); - tbbThreadsCreated = true; - executors[id] = newExec; - return newExec; - } - return foundEntry->second; + return std::make_shared(m_manager->get_executor(id)); } IStreamsExecutor::Ptr ExecutorManagerImpl::getIdleCPUStreamsExecutor(const IStreamsExecutor::Config& config) { - std::lock_guard guard(streamExecutorMutex); - for (const auto& it : cpuStreamsExecutors) { - const auto& executor = it.second; - if (executor.use_count() != 1) - continue; - - const auto& executorConfig = it.first; - if (executorConfig._name == config._name && executorConfig._streams == config._streams && - executorConfig._threadsPerStream == config._threadsPerStream && - executorConfig._threadBindingType == config._threadBindingType && - executorConfig._threadBindingStep == config._threadBindingStep && - executorConfig._threadBindingOffset == config._threadBindingOffset) - if (executorConfig._threadBindingType != IStreamsExecutor::ThreadBindingType::HYBRID_AWARE || - executorConfig._threadPreferredCoreType == config._threadPreferredCoreType) - return executor; - } - auto newExec = std::make_shared(config); - tbbThreadsCreated = true; - cpuStreamsExecutors.emplace_back(std::make_pair(config, newExec)); - return newExec; + return std::make_shared(m_manager->get_idle_cpu_streams_executor(config)); } size_t ExecutorManagerImpl::getExecutorsNumber() const { - std::lock_guard guard(taskExecutorMutex); - return executors.size(); + return m_manager->get_executors_number(); } size_t ExecutorManagerImpl::getIdleCPUStreamsExecutorsNumber() const { - std::lock_guard 
guard(streamExecutorMutex); - return cpuStreamsExecutors.size(); + return m_manager->get_idle_cpu_streams_executors_number(); } void ExecutorManagerImpl::clear(const std::string& id) { - std::lock_guard stream_guard(streamExecutorMutex); - std::lock_guard task_guard(taskExecutorMutex); - if (id.empty()) { - executors.clear(); - cpuStreamsExecutors.clear(); - } else { - executors.erase(id); - cpuStreamsExecutors.erase( - std::remove_if(cpuStreamsExecutors.begin(), - cpuStreamsExecutors.end(), - [&](const std::pair& it) { - return it.first._name == id; - }), - cpuStreamsExecutors.end()); - } + return m_manager->clear(id); +} + +std::shared_ptr create_old_manager( + const std::shared_ptr& manager) { + return std::make_shared(manager); } namespace { @@ -179,7 +137,7 @@ class ExecutorManagerHolder { std::lock_guard lock(_mutex); auto manager = _manager.lock(); if (!manager) { - _manager = manager = std::make_shared(); + _manager = manager = create_old_manager(ov::threading::executor_manager()); } return manager; } diff --git a/src/inference/src/threading/ie_istreams_executor.cpp b/src/inference/src/threading/ie_istreams_executor.cpp index 87529594c45ad6..e78cc8cb0fae4e 100644 --- a/src/inference/src/threading/ie_istreams_executor.cpp +++ b/src/inference/src/threading/ie_istreams_executor.cpp @@ -23,463 +23,31 @@ namespace InferenceEngine { IStreamsExecutor::~IStreamsExecutor() {} std::vector IStreamsExecutor::Config::SupportedKeys() const { - return { - CONFIG_KEY(CPU_THROUGHPUT_STREAMS), - CONFIG_KEY(CPU_BIND_THREAD), - CONFIG_KEY(CPU_THREADS_NUM), - CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM), - CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS), - CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS), - CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG), - CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL), - CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET), - CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD), - ov::num_streams.name(), - ov::inference_num_threads.name(), - ov::affinity.name(), - }; + return get_property(ov::supported_properties.name()).as>(); } int IStreamsExecutor::Config::GetDefaultNumStreams(const bool enable_hyper_thread) { - const int sockets = static_cast(getAvailableNUMANodes().size()); - // bare minimum of streams (that evenly divides available number of core) - const int num_cores = sockets == 1 ? (enable_hyper_thread ? parallel_get_max_threads() : getNumberOfCPUCores()) - : getNumberOfCPUCores(); - if (0 == num_cores % 4) - return std::max(4, num_cores / 4); - else if (0 == num_cores % 5) - return std::max(5, num_cores / 5); - else if (0 == num_cores % 3) - return std::max(3, num_cores / 3); - else // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide - return 1; + return get_default_num_streams(enable_hyper_thread); } int IStreamsExecutor::Config::GetHybridNumStreams(std::map& config, const int stream_mode) { - const int num_cores = parallel_get_max_threads(); - const int num_cores_phy = getNumberOfCPUCores(); - const int num_big_cores_phy = getNumberOfCPUCores(true); - const int num_small_cores = num_cores_phy - num_big_cores_phy; - const int num_big_cores = num_cores > num_cores_phy ? 
num_big_cores_phy * 2 : num_big_cores_phy; - int big_core_streams = 0; - int small_core_streams = 0; - int threads_per_stream_big = 0; - int threads_per_stream_small = 0; - - if (stream_mode == DEFAULT) { - // bare minimum of streams (that evenly divides available number of core) - if (0 == num_big_cores_phy % 4) { - threads_per_stream_big = 4; - } else if (0 == num_big_cores_phy % 5) { - threads_per_stream_big = 5; - } else if (0 == num_big_cores_phy % 3) { - threads_per_stream_big = 3; - } else { // if user disables some cores say in BIOS, so we got weird #cores which is not easy to divide - threads_per_stream_big = num_big_cores_phy; - } - - big_core_streams = num_big_cores / threads_per_stream_big; - threads_per_stream_small = threads_per_stream_big; - if (num_small_cores == 0) { - threads_per_stream_small = 0; - } else if (num_small_cores < threads_per_stream_small) { - small_core_streams = 1; - threads_per_stream_small = num_small_cores; - threads_per_stream_big = threads_per_stream_small; - // Balance the computation of physical core and logical core, the number of threads on the physical core and - // logical core should be equal - big_core_streams = num_big_cores_phy / threads_per_stream_big * 2; - } else { - small_core_streams = num_small_cores / threads_per_stream_small; - } - } else if (stream_mode == AGGRESSIVE) { - big_core_streams = num_big_cores; - small_core_streams = num_small_cores; - threads_per_stream_big = num_big_cores / big_core_streams; - threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / small_core_streams; - } else if (stream_mode == LESSAGGRESSIVE) { - big_core_streams = num_big_cores / 2; - small_core_streams = num_small_cores / 2; - threads_per_stream_big = num_big_cores / big_core_streams; - threads_per_stream_small = num_small_cores == 0 ? 0 : num_small_cores / small_core_streams; - } else { - IE_THROW() << "Wrong stream mode to get num of streams: " << stream_mode; - } - config[CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)] = std::to_string(big_core_streams); - config[CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)] = std::to_string(small_core_streams); - config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)] = std::to_string(threads_per_stream_big); - config[CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)] = std::to_string(threads_per_stream_small); - // This is default setting for specific CPU which Pcore is in front and Ecore is in the back. - config[CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)] = std::to_string(num_small_cores == 0 ? 0 : num_big_cores); - return big_core_streams + small_core_streams; + return get_hybrid_num_streams(config, stream_mode); } void IStreamsExecutor::Config::SetConfig(const std::string& key, const std::string& value) { - if (key == CONFIG_KEY(CPU_BIND_THREAD)) { - if (value == CONFIG_VALUE(YES) || value == CONFIG_VALUE(NUMA)) { -#if (defined(__APPLE__) || defined(_WIN32)) - _threadBindingType = IStreamsExecutor::ThreadBindingType::NUMA; -#else - _threadBindingType = (value == CONFIG_VALUE(YES)) ? IStreamsExecutor::ThreadBindingType::CORES - : IStreamsExecutor::ThreadBindingType::NUMA; -#endif - } else if (value == CONFIG_VALUE(HYBRID_AWARE)) { - _threadBindingType = IStreamsExecutor::ThreadBindingType::HYBRID_AWARE; - } else if (value == CONFIG_VALUE(NO)) { - _threadBindingType = IStreamsExecutor::ThreadBindingType::NONE; - } else { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_BIND_THREAD) - << ". 
Expected only YES(binds to cores) / NO(no binding) / NUMA(binds to NUMA nodes) / " - "HYBRID_AWARE (let the runtime recognize and use the hybrid cores)"; - } - } else if (key == ov::affinity) { - ov::Affinity affinity; - std::stringstream{value} >> affinity; - switch (affinity) { - case ov::Affinity::NONE: - _threadBindingType = ThreadBindingType::NONE; - break; - case ov::Affinity::CORE: { -#if (defined(__APPLE__) || defined(_WIN32)) - _threadBindingType = ThreadBindingType::NUMA; -#else - _threadBindingType = ThreadBindingType::CORES; -#endif - } break; - case ov::Affinity::NUMA: - _threadBindingType = ThreadBindingType::NUMA; - break; - case ov::Affinity::HYBRID_AWARE: - _threadBindingType = ThreadBindingType::HYBRID_AWARE; - break; - default: - OPENVINO_UNREACHABLE("Unsupported affinity type"); - } - } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { - if (value == CONFIG_VALUE(CPU_THROUGHPUT_NUMA)) { - _streams = static_cast(getAvailableNUMANodes().size()); - } else if (value == CONFIG_VALUE(CPU_THROUGHPUT_AUTO)) { - // bare minimum of streams (that evenly divides available number of cores) - _streams = GetDefaultNumStreams(); - } else { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS) - << ". Expected only positive numbers (#streams) or " - << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THROUGHPUT_STREAMS) - << ". Expected only positive numbers (#streams)"; - } - _streams = val_i; - } - } else if (key == ov::num_streams) { - auto streams = ov::util::from_string(value, ov::streams::num); - if (streams == ov::streams::NUMA) { - _streams = static_cast(getAvailableNUMANodes().size()); - } else if (streams == ov::streams::AUTO) { - // bare minimum of streams (that evenly divides available number of cores) - _streams = GetDefaultNumStreams(); - } else if (streams.num >= 0) { - _streams = streams.num; - } else { - OPENVINO_UNREACHABLE("Wrong value for property key ", - ov::num_streams.name(), - ". Expected non negative numbers (#streams) or ", - "ov::streams::NUMA|ov::streams::AUTO, Got: ", - streams); - } - } else if (key == CONFIG_KEY(CPU_THREADS_NUM) || key == ov::inference_num_threads) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM) - << ". Expected only positive numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY(CPU_THREADS_NUM) - << ". Expected only positive numbers (#threads)"; - } - _threads = val_i; - } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM) - << ". Expected only non negative numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for property key " << CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM) - << ". Expected only non negative numbers (#threads)"; - } - _threadsPerStream = val_i; - } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS) - << ". 
Expected only non negative numbers (#streams)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS) - << ". Expected only non negative numbers (#streams)"; - } - _big_core_streams = val_i; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS) - << ". Expected only non negative numbers (#streams)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS) - << ". Expected only non negative numbers (#streams)"; - } - _small_core_streams = val_i; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG) - << ". Expected only non negative numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG) - << ". Expected only non negative numbers (#threads)"; - } - _threads_per_stream_big = val_i; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL) - << ". Expected only non negative numbers (#threads)"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL) - << ". Expected only non negative numbers (#threads)"; - } - _threads_per_stream_small = val_i; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) { - int val_i; - try { - val_i = std::stoi(value); - } catch (const std::exception&) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET) - << ". Expected only non negative numbers"; - } - if (val_i < 0) { - IE_THROW() << "Wrong value for HYBRID_AWARE key " << CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET) - << ". 
Expected only non negative numbers"; - } - _small_core_offset = val_i; - } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) { - if (value == CONFIG_VALUE(YES)) { - _enable_hyper_thread = true; - } else if (value == CONFIG_VALUE(NO)) { - _enable_hyper_thread = false; - } else { - OPENVINO_UNREACHABLE("Unsupported enable hyper thread type"); - } - } else { - IE_THROW() << "Wrong value for property key " << key; - } + set_property(key, value); } Parameter IStreamsExecutor::Config::GetConfig(const std::string& key) const { - if (key == ov::affinity) { - switch (_threadBindingType) { - case IStreamsExecutor::ThreadBindingType::NONE: - return ov::Affinity::NONE; - case IStreamsExecutor::ThreadBindingType::CORES: - return ov::Affinity::CORE; - case IStreamsExecutor::ThreadBindingType::NUMA: - return ov::Affinity::NUMA; - case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: - return ov::Affinity::HYBRID_AWARE; - } - } else if (key == CONFIG_KEY(CPU_BIND_THREAD)) { - switch (_threadBindingType) { - case IStreamsExecutor::ThreadBindingType::NONE: - return {CONFIG_VALUE(NO)}; - case IStreamsExecutor::ThreadBindingType::CORES: - return {CONFIG_VALUE(YES)}; - case IStreamsExecutor::ThreadBindingType::NUMA: - return {CONFIG_VALUE(NUMA)}; - case IStreamsExecutor::ThreadBindingType::HYBRID_AWARE: - return {CONFIG_VALUE(HYBRID_AWARE)}; - } - } else if (key == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { - return {std::to_string(_streams)}; - } else if (key == ov::num_streams) { - return decltype(ov::num_streams)::value_type{_streams}; - } else if (key == CONFIG_KEY(CPU_THREADS_NUM)) { - return {std::to_string(_threads)}; - } else if (key == ov::inference_num_threads) { - return decltype(ov::inference_num_threads)::value_type{_threads}; - } else if (key == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { - return {std::to_string(_threadsPerStream)}; - } else if (key == CONFIG_KEY_INTERNAL(BIG_CORE_STREAMS)) { - return {std::to_string(_big_core_streams)}; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_STREAMS)) { - return {std::to_string(_small_core_streams)}; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_BIG)) { - return {std::to_string(_threads_per_stream_big)}; - } else if (key == CONFIG_KEY_INTERNAL(THREADS_PER_STREAM_SMALL)) { - return {std::to_string(_threads_per_stream_small)}; - } else if (key == CONFIG_KEY_INTERNAL(SMALL_CORE_OFFSET)) { - return {std::to_string(_small_core_offset)}; - } else if (key == CONFIG_KEY_INTERNAL(ENABLE_HYPER_THREAD)) { - return {_enable_hyper_thread ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO)}; - } else { - IE_THROW() << "Wrong value for property key " << key; - } - return {}; + return get_property(key); } void IStreamsExecutor::Config::UpdateHybridCustomThreads(Config& config) { - const auto num_cores = parallel_get_max_threads(); - const auto num_cores_phys = getNumberOfCPUCores(); - const auto num_big_cores_phys = getNumberOfCPUCores(true); - const auto num_big_cores = num_cores > num_cores_phys ? num_big_cores_phys * 2 : num_big_cores_phys; - const auto num_small_cores_phys = num_cores_phys - num_big_cores_phys; - const auto threads = config._threads ? config._threads : num_cores; - const auto streams = config._streams > 0 ? 
config._streams : 1; - - config._small_core_offset = num_big_cores; - int threads_per_stream = std::max(1, threads / streams); - - if ((num_big_cores_phys / threads_per_stream >= streams) && (1 < threads_per_stream)) { - config._big_core_streams = streams; - config._threads_per_stream_big = threads_per_stream; - config._small_core_streams = 0; - config._threads_per_stream_small = 0; - } else if ((num_small_cores_phys / threads_per_stream >= streams) && (num_big_cores_phys < threads_per_stream)) { - config._big_core_streams = 0; - config._threads_per_stream_big = 0; - config._small_core_streams = streams; - config._threads_per_stream_small = threads_per_stream; - } else { - const int threads_per_stream_big = std::min(num_big_cores_phys, threads_per_stream); - const int threads_per_stream_small = std::min(num_small_cores_phys, threads_per_stream); - - threads_per_stream = std::min(threads_per_stream_big, threads_per_stream_small); - while (threads_per_stream > 1) { - const int base_big_streams = num_big_cores_phys / threads_per_stream; - const int base_small_streams = num_small_cores_phys > 0 ? num_small_cores_phys / threads_per_stream : 0; - if (base_big_streams + base_small_streams >= streams) { - config._big_core_streams = base_big_streams; - config._small_core_streams = streams - base_big_streams; - break; - } else if (base_big_streams * 2 + base_small_streams >= streams) { - config._big_core_streams = streams - base_small_streams; - config._small_core_streams = base_small_streams; - break; - } else { - threads_per_stream = threads_per_stream > 1 ? threads_per_stream - 1 : 1; - } - } - - if (threads_per_stream == 1) { - const int stream_loops = streams / num_cores; - const int remain_streams = streams - stream_loops * num_cores; - if (num_big_cores_phys >= remain_streams) { - config._big_core_streams = remain_streams + num_big_cores * stream_loops; - config._small_core_streams = num_small_cores_phys * stream_loops; - } else if (num_big_cores_phys + num_small_cores_phys >= remain_streams) { - config._big_core_streams = num_big_cores_phys + num_big_cores * stream_loops; - config._small_core_streams = remain_streams - num_big_cores_phys + num_small_cores_phys * stream_loops; - } else { - config._big_core_streams = remain_streams - num_small_cores_phys + num_big_cores * stream_loops; - config._small_core_streams = num_small_cores_phys * (stream_loops + 1); - } - } - - config._threads_per_stream_big = threads_per_stream; - config._threads_per_stream_small = threads_per_stream; - } + return update_hybrid_custom_threads(config); } IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(const IStreamsExecutor::Config& initial, const bool fp_intesive) { - const auto envThreads = parallel_get_env_threads(); - const auto& numaNodes = getAvailableNUMANodes(); - const int numaNodesNum = static_cast(numaNodes.size()); - auto streamExecutorConfig = initial; - const bool bLatencyCase = streamExecutorConfig._streams <= numaNodesNum; - - // by default, do not use the hyper-threading (to minimize threads synch overheads) - int num_cores_default = getNumberOfCPUCores(); -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - // additional latency-case logic for hybrid processors: - if (ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) { - const auto core_types = custom::info::core_types(); - const auto num_little_cores = - custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.front())); - const auto 
num_big_cores_phys = getNumberOfCPUCores(true); - const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; - const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; - // by default the latency case uses (faster) Big cores only, depending on the compute ratio - const bool bLatencyCaseBigOnly = - num_big_cores_phys > (num_little_cores / (fp_intesive ? fp32_threshold : int8_threshold)); - // selecting the preferred core type - streamExecutorConfig._threadPreferredCoreType = - bLatencyCase ? (bLatencyCaseBigOnly ? IStreamsExecutor::Config::PreferredCoreType::BIG - : IStreamsExecutor::Config::PreferredCoreType::ANY) - : IStreamsExecutor::Config::PreferredCoreType::ROUND_ROBIN; - // additionally selecting the #cores to use in the "Big-only" case - if (bLatencyCaseBigOnly) { - const int hyper_threading_threshold = - 2; // min #cores, for which the hyper-threading becomes useful for the latency case - const auto num_big_cores = - custom::info::default_concurrency(custom::task_arena::constraints{}.set_core_type(core_types.back())); - num_cores_default = (num_big_cores_phys <= hyper_threading_threshold) ? num_big_cores : num_big_cores_phys; - } - // if nstreams or nthreads are set, need to calculate the Hybrid aware parameters here - if (!bLatencyCase && (streamExecutorConfig._big_core_streams == 0 || streamExecutorConfig._threads)) { - UpdateHybridCustomThreads(streamExecutorConfig); - } - OPENVINO_DEBUG << "[ p_e_core_info ] streams (threads): " << streamExecutorConfig._streams << "(" - << streamExecutorConfig._threads_per_stream_big * streamExecutorConfig._big_core_streams + - streamExecutorConfig._threads_per_stream_small * streamExecutorConfig._small_core_streams - << ") -- PCore: " << streamExecutorConfig._big_core_streams << "(" - << streamExecutorConfig._threads_per_stream_big - << ") ECore: " << streamExecutorConfig._small_core_streams << "(" - << streamExecutorConfig._threads_per_stream_small << ")"; - } -#endif - const auto hwCores = - !bLatencyCase && numaNodesNum == 1 - // throughput case on a single-NUMA node machine uses all available cores - ? (streamExecutorConfig._enable_hyper_thread ? parallel_get_max_threads() : num_cores_default) - // in the rest of cases: - // multi-node machine - // or - // latency case, single-node yet hybrid case that uses - // all core types - // or - // big-cores only, but the #cores is "enough" (pls see the logic above) - // it is usually beneficial not to use the hyper-threading (which is default) - : num_cores_default; - const auto threads = - streamExecutorConfig._threads ? streamExecutorConfig._threads : (envThreads ? envThreads : hwCores); - streamExecutorConfig._threadsPerStream = - streamExecutorConfig._streams ? std::max(1, threads / streamExecutorConfig._streams) : threads; - streamExecutorConfig._threads = - (!bLatencyCase && ThreadBindingType::HYBRID_AWARE == streamExecutorConfig._threadBindingType) - ? 
streamExecutorConfig._big_core_streams * streamExecutorConfig._threads_per_stream_big + - streamExecutorConfig._small_core_streams * streamExecutorConfig._threads_per_stream_small - : streamExecutorConfig._threadsPerStream * streamExecutorConfig._streams; - return streamExecutorConfig; + return make_default_multi_threaded(initial); } } // namespace InferenceEngine diff --git a/src/inference/src/threading/ie_itask_executor.cpp b/src/inference/src/threading/ie_itask_executor.cpp index f75279dfa449ab..8e6bf89f389981 100644 --- a/src/inference/src/threading/ie_itask_executor.cpp +++ b/src/inference/src/threading/ie_itask_executor.cpp @@ -12,27 +12,7 @@ namespace InferenceEngine { void ITaskExecutor::runAndWait(const std::vector& tasks) { - std::vector> packagedTasks; - std::vector> futures; - for (std::size_t i = 0; i < tasks.size(); ++i) { - packagedTasks.emplace_back([&tasks, i] { - tasks[i](); - }); - futures.emplace_back(packagedTasks.back().get_future()); - } - for (std::size_t i = 0; i < tasks.size(); ++i) { - run([&packagedTasks, i] { - packagedTasks[i](); - }); - } - // std::future::get will rethrow exception from task. - // We should wait all tasks before any exception is thrown. - // So wait() and get() for each future moved to separate loops - for (auto&& future : futures) { - future.wait(); - } - for (auto&& future : futures) { - future.get(); - } + run_and_wait(tasks); } + } // namespace InferenceEngine diff --git a/src/inference/tests/unit/cpu_map_parser.cpp b/src/inference/tests/unit/cpu_map_parser.cpp index d2693c87ff9983..20f8ace1862eb7 100644 --- a/src/inference/tests/unit/cpu_map_parser.cpp +++ b/src/inference/tests/unit/cpu_map_parser.cpp @@ -10,7 +10,7 @@ #include "streams_executor.hpp" using namespace testing; -using namespace InferenceEngine; +using namespace ov; namespace { @@ -36,12 +36,12 @@ class LinuxCpuMapParserTests : public CommonTestUtils::TestsCommon, std::vector> test_proc_type_table; std::vector> test_cpu_mapping_table; - InferenceEngine::parse_processor_info_linux(test_data._processors, - test_data.system_info_table, - test_sockets, - test_cores, - test_proc_type_table, - test_cpu_mapping_table); + ov::parse_processor_info_linux(test_data._processors, + test_data.system_info_table, + test_sockets, + test_cores, + test_proc_type_table, + test_cpu_mapping_table); ASSERT_EQ(test_data._sockets, test_sockets); ASSERT_EQ(test_data._cores, test_cores); @@ -629,13 +629,13 @@ class WinCpuMapParserTests : public CommonTestUtils::TestsCommon, std::vector> test_proc_type_table; std::vector> test_cpu_mapping_table; - parse_processor_info_win(test_info_ptr, - len, - test_data._processors, - test_sockets, - test_cores, - test_proc_type_table, - test_cpu_mapping_table); + ov::parse_processor_info_win(test_info_ptr, + len, + test_data._processors, + test_sockets, + test_cores, + test_proc_type_table, + test_cpu_mapping_table); ASSERT_EQ(test_data._sockets, test_sockets); ASSERT_EQ(test_data._cores, test_cores); diff --git a/src/inference/tests/unit/ie_executor_manager_tests.cpp b/src/inference/tests/unit/ie_executor_manager_tests.cpp index 42035ac2a5f389..a419777c4c1d60 100644 --- a/src/inference/tests/unit/ie_executor_manager_tests.cpp +++ b/src/inference/tests/unit/ie_executor_manager_tests.cpp @@ -4,36 +4,34 @@ #include -#include +#include "openvino/runtime/threading/executor_manager.hpp" using namespace ::testing; -using namespace std; -using namespace InferenceEngine; TEST(ExecutorManagerTests, canCreateSingleExecutorManager) { - auto executorManager1 = 
executorManager(); + auto executorManager1 = ov::threading::executor_manager(); - auto executorManager2 = executorManager(); + auto executorManager2 = ov::threading::executor_manager(); ASSERT_EQ(executorManager1, executorManager2); } TEST(ExecutorManagerTests, createDifferentExecutorsForDifferentDevices) { - auto executorMgr = executorManager(); - auto executor1 = executorMgr->getExecutor("CPU"); - auto executor2 = executorMgr->getExecutor("GPU"); + auto executorMgr = ov::threading::executor_manager(); + auto executor1 = executorMgr->get_executor("CPU"); + auto executor2 = executorMgr->get_executor("GPU"); ASSERT_NE(executor1, executor2); - ASSERT_EQ(2, executorMgr->getExecutorsNumber()); + ASSERT_EQ(2, executorMgr->get_executors_number()); } TEST(ExecutorManagerTests, returnTheSameExecutorForTheSameDevice) { - auto executorMgr = executorManager(); - auto executor1 = executorMgr->getExecutor("CPU"); - auto executor2 = executorMgr->getExecutor("GPU"); + auto executorMgr = ov::threading::executor_manager(); + auto executor1 = executorMgr->get_executor("CPU"); + auto executor2 = executorMgr->get_executor("GPU"); - auto executor = executorMgr->getExecutor("GPU"); + auto executor = executorMgr->get_executor("GPU"); ASSERT_EQ(executor, executor2); - ASSERT_EQ(2, executorMgr->getExecutorsNumber()); + ASSERT_EQ(2, executorMgr->get_executors_number()); } diff --git a/src/plugins/auto/CMakeLists.txt b/src/plugins/auto/CMakeLists.txt index fbca1d5c43f19b..ed24e998a5f421 100644 --- a/src/plugins/auto/CMakeLists.txt +++ b/src/plugins/auto/CMakeLists.txt @@ -38,6 +38,7 @@ endif() set_ie_threading_interface_for(${TARGET_NAME}) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/auto_batch/CMakeLists.txt b/src/plugins/auto_batch/CMakeLists.txt index 9b34bdcc2a405e..edd4e619b59e0b 100644 --- a/src/plugins/auto_batch/CMakeLists.txt +++ b/src/plugins/auto_batch/CMakeLists.txt @@ -20,6 +20,7 @@ ie_add_plugin(NAME ${TARGET_NAME} target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/hetero/CMakeLists.txt b/src/plugins/hetero/CMakeLists.txt index 17035a9a3e4052..da48fcc6e88bdd 100644 --- a/src/plugins/hetero/CMakeLists.txt +++ b/src/plugins/hetero/CMakeLists.txt @@ -24,6 +24,7 @@ ie_faster_build(${TARGET_NAME} target_link_libraries(${TARGET_NAME} PRIVATE openvino::pugixml) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index a54bd48b6332ca..65d8601d6e8258 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -78,6 +78,7 @@ cross_compiled_file(${TARGET_NAME} NAMESPACE InferenceEngine::Extensions::Cpu::XARCH ) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) # add test object library diff --git a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp index 
607a37478a4d5b..1eb82f84becef8 100644 --- a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp @@ -78,14 +78,6 @@ namespace ov { namespace intel_cpu { -void shape_inference(ov::Node* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map& constant_data) { - auto shapeInfer = make_shape_inference(op->shared_from_this()); - output_shapes = shapeInfer->infer(input_shapes, constant_data); -} - class entryBase : public IShapeInferCommon { public: using iface_type = IShapeInferCommon; diff --git a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp index 56f00c6460b256..9e307e6fc871ff 100644 --- a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp +++ b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.hpp @@ -13,11 +13,6 @@ namespace ov { namespace intel_cpu { -void shape_inference(ov::Node* op, - const std::vector& input_shapes, - std::vector& output_shapes, - const std::map& constant_data = {}); - class IShapeInferCommon { public: virtual std::vector infer(const std::vector& input_shapes, diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp index 441a029d21a0fd..9500ca8138f5cd 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/assign_shape_inference.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp index 1dfffe43b38484..311e43dc634bbf 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_cell_test.cpp @@ -2,14 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/augru_cell.hpp" - #include -#include -#include -#include -#include +#include "ov_ops/augru_cell.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp index 962bd6402c20fd..55cb4958110d27 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/augru_sequence_test.cpp @@ -2,14 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/augru_sequence.hpp" - #include -#include -#include -#include -#include +#include "ov_ops/augru_sequence.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference.cpp deleted file mode 100644 index bb168118f82861..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include 
-#include -#include -#include - -using namespace ov; -using namespace ov::intel_cpu; - -static std::shared_ptr make_batch_to_space( - PartialShape data_shape = PartialShape::dynamic(ov::Rank(2)), - PartialShape block_shape = PartialShape::dynamic(), - PartialShape crops_begin_shape = PartialShape::dynamic(), - PartialShape crops_end_shape = PartialShape::dynamic()) { - auto data = std::make_shared(element::f32, data_shape); - auto block = std::make_shared(element::i32, block_shape); - auto crops_begin = std::make_shared(element::i32, crops_begin_shape); - auto crops_end = std::make_shared(element::i32, crops_end_shape); - - const auto batch_to_space = std::make_shared(data, block, crops_begin, crops_end); - return batch_to_space; -} - -TEST(StaticShapeInferenceTest, BatchToSpaceWithHostTensorData) { - auto space_to_batch = make_batch_to_space(); - int32_t block_val[] = {1, 6, 5, 1, 16}; - int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; - int32_t pads_end_val[] = {0, 2, 1, 0, 0}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - auto crops_begin = std::make_shared(element::i32, ov::Shape{5}, pads_begin_val); - auto crops_end = std::make_shared(element::i32, ov::Shape{5}, pads_end_val); - - const std::vector input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - std::map> constant_data; - constant_data[1] = block; - constant_data[2] = crops_begin; - constant_data[3] = crops_end; - - shape_inference(space_to_batch.get(), input_shapes, output_shapes, constant_data); - ASSERT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); -} - -TEST(StaticShapeInferenceTest, BatchToSpaceWithMissingTensorData) { - auto batch_to_space = make_batch_to_space(); - int32_t block_val[] = {1, 6, 5, 1, 16}; - int32_t pads_end_val[] = {0, 2, 1, 0, 0}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - auto crops_end = std::make_shared(element::i32, ov::Shape{5}, pads_end_val); - - const std::vector input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - std::map> constant_data; - constant_data[1] = block; - constant_data[3] = crops_end; - - EXPECT_THROW(shape_inference(batch_to_space.get(), input_shapes, output_shapes, constant_data), NodeValidationFailure); -} - -TEST(StaticShapeInferenceTest, batch_to_space_output_with_const_inputs) { - auto data = std::make_shared(element::f32, ov::PartialShape{-1, -1, -1, -1}); - auto block_shape = std::make_shared(element::i64, ov::Shape{4}, std::vector{1, 10, 5, 1}); - auto crops_begin = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 3, 1, 0}); - auto crops_end = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 3, 0, 0}); - const auto batch_to_space = std::make_shared(data, block_shape, crops_begin, crops_end); - std::vector input_shapes = {{100, 7, 13, 3}, {4}, {4}, {4}}; - std::vector output_shapes = {{}}; - shape_inference(batch_to_space.get(), input_shapes, output_shapes); - - ASSERT_EQ(output_shapes[0], (StaticShape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference_test.cpp new file mode 100644 index 00000000000000..a79f3fd98a41d6 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/batch_to_space_shape_inference_test.cpp @@ -0,0 +1,93 @@ 
+// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class BatchToSpaceV1StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } + + std::shared_ptr make_batch_to_space_dynamic() { + const auto data = std::make_shared(element::f32, PartialShape::dynamic()); + const auto block = std::make_shared(element::i32, PartialShape::dynamic()); + const auto crops_begin = std::make_shared(element::i32, PartialShape::dynamic()); + const auto crops_end = std::make_shared(element::i32, PartialShape::dynamic()); + + return make_op(data, block, crops_begin, crops_end); + } +}; + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t crops_begin_val[] = {0, 2, 0, 0, 0}; + int32_t crops_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, crops_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, crops_end_val)}}; + + input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); +} + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, blocks_crops_in_constant_map) { + op = make_batch_to_space_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t crops_begin_val[] = {0, 2, 0, 0, 0}; + int32_t crops_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, crops_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, crops_end_val)}}; + + input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; + + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16})); +} + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, blocs_crops_as_constants) { + auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + auto block_shape = std::make_shared(element::i64, Shape{4}, std::vector{1, 10, 5, 1}); + auto crops_begin = std::make_shared(element::i64, Shape{4}, std::vector{0, 3, 1, 0}); + auto crops_end = std::make_shared(element::i64, Shape{4}, std::vector{0, 3, 0, 0}); + + op = make_op(data, block_shape, crops_begin, crops_end); + input_shapes = {{100, 7, 13, 3}, {4}, {4}, {4}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes[0], (StaticShape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); +} + +TEST_F(BatchToSpaceV1StaticShapeInferenceTest, missing_tensor_data) { + auto op = make_batch_to_space_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t crops_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {3, std::make_shared(element::i32, Shape{5}, crops_end_val)}}; + + input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}}; + + EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes, constant_data), 
NodeValidationFailure); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp index 02091859c317b2..263062e4eced41 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/broadcast_shape_inference.cpp @@ -4,12 +4,7 @@ #include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp index 516bc25e1575c6..e1800a1999aa6c 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/convolution_shape_inference.cpp @@ -4,14 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference.cpp deleted file mode 100644 index 04b17fbb0bfd66..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include - -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, DepthToSpaceTest) { - auto A = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(ov::Rank(4))); - auto depth_to_space = - std::make_shared(A, ov::op::v0::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST, 2); - const std::vector input_shapes = {StaticShape{1, 16, 3, 1080, 1616}}; - std::vector output_shapes = {StaticShape{}}; - shape_inference(depth_to_space.get(), input_shapes, output_shapes); - ASSERT_EQ(output_shapes[0], (StaticShape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference_test.cpp new file mode 100644 index 00000000000000..d7fb9d9f4e676a --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/depth_to_space_shape_inference_test.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class DepthToSpaceV0StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + input_shapes = {StaticShape{1, 16, 3, 1080, 1616}}; + output_shapes.resize(1); + } +}; + +TEST_F(DepthToSpaceV0StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + op->set_block_size(2); + + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); +} + +TEST_F(DepthToSpaceV0StaticShapeInferenceTest, block_first) { + const auto data = 
std::make_shared(element::f32, PartialShape::dynamic(4)); + const auto op = make_op(data, op_type::DepthToSpaceMode::BLOCKS_FIRST, 2); + + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 2, 2 * 3, 2 * 1080, 2 * 1616})); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp index 8e1ef2a216eb76..c91c8879d83472 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/detection_output_shape_inference_test.cpp @@ -4,13 +4,8 @@ #include -#include -#include -#include - -#include "utils/shape_inference/static_shape.hpp" #include "detection_output_shape_inference.hpp" - +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp index a7eb81a5db9cb4..b77b1330ea4d31 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/elementwises.cpp @@ -4,13 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp index a8782e7fd5da47..8d4d069ec6f306 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_generate_proposal.cpp @@ -5,9 +5,8 @@ #include #include -#include -#include -#include + +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp index 49b22ba7ad3000..447dd142e2df9e 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_prior_grid_generator_shape_inference.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp index 7d46c113ca1139..3d82a65dd453de 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/experimental_detectron_roi_feature_extractor.cpp @@ -4,13 +4,7 @@ #include -#include -#include -#include -#include - -#include "utils/shape_inference/shape_inference.hpp" -#include "utils/shape_inference/static_shape.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; 
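A note on the pattern behind the shape-inference test refactoring above: constant inputs (block shapes, crops, axes) are no longer baked into the graph as Constant nodes but are handed to shape inference through a map of host tensors keyed by input port index. The sketch below condenses the BatchToSpace case from this patch into a standalone test; the shape_inference() helper and StaticShape come from the test-side utils.hpp added later in this diff, and the fully spelled-out map and HostTensor types are an assumption for readability, not a copy of the patch.

#include <gtest/gtest.h>

#include <map>
#include <memory>
#include <vector>

#include "ngraph/runtime/host_tensor.hpp"
#include "openvino/opsets/opset10.hpp"
#include "utils.hpp"

using namespace ov;
using namespace ov::intel_cpu;

TEST(StaticShapeInferencePatternSketch, constants_via_host_tensor_map) {
    // All graph inputs are dynamic; the concrete block/crops values are supplied
    // only through the constant-data map below.
    const auto data = std::make_shared<opset10::Parameter>(element::f32, PartialShape::dynamic());
    const auto block = std::make_shared<opset10::Parameter>(element::i32, PartialShape::dynamic());
    const auto crops_begin = std::make_shared<opset10::Parameter>(element::i32, PartialShape::dynamic());
    const auto crops_end = std::make_shared<opset10::Parameter>(element::i32, PartialShape::dynamic());
    const auto op = std::make_shared<opset10::BatchToSpace>(data, block, crops_begin, crops_end);

    int32_t block_val[] = {1, 6, 5, 1, 16};
    int32_t crops_begin_val[] = {0, 2, 0, 0, 0};
    int32_t crops_end_val[] = {0, 2, 1, 0, 0};

    // Input port index -> constant value consumed by static shape inference.
    const std::map<size_t, std::shared_ptr<ngraph::runtime::HostTensor>> constant_data{
        {1, std::make_shared<ngraph::runtime::HostTensor>(element::i32, Shape{5}, block_val)},
        {2, std::make_shared<ngraph::runtime::HostTensor>(element::i32, Shape{5}, crops_begin_val)},
        {3, std::make_shared<ngraph::runtime::HostTensor>(element::i32, Shape{5}, crops_end_val)}};

    std::vector<StaticShape> input_shapes = {{960, 6, 13, 128, 16}, {5}, {5}, {5}};
    std::vector<StaticShape> output_shapes(1);

    shape_inference(op.get(), input_shapes, output_shapes, constant_data);

    EXPECT_EQ(output_shapes[0], (StaticShape{960 / (6 * 5 * 16), 6 * 6 - 2 - 2, 13 * 5 - 1, 128, 16 * 16}));
}

Feeding values through the map lets the same fixture exercise both the default-constructed op path and the fully dynamic-parameter path without rebuilding the graph.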
diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp index 7bf7862559b46d..7ab1b7ad681034 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/extract_image_patches_shape_inference.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp index d1c252b6b7f59f..e0fea3663e23dd 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/fft_base_shape_inference.cpp @@ -4,13 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp index 32c2f02c60a49e..5fdaf6680ec600 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_cell_test.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp index 6ec856afec6b87..e55da4d19e937b 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/gru_sequence_test.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp index c8d95fb9537aca..e5016f585118a8 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/interpolate_shape_inference.cpp @@ -4,12 +4,7 @@ #include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp index 91273fc85f2577..330ed81d67ebe4 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/lstm_cell_shape_inference.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp index 75b6c8a10cb1ca..de44f9eb384fd0 100644 --- 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/matmul_shape_inference.cpp @@ -3,13 +3,7 @@ // #include -#include -#include -#include -#include -#include -#include - +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; using namespace testing; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp index 7fbf5d273a9d26..7e7ecaf10f54b5 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/one_hot_shape_inference_test.cpp @@ -6,11 +6,7 @@ #include "common_test_utils/test_assertions.hpp" #include "one_hot_shape_inference.hpp" - -#include "openvino/op/ops.hpp" -#include "openvino/op/parameter.hpp" -#include "utils/shape_inference/shape_inference.hpp" -#include "utils/shape_inference/static_shape.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp index 4a21796c0b53f4..6eee193c8fafeb 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/proposal.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp index bcd0a9b3c59cc7..43426ca1f2b6fd 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/read_value_shape_inference.cpp @@ -3,10 +3,7 @@ // #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp index 11eaf813e2966f..9dd98765257977 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/reduce_test.cpp @@ -4,12 +4,7 @@ #include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp index 16a7aba6d3c148..f9e4475a374913 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/roi_align_shape_inference.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference.cpp deleted file mode 100644 index 33a2f0c38a9170..00000000000000 --- 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include -#include - -using namespace ov; -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, ScatterElementsUpdateTest) { - auto data_shape = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); - auto indices_shape = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); - auto updates_shape = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); - auto axis_shape = std::make_shared(element::i32, PartialShape::dynamic()); - - auto scatter_elements = - std::make_shared(data_shape, indices_shape, updates_shape, axis_shape); - - int32_t axis_shape_val[] = {2}; - std::map> constant_data; - constant_data[3] = - std::make_shared(ngraph::element::Type_t::i32, Shape{1}, axis_shape_val); - std::vector input_shapes = {StaticShape{1000, 256, 7, 7}, - StaticShape{125, 20, 7, 6}, - StaticShape{125, 20, 7, 6}, - StaticShape{1}}, - output_shapes = {StaticShape{}}; - shape_inference(scatter_elements.get(), input_shapes, output_shapes, constant_data); - - ASSERT_EQ(output_shapes[0], StaticShape({1000, 256, 7, 7})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference_test.cpp new file mode 100644 index 00000000000000..4ea2cf3fef8eb8 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_elements_update_shape_inference_test.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/test_assertions.hpp" +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::opset10; +using namespace ov::intel_cpu; +using namespace testing; + +class ScatterElementsUpdateV3StaticShapeInferenceTest + : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } +}; + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + + int32_t axis = 1; + const auto const_data = + std::map{{3, std::make_shared(element::i32, Shape{1}, &axis)}}; + + input_shapes = ShapeVector{{1000, 256, 10, 13}, {25, 125, 3, 1}, {25, 125, 3, 1}, {1}}; + shape_inference(op.get(), input_shapes, output_shapes, const_data); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], StaticShape({1000, 256, 10, 13})); +} + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, correct_inputs_axis_as_constant) { + const auto d = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); + const auto i = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); + const auto u = std::make_shared(element::i32, PartialShape{-1, -1, -1, -1}); + const auto a = std::make_shared(element::i64, Shape{}, -2); + + const auto op = make_op(d, i, u, a); + + input_shapes = ShapeVector{{2, 5, 10, 15}, {2, 1, 10, 15}, {2, 1, 10, 15}, {}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], StaticShape({2, 5, 10, 15})); +} + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, params_are_dynamic_rank_axis_in_const_map) { + const auto d = 
std::make_shared(element::i32, PartialShape::dynamic()); + const auto i = std::make_shared(element::i32, PartialShape::dynamic()); + const auto u = std::make_shared(element::i32, PartialShape::dynamic()); + const auto a = std::make_shared(element::u32, PartialShape::dynamic()); + + const auto op = make_op(d, i, u, a); + + uint32_t axis = 2; + const auto const_data = + std::map{{3, std::make_shared(element::u32, Shape{}, &axis)}}; + + input_shapes = ShapeVector{{5000, 256, 10, 15}, {30, 25, 3, 3}, {30, 25, 3, 3}, {}}; + shape_inference(op.get(), input_shapes, output_shapes, const_data); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], StaticShape({5000, 256, 10, 15})); +} + +TEST_F(ScatterElementsUpdateV3StaticShapeInferenceTest, incorrect_axis_value) { + const auto d = std::make_shared(element::i32, PartialShape::dynamic()); + const auto i = std::make_shared(element::i32, PartialShape::dynamic()); + const auto u = std::make_shared(element::i32, PartialShape::dynamic()); + const auto a = std::make_shared(element::u32, PartialShape::dynamic()); + + const auto op = make_op(d, i, u, a); + + uint32_t axis = 4; + const auto const_data = + std::map{{3, std::make_shared(element::u32, Shape{}, &axis)}}; + + input_shapes = ShapeVector{{5000, 256, 10, 15}, {30, 25, 3, 3}, {30, 25, 3, 3}, {}}; + OV_EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes, const_data), + AssertFailure, + HasSubstr("Parameter axis 4 out of the tensor rank range [-4, 3]")); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp index f37a64de77381e..d88c8a20f46bb9 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/scatter_update_shape_inference_test.cpp @@ -4,10 +4,7 @@ #include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp index e6f41d87ecd937..8242f81777d091 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/select_shape_inference_test.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include - -#include "utils/shape_inference/static_shape.hpp" +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp index 3bd3887fb99385..2662faae88f490 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shape_node_tests.cpp @@ -4,15 +4,7 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "utils.hpp" using namespace ov; using namespace ov::intel_cpu; diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference.cpp deleted file mode 100644 index c47c72db8ba6d5..00000000000000 --- 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include - -#include "utils/shape_inference/static_shape.hpp" - -using namespace ov; -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, ShuffleChannelsTest) { - const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1}); - const auto axis = -1; - const auto group = 3; - const auto shuffle_channels = std::make_shared(data, axis, group); - - std::vector static_input_shapes = {StaticShape{5, 4, 9}}; - std::vector static_output_shapes = {StaticShape{}}; - shape_inference(shuffle_channels.get(), static_input_shapes, static_output_shapes); - - ASSERT_EQ(static_output_shapes[0], static_input_shapes[0]); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference_test.cpp new file mode 100644 index 00000000000000..f9b9b2fdd151bd --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/shuffle_channels_shape_inference_test.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class ShuffleChannelsV0StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } +}; + +TEST_F(ShuffleChannelsV0StaticShapeInferenceTest, default_ctor) { + op = make_op(); + op->set_axis(-2); + op->set_group(2); + + input_shapes = {StaticShape{5, 4, 9}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], input_shapes[0]); +} + +TEST_F(ShuffleChannelsV0StaticShapeInferenceTest, correct_shape_infer) { + const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1}); + op = make_op(data, -1, 3); + + input_shapes = {StaticShape{5, 4, 9}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes[0], input_shapes[0]); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference.cpp deleted file mode 100644 index 36d0017af5cc18..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include -#include -#include - -using namespace ov; -using namespace ov::intel_cpu; - -static std::shared_ptr build_space_to_batch( - PartialShape data_shape = PartialShape::dynamic(ov::Rank(2)), - PartialShape block_shape = PartialShape::dynamic(), - PartialShape pads_begin_shape = PartialShape::dynamic(), - PartialShape pad_end_shape = PartialShape::dynamic()) { - auto data = std::make_shared(element::f32, data_shape); - auto block = std::make_shared(element::i32, block_shape); - auto pads_begin = std::make_shared(element::i32, pads_begin_shape); - auto pads_end = std::make_shared(element::i32, pad_end_shape); - - auto space_to_batch = 
std::make_shared(data, block, pads_begin, pads_end); - return space_to_batch; -} - -TEST(StaticShapeInferenceTest, SpaceToBatchTest) { - auto space_to_batch = build_space_to_batch(); - int32_t block_val[] = {1, 6, 5, 1, 16}; - int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; - int32_t pads_end_val[] = {0, 2, 1, 0, 0}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - auto pads_begin = std::make_shared(element::i32, ov::Shape{5}, pads_begin_val); - auto pads_end = std::make_shared(element::i32, ov::Shape{5}, pads_end_val); - - const std::vector input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - std::map> constant_data; - constant_data[1] = block; - constant_data[2] = pads_begin; - constant_data[3] = pads_end; - - shape_inference(space_to_batch.get(), input_shapes, output_shapes, constant_data); - ASSERT_EQ(output_shapes[0], (StaticShape{2 * 6 * 5 * 16, (32 + 2 + 2) / 6, (64 + 1) / 5, 128, 256 / 16})); -} - -TEST(StaticShapeInferenceTest, SpaceToBatchThrowExceptionWithoutHostTensorData) { - auto space_to_batch = build_space_to_batch(); - - std::map> constant_data; - const std::vector input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - EXPECT_THROW(shape_inference(space_to_batch.get(), input_shapes, output_shapes), NodeValidationFailure); -} - -TEST(StaticShapeInferenceTest, SpaceToBatchThrowExceptionWithMissingPadsHostTensorData) { - auto space_to_batch = build_space_to_batch(); - - int32_t block_val[] = {1, 6, 5, 1, 16}; - auto block = std::make_shared(ngraph::element::Type_t::i32, ov::Shape{5}, block_val); - - std::map> constant_data; - constant_data[1] = block; - - const std::vector input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; - std::vector output_shapes = {{}}; - - EXPECT_THROW(shape_inference(space_to_batch.get(), input_shapes, output_shapes), NodeValidationFailure); -} - -TEST(StaticShapeInferenceTest, space_to_batch_output_with_const_inputs) { - auto data = std::make_shared(element::f32, ov::PartialShape{-1, -1, -1, -1}); - auto block_shape = std::make_shared(element::i64, ov::Shape{4}, std::vector{1, 12, 100, 2}); - auto pads_begin = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 3, 38, 1}); - auto pads_end = std::make_shared(element::i64, ov::Shape{4}, std::vector{0, 5, 38, 0}); - const auto space_to_batch = std::make_shared(data, block_shape, pads_begin, pads_end); - std::vector input_shapes = {{2, 100, 1024, 3}, {4}, {4}, {4}}; - std::vector output_shapes = {{}}; - shape_inference(space_to_batch.get(), input_shapes, output_shapes); - - ASSERT_EQ(output_shapes[0], (StaticShape{2 * 12 * 100 * 2, (100 + 3 + 5) / 12, (1024 + 38 + 38) / 100, (3 + 1) / 2})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference_test.cpp new file mode 100644 index 00000000000000..fd6969e0622983 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_batch_shape_inference_test.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class SpaceToBatchV1StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + 
output_shapes.resize(1); + } + + std::shared_ptr make_space_to_batch_dynamic() { + const auto data = std::make_shared(element::f32, PartialShape::dynamic()); + const auto block = std::make_shared(element::i32, PartialShape::dynamic()); + const auto pads_begin = std::make_shared(element::i32, PartialShape::dynamic()); + const auto pads_end = std::make_shared(element::i32, PartialShape::dynamic()); + + return make_op(data, block, pads_begin, pads_end); + } +}; + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; + int32_t pads_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, pads_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, pads_end_val)}}; + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{2 * 6 * 5 * 16, (32 + 2 + 2) / 6, (64 + 1) / 5, 128, 256 / 16})); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, blocks_pads_as_constants) { + const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1, -1}); + const auto block_shape = std::make_shared(element::i64, Shape{4}, std::vector{1, 12, 100, 2}); + const auto pads_begin = std::make_shared(element::i64, Shape{4}, std::vector{0, 3, 38, 1}); + const auto pads_end = std::make_shared(element::i64, Shape{4}, std::vector{0, 5, 38, 0}); + + const auto op = make_op(data, block_shape, pads_begin, pads_end); + + input_shapes = {{2, 100, 1024, 3}, {4}, {4}, {4}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes[0], + (StaticShape{2 * 12 * 100 * 2, (100 + 3 + 5) / 12, (1024 + 38 + 38) / 100, (3 + 1) / 2})); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, blocks_pads_in_constant_map) { + const auto op = make_space_to_batch_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + int32_t pads_begin_val[] = {0, 2, 0, 0, 0}; + int32_t pads_end_val[] = {0, 2, 1, 0, 0}; + + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}, + {2, std::make_shared(element::i32, Shape{5}, pads_begin_val)}, + {3, std::make_shared(element::i32, Shape{5}, pads_end_val)}}; + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + shape_inference(op.get(), input_shapes, output_shapes, constant_data); + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{2 * 6 * 5 * 16, (32 + 2 + 2) / 6, (64 + 1) / 5, 128, 256 / 16})); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, throw_no_data_const_map) { + const auto op = make_space_to_batch_dynamic(); + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes), NodeValidationFailure); +} + +TEST_F(SpaceToBatchV1StaticShapeInferenceTest, exception_missing_pads_data_in_const_map) { + const auto op = make_space_to_batch_dynamic(); + + int32_t block_val[] = {1, 6, 5, 1, 16}; + const auto constant_data = + std::map{{1, std::make_shared(element::i32, Shape{5}, block_val)}}; + + input_shapes = {{2, 32, 64, 128, 256}, {5}, {5}, {5}}; + + EXPECT_THROW(shape_inference(op.get(), input_shapes, output_shapes), NodeValidationFailure); +} diff --git 
a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference.cpp deleted file mode 100644 index 1466e73b34f9cc..00000000000000 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include - -#include -#include -#include -#include - -using namespace ov::intel_cpu; - -TEST(StaticShapeInferenceTest, SpaceToDepthTest) { - auto A = std::make_shared(ov::element::f32, ov::PartialShape::dynamic(ov::Rank(4))); - auto space_to_depth = - std::make_shared(A, ov::op::v0::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST, 2); - const std::vector input_shapes = {StaticShape{1, 12, 4, 1080, 1616}}; - std::vector output_shapes = {StaticShape{}}; - shape_inference(space_to_depth.get(), input_shapes, output_shapes); - ASSERT_EQ(output_shapes[0], (StaticShape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); -} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference_test.cpp new file mode 100644 index 00000000000000..da8851751ee92c --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/space_to_depth_shape_inference_test.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "openvino/opsets/opset10.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using namespace ov::opset10; +using namespace testing; + +class SpaceToDepthV0StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes.resize(1); + } +}; + +TEST_F(SpaceToDepthV0StaticShapeInferenceTest, default_ctor) { + const auto op = make_op(); + op->set_block_size(2); + + input_shapes = {StaticShape{1, 12, 4, 1080, 1616}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); +} + +TEST_F(SpaceToDepthV0StaticShapeInferenceTest, depth_first_block_2) { + const auto data = std::make_shared(element::f32, PartialShape::dynamic(4)); + const auto op = make_op(data, op_type::SpaceToDepthMode::DEPTH_FIRST, 2); + + input_shapes = {StaticShape{1, 12, 4, 1080, 1616}}; + shape_inference(op.get(), input_shapes, output_shapes); + + EXPECT_EQ(output_shapes.size(), 1); + EXPECT_EQ(output_shapes[0], (StaticShape{1, 12 * 8, 4 / 2, 1080 / 2, 1616 / 2})); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp index 85ead85909447d..546ffd7a9c1302 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/utils.hpp @@ -13,6 +13,19 @@ #pragma once +namespace ov { +namespace intel_cpu { +template +void shape_inference(ov::Node* op, + const std::vector& input_shapes, + std::vector& output_shapes, + const std::map& constant_data = {}) { + const auto shape_infer = make_shape_inference(op->shared_from_this()); + output_shapes = shape_infer->infer(input_shapes, constant_data); +} +} // namespace intel_cpu +} // namespace ov + struct TestTensor { std::shared_ptr tensor; ov::intel_cpu::StaticShape static_shape; 
@@ -90,6 +103,8 @@ using ShapeVector = std::vector; template class OpStaticShapeInferenceTest : public testing::Test { protected: + using op_type = TOp; + ShapeVector input_shapes, output_shapes; ov::intel_cpu::StaticShape exp_shape; std::shared_ptr op; diff --git a/src/plugins/intel_gna/CMakeLists.txt b/src/plugins/intel_gna/CMakeLists.txt index f6b358a2f13574..08d32a4771cca5 100644 --- a/src/plugins/intel_gna/CMakeLists.txt +++ b/src/plugins/intel_gna/CMakeLists.txt @@ -71,6 +71,7 @@ target_compile_definitions(${TARGET_NAME} _NO_MKL_ ) +# must be called after all target_link_libraries ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) # @@ -139,5 +140,3 @@ if(NOT BUILD_SHARED_LIBS) endif() add_subdirectory(tests) - - diff --git a/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp b/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp index a8a1fbad30bfbb..4a9495b7b274a2 100644 --- a/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp +++ b/src/plugins/intel_gna/src/transformations/utils/gather_sinking_utils.cpp @@ -57,7 +57,7 @@ bool IfNodeHasGatherInputs(const Output& output) { namespace { bool HasDynamicRankInput(NodePtr node) { - for (auto& input_node : node->input_values()) { + for (const auto& input_node : node->input_values()) { const Rank output_rank = input_node.get_partial_shape().rank(); if (output_rank.is_dynamic()) return true; @@ -148,7 +148,7 @@ bool CanPropagateGatherForwardThrough(Node* node) { #undef CHECK_GATHER_SINKING_SUPPORTED bool CanGatherPropagateForward(NodePtr node) { - for (auto output : node->outputs()) { + for (const auto& output : node->outputs()) { for (auto& consumer_input : output.get_target_inputs()) { if (!CanPropagateGatherForwardThrough(consumer_input.get_node())) return false; @@ -209,7 +209,7 @@ GatherInfo GetGatherInfo(Node* node) { } Node* FindFirstConsumer(NodePtr node) { - for (auto output : node->outputs()) { + for (const auto& output : node->outputs()) { auto inputs = output.get_target_inputs(); if (inputs.empty()) continue; diff --git a/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt b/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt index 2f13e3123c0cb1..303537eb48e8e7 100644 --- a/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt +++ b/src/plugins/intel_gna/tests/deprecated/readers/ir_reader_v7/CMakeLists.txt @@ -37,8 +37,6 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/" target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime::dev inference_engine_legacy openvino::pugixml openvino::itt) -ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) - if(WIN32) set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) endif() @@ -47,6 +45,9 @@ if(BUILD_SHARED_LIBS) target_link_libraries(${TARGET_NAME} PRIVATE inference_engine) endif() +# must be called after all target_link_libraries +ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) + # code style add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME}) diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index ab4d7618c30a25..306b56987d70c8 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -70,4 +70,5 @@ if(ENABLE_TESTS) endif() # Failed because of OpenCL +# must be called after all target_link_libraries # ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) 
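The utils.hpp hunk above is the counterpart of the earlier removal of ov::intel_cpu::shape_inference from shape_inference.cpp/.hpp: the helper now exists only on the test side and simply forwards to make_shape_inference(op->shared_from_this())->infer(input_shapes, constant_data). A minimal use of it, mirroring the ShuffleChannels expectation from this patch; the explicit std::make_shared call stands in for the fixture's make_op, so treat this as an illustrative sketch rather than added test code.

#include <gtest/gtest.h>

#include <memory>
#include <vector>

#include "openvino/opsets/opset10.hpp"
#include "utils.hpp"  // test-side shape_inference(), StaticShape, OpStaticShapeInferenceTest

using namespace ov;
using namespace ov::intel_cpu;

TEST(ShapeInferenceHelperSketch, shuffle_channels_keeps_shape) {
    const auto data = std::make_shared<opset10::Parameter>(element::f32, PartialShape{-1, -1, -1});
    // ShuffleChannels only reorders data along the channel axis, so the static
    // output shape must equal the input shape.
    const auto op = std::make_shared<opset10::ShuffleChannels>(data, -1, 3);

    std::vector<StaticShape> input_shapes = {StaticShape{5, 4, 9}};
    std::vector<StaticShape> output_shapes(1);

    // Equivalent to: make_shape_inference(op->shared_from_this())->infer(input_shapes, {}).
    shape_inference(op.get(), input_shapes, output_shapes);

    EXPECT_EQ(output_shapes[0], input_shapes[0]);
}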
diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 0fef9af07ec39b..0828653681048c 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -231,16 +231,8 @@ struct network { /// Returns memory state @p variable_id of stateful network VariableState& get_variable_memory(const std::string &variable_id); - /// Return kernels_cache - kernels_cache& get_kernels_cache() const { return *_kernels_cache; } - - /// Return implentations_cache - ImplementationsCache& get_implementations_cache() const { return *_impls_cache; } - /// Return in_mem_kernels_cache KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; } - - ICompilationContext& get_compilation_context() const { return *_compilation_context; } std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; } const ExecutionConfig& get_config() const { return _config; } @@ -272,7 +264,6 @@ struct network { output_chains_map _output_chains; mutable std::mutex _in_mem_cache_mutex; - std::unique_ptr _compilation_context; void build_exec_order(); void allocate_primitive_instance(program_node const& node); @@ -284,11 +275,8 @@ struct network { void add_default_output_chains(); output_chains_map::iterator add_output_chain(std::shared_ptr& p_inst); - std::unique_ptr _kernels_cache; // Move from cldnn::program to cldnn::network for multi-threads issue. - std::unique_ptr _impls_cache; std::unique_ptr _in_mem_kernels_cache; - const size_t _impls_cache_capacity = 10000; const size_t _in_mem_kernels_cache_capacity = 10000; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index d764f1de5e7adf..2c21bc1694daa4 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -26,6 +26,7 @@ class pass_manager; class base_pass; class program_wrapper; class kernels_cache; +class ICompilationContext; struct program { @@ -252,6 +253,10 @@ struct program { void query_local_block_io_supported(); void calc_nodes_hash(); + ImplementationsCache& get_implementations_cache() const { return *_impls_cache; } + ICompilationContext& get_compilation_context() const { return *_compilation_context; } + void cancel_compilation_context(); + private: uint32_t prog_id = 0; engine& _engine; @@ -266,6 +271,9 @@ struct program { std::unique_ptr pm; bool is_body_program; int8_t is_subgroup_local_block_io_supported; + std::unique_ptr _impls_cache; + const size_t _impls_cache_capacity = 10000; + std::unique_ptr _compilation_context; std::map> nodes_map; std::list optimized_out; @@ -305,7 +313,9 @@ struct program { void cleanup(); void transfer_memory_to_device(); + InferenceEngine::CPUStreamsExecutor::Config make_task_executor_config(const ExecutionConfig& config, std::string tags = "") const; std::shared_ptr make_task_executor(const ExecutionConfig& config) const; + /* ** Analysis functions */ @@ -343,6 +353,8 @@ struct program { // old_node - node which will be replaced // new_node - node which will replace the old one void replace(program_node& old_node, program_node& new_node); + + void init_program(); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp index 
7880b79a85eb4d..182865306e4611 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/serialization/binary_buffer.hpp @@ -14,7 +14,8 @@ namespace cldnn { class BinaryOutputBuffer : public OutputBuffer { public: - BinaryOutputBuffer(std::ostream& stream) : OutputBuffer(this), stream(stream) {} + BinaryOutputBuffer(std::ostream& stream) + : OutputBuffer(this), stream(stream), _impl_params(nullptr) {} void write(void const * data, std::streamsize size) { auto const written_size = stream.rdbuf()->sputn(reinterpret_cast(data), size); @@ -32,7 +33,8 @@ class BinaryOutputBuffer : public OutputBuffer { class BinaryInputBuffer : public InputBuffer { public: - BinaryInputBuffer(std::istream& stream, engine& engine) : InputBuffer(this, engine), stream(stream) {} + BinaryInputBuffer(std::istream& stream, engine& engine) + : InputBuffer(this, engine), stream(stream), _impl_params(nullptr), _network(nullptr) {} void read(void* const data, std::streamsize size) { auto const read_size = stream.rdbuf()->sgetn(reinterpret_cast(data), size); @@ -42,6 +44,8 @@ class BinaryInputBuffer : public InputBuffer { void setKernlImplParams(void* impl_params) { _impl_params = impl_params; } void* getKernlImplParams() const { return _impl_params; } + void setNetwork(void* network) { _network = network; } + void* getNetwork() const { return _network; } std::streampos tellg() { return stream.tellg(); } void seekg(std::streampos pos) { stream.seekg(pos); } @@ -49,6 +53,7 @@ class BinaryInputBuffer : public InputBuffer { private: std::istream& stream; void* _impl_params; + void* _network; }; template diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp index 979e203cab51dd..20fb79db8664ae 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/compiled_model.hpp @@ -46,9 +46,6 @@ class CompiledModel : public InferenceEngine::ExecutableNetworkThreadSafeDefault ExecutionConfig m_config; InferenceEngine::ITaskExecutor::Ptr m_taskExecutor; InferenceEngine::ITaskExecutor::Ptr m_waitExecutor; - -private: - bool is_serializable(); }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp index 007e55e7fb3f6c..9e0f8941527139 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp @@ -18,7 +18,7 @@ namespace intel_gpu { class Plugin : public InferenceEngine::IInferencePlugin { struct impl; std::shared_ptr _impl; - bool isModelCachingEnabled = false; + bool isModelCachingEnabled = true; std::string default_device_id = "0"; // key: device_id, value: cldnn device diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp index 6d354cd8d8f8f6..81e0dbcf774ee7 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/lru_cache.hpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include "kernel.hpp" @@ -30,15 +32,15 @@ class LruCache { } /** - * @brief Get the least recently used element object in the cache + * @brief Get the least recently used element with key and value pair in the cache * * @return Value */ - Value get_lru_element() const { + 
std::pair get_lru_element() const { if (_lru_data_list.size()) { - return _lru_data_list.back().second; + return _lru_data_list.back(); } else { - return Value(); + return std::make_pair(Key(), Value()); } } @@ -164,6 +166,46 @@ class LruCache { } }; -using ImplementationsCache = cldnn::LruCache>; using KernelsCache = cldnn::LruCache; + +template +class LruCacheThreadSafe : LruCache { +public: + using parent = LruCache; + using FuncRemoveItem = std::function&)>; + + explicit LruCacheThreadSafe(size_t caps) : parent(caps) { } + + bool add(const Key& key, const Value& value) { + std::lock_guard lock(_mutex); + auto popped_item = parent::get_lru_element(); + auto ret = parent::add(key, value); + if (ret && _remove_popped_item) { + _remove_popped_item(popped_item); + } + return ret; + } + + bool has(const Key& key) const { + std::lock_guard lock(_mutex); + return parent::has(key); + } + + Value get(const Key& key) { + std::lock_guard lock(_mutex); + return parent::get(key); + } + + void set_remove_item_callback(FuncRemoveItem callback) { + _remove_popped_item = callback; + } + +private: + FuncRemoveItem _remove_popped_item; + mutable std::mutex _mutex; +}; + + +using ImplementationsCache = cldnn::LruCacheThreadSafe>; + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/compilation_context.cpp b/src/plugins/intel_gpu/src/graph/compilation_context.cpp index 2aa02dbda0ebc9..75c6b3a65b8b12 100644 --- a/src/plugins/intel_gpu/src/graph/compilation_context.cpp +++ b/src/plugins/intel_gpu/src/graph/compilation_context.cpp @@ -1,91 +1,76 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2022-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // + #include "compilation_context.hpp" -#include "threading/ie_thread_safe_containers.hpp" -#include "kernel_selector/kernel_base.h" +#include +#include +#include +#include "intel_gpu/runtime/utils.hpp" namespace cldnn { -class CompilationTaskQueue { - using CompilationTaskData = std::pair; - +class CompilationContext : public ICompilationContext { public: - void push_task(size_t task_key, ICompilationContext::Task&& task) { - std::lock_guard lock(_mutex); - if (_queue_keymap.find(task_key) == _queue_keymap.end()) { - auto insert_it = _queue.insert(_queue.end(), {task_key, task}); - _queue_keymap.insert({task_key, insert_it}); - } + CompilationContext(InferenceEngine::CPUStreamsExecutor::Config task_executor_config) : _task_executor_config(task_executor_config) { + _task_executor_config._streams = 4; + _task_executor = std::make_shared(_task_executor_config); } - bool pop_front_task(size_t& task_key, ICompilationContext::Task& task) { + void push_task(size_t key, Task&& task) override { + if (_stop_compilation) + return; + std::lock_guard lock(_mutex); - if (!_queue.empty()) { - auto front = _queue.front(); - task = front.second; - task_key = front.first; - _queue.pop_front(); - return true; + if (_task_keys.find(key) == _task_keys.end()) { + _task_keys.insert(key); + if (_task_executor != nullptr) + _task_executor->run(task); } - return false; } - void erase_task_key(size_t removed_key) { + void remove_keys(std::vector&& keys) override { std::lock_guard lock(_mutex); - if (_queue_keymap.find(removed_key) != _queue_keymap.end()) { - _queue_keymap.erase(removed_key); + if (!_task_keys.empty()) { + for (auto key : keys) { + if (_task_keys.find(key) != _task_keys.end()) { + _task_keys.erase(key); + } + } } } -private: - std::deque _queue; - std::unordered_map::iterator> _queue_keymap; - std::mutex _mutex; -}; - -class 
CompilationContext : public ICompilationContext { -public: - CompilationContext(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id) { - _kernels_cache = cldnn::make_unique(engine, config, program_id, nullptr, kernel_selector::KernelBase::get_db().get_batch_header_str()); - _worker = std::thread([this](){ - while (!_stop_compilation) { - CompilationContext::Task task; - size_t task_key; - bool success = _queue.pop_front_task(task_key, task); - if (success) { - task(*_kernels_cache); - _queue.erase_task_key(task_key); - } else { - std::chrono::milliseconds ms{1}; - std::this_thread::sleep_for(ms); - } - } - }); + ~CompilationContext() noexcept { + cancel(); } - void push_task(size_t key, ICompilationContext::Task&& task) override { - _queue.push_task(key, std::move(task)); + bool is_stopped() override { + return _stop_compilation; } void cancel() noexcept override { + if (_stop_compilation) + return; + _stop_compilation = true; - if (_worker.joinable()) - _worker.join(); + { + std::lock_guard lock(_mutex); + if (_task_executor != nullptr) + _task_executor.reset(); + _task_keys.clear(); + } } - ~CompilationContext() noexcept { cancel(); } - private: - std::unique_ptr _kernels_cache; - std::thread _worker; + InferenceEngine::CPUStreamsExecutor::Config _task_executor_config; + InferenceEngine::CPUStreamsExecutor::Ptr _task_executor; + std::mutex _mutex; + std::unordered_set _task_keys; std::atomic_bool _stop_compilation{false}; - - CompilationTaskQueue _queue; }; -std::unique_ptr ICompilationContext::create(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id) { - return cldnn::make_unique(engine, config, program_id); +std::unique_ptr ICompilationContext::create(InferenceEngine::CPUStreamsExecutor::Config task_executor_config) { + return cldnn::make_unique(task_executor_config); } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/data.cpp b/src/plugins/intel_gpu/src/graph/data.cpp index 10be2a3504e81a..16e0edb6d2d033 100644 --- a/src/plugins/intel_gpu/src/graph/data.cpp +++ b/src/plugins/intel_gpu/src/graph/data.cpp @@ -85,15 +85,24 @@ void data_inst::load(BinaryInputBuffer& ib) { size_t data_size; ib >> make_data(&data_size, sizeof(size_t)); - _outputs[0] = get_network().get_memory_pool().get_memory(output_layout, _allocation_type, false); - if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { - ib >> make_data(_outputs[0]->buffer_ptr(), data_size); + if (ib.getNetwork()) { + const network* primary_network = reinterpret_cast(ib.getNetwork()); + _outputs[0] = primary_network->get_primitive(id())->output_memory_ptr(); + auto pos = ib.tellg(); + pos += data_size; + ib.seekg(pos); } else { - std::vector _buf; - _buf.resize(data_size); - ib >> make_data(_buf.data(), data_size); - _outputs[0]->copy_from(get_network().get_stream(), _buf.data()); + _outputs[0] = get_network().get_memory_pool().get_memory(output_layout, _allocation_type, false); + + if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { + ib >> make_data(_outputs[0]->buffer_ptr(), data_size); + } else { + std::vector _buf; + _buf.resize(data_size); + ib >> make_data(_buf.data(), data_size); + _outputs[0]->copy_from(get_network().get_stream(), _buf.data()); + } } } diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index fc7bf008f372b1..d00a36c676b3b9 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp 
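The rewritten CompilationContext above drops the hand-rolled worker thread and task queue in favour of a CPUStreamsExecutor plus a set of task keys that deduplicates compilation requests. A rough, self-contained sketch of that keyed-deduplication idea follows; std::async stands in for the executor and the class name is invented:

// Sketch, not the plugin's implementation: keyed scheduling with deduplication.
#include <cstddef>
#include <functional>
#include <future>
#include <mutex>
#include <unordered_set>
#include <vector>

class KeyedTaskQueue {
public:
    void push_task(size_t key, std::function<void()> task) {
        std::lock_guard<std::mutex> lock(_mutex);
        if (!_task_keys.insert(key).second)
            return;  // a task with this key is already queued or finished, skip it
        _futures.push_back(std::async(std::launch::async, std::move(task)));
    }
    void remove_keys(std::vector<size_t>&& keys) {
        std::lock_guard<std::mutex> lock(_mutex);
        for (auto key : keys)
            _task_keys.erase(key);  // the key may be scheduled again afterwards
    }
    ~KeyedTaskQueue() {
        for (auto& f : _futures)
            f.wait();
    }
private:
    std::mutex _mutex;
    std::unordered_set<size_t> _task_keys;
    std::vector<std::future<void>> _futures;
};

In the patch itself, program::init_program additionally wires the implementation cache's eviction callback to remove_keys, so the key of an evicted impl can be submitted for compilation again later.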
+++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -39,8 +39,9 @@ bool is_batch_after_spatial(const std::string order) { } format::type get_preferred_format(fully_connected_node const& node, const kernel_impl_params& impl_param) { - if (node.get_preferred_impl_type() == impl_types::onednn) - return format::bfyx; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + return node.get_preferred_output_fmt(); + } auto input_layout = impl_param.get_input_layout(); diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index 6d2cd3d76f6c83..b15be72fc4d3d4 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -78,6 +78,10 @@ layout gemm_inst::calc_output_layout(gemm_node const& node, kernel_impl_params c auto output_format = input0_layout.format; + if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { + output_format = node.get_preferred_output_fmt(); + } + return layout(output_shape, output_type, output_format, prim->output_paddings[0]); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 06d2b2852d666c..2bfb2e9bcb2f7d 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -169,6 +169,18 @@ void remove_redundant_reorders::run(program& p) { !r_node.get_primitive()->has_surface_input(); if (remove_dep) { + // for chains like + // b_fs_yx_fsv16 -> reorder(ofmt:bfyx) -> bfyx -> reorder(ofmt:any) -> bfyx + // if output_format of current node is format::any, input format of the dependency node is propagated as it is + // b_fs_yx_fsv16 -> reorder(ofmt:any) -> b_fs_yx_fsv16 + // so output format of dependency node must be stored in output_format of current node + // b_fs_yx_fsv16 -> reorder(ofmt:bfyx) -> bfyx + auto output_layout = r_dep_node.get_output_layout(); + auto prim = std::const_pointer_cast(r_node.get_primitive()); + if (prim->output_format == format::any) + prim->output_format = output_layout.format; + + LOG_NODE_REMOVAL(r_dep_node.id()); r_dep_node.can_be_optimized(true); p.add_optimized_primitive_info(r_dep_node.id()); p.extract_and_remove(r_dep_node); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp index 5f3f741d7cb115..c0d5a734e2ffa6 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp @@ -5,6 +5,7 @@ #include "pass_manager.h" #include "data_inst.h" #include "mutable_data_inst.h" +#include "gemm_inst.h" #include "program_node.h" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -44,6 +45,8 @@ void select_preferred_formats::run(program& p) { dnnl::primitive_attr(), dnnl::memory::format_tag::any); _lo.select_preferred_formats_for_onednn(*n, *prim_desc); + } else if (n->is_type() || n->is_type()) { + _lo.select_preferred_formats_for_onednn(*n); } } catch(std::exception &exception) { GPU_DEBUG_INFO << "WARNING(select_preferred_formats): " << exception.what() << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp 
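The comment added to remove_redundant_reorders explains why the surviving reorder must not keep output_format == format::any once its dependency reorder is folded away. A compact model of that rule, with invented types in place of the cldnn ones:

// Minimal model of the new rule: before removing the dependency reorder in
//   b_fs_yx_fsv16 -> reorder(ofmt:bfyx) -> reorder(ofmt:any)
// the surviving reorder's "any" output format is pinned to the dependency's
// concrete format, otherwise the blocked fsv16 layout would leak through.
#include <cassert>

enum class Fmt { any, bfyx, b_fs_yx_fsv16 };

struct ReorderPrim {
    Fmt output_format;
};

void pin_format_before_removal(ReorderPrim& kept, Fmt removed_dep_output_fmt) {
    if (kept.output_format == Fmt::any)
        kept.output_format = removed_dep_output_fmt;
}

int main() {
    ReorderPrim kept{Fmt::any};
    pin_format_before_removal(kept, Fmt::bfyx);  // the removed dependency produced bfyx
    assert(kept.output_format == Fmt::bfyx);
}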
b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 90010f902e694e..bbe4368485d07b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -276,6 +276,19 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { (std::accumulate(gws.begin(), gws.end(), 1, std::multiplies()) == 0); } } + + void set_kernels(std::map& kernels) { + if (is_cpu()) + return; + + _kernel_ids.clear(); + _kernels.clear(); + _kernels.reserve(kernels.size()); + for (auto& k : kernels) { + _kernel_ids.push_back(k.first); + _kernels.emplace_back(std::move(k.second)); + } + } }; } // namespace ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index 4e404518d6659d..5e8c03dd0c67fd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -324,10 +324,8 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { void build_primitive(const ExecutionConfig& config) { auto cache_outpath = get_cache_directory(config); - if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) { - if (env_p[0] == '1') { - cache_outpath = ""; - } + if (!config.get_property(ov::intel_gpu::allow_new_shape_infer)) { + cache_outpath = ""; } if (cache_outpath.empty()) { diff --git a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp b/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp index abb686ac2d9874..f26aa904004630 100644 --- a/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp +++ b/src/plugins/intel_gpu/src/graph/include/compilation_context.hpp @@ -1,10 +1,10 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2022-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include "kernels_cache.hpp" +#include #include #include @@ -12,12 +12,14 @@ namespace cldnn { class ICompilationContext { public: - using Task = std::function; + using Task = std::function; virtual void push_task(size_t key, Task&& task) = 0; - virtual void cancel() noexcept = 0; + virtual void remove_keys(std::vector&& keys) = 0; virtual ~ICompilationContext() = default; + virtual bool is_stopped() = 0; + virtual void cancel() = 0; - static std::unique_ptr create(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id); + static std::unique_ptr create(InferenceEngine::CPUStreamsExecutor::Config task_executor_config); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index edf78887cc33f3..21fd41a59c8863 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -210,7 +210,7 @@ class layout_optimizer { bool should_select_b_fs_yx_fsv16_layout(convolution_node const& node, layout const& output_or_weights_layout); #ifdef ENABLE_ONEDNN_FOR_GPU - void select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc); + void select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc = dnnl::primitive_desc()); #endif }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index f30c1db46dd601..c21ad7d3906b45 100644 --- 
a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -63,7 +63,6 @@ struct primitive_impl { } virtual std::vector> get_kernels_source() { return {}; } virtual void reset_kernels_source() {} - virtual void set_kernels(std::vector) {} virtual std::vector get_kernels() const { return {}; } virtual void set_kernel_ids(std::vector kernel_ids) {} virtual void save(cldnn::BinaryOutputBuffer& ob) const {} @@ -80,6 +79,8 @@ struct primitive_impl { OPENVINO_ASSERT(false, "[GPU] update_dispatch_data is not implemented for dynamic implemenation ", _kernel_name); } + virtual void set_kernels(std::map& kernels) {} + protected: std::string _kernel_name; bool _is_dynamic = false; diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index e9537f6869e707..cf5011a275c72d 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1757,6 +1757,10 @@ format layout_optimizer::get_preferred_format(program_node& node) { // Set default format for issue 92967/98750 // TODO: will remove when arg_max_min_ref supports blocked format expected = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false); + } else if (node.is_type() || node.is_type()) { + if (use_onednn_impls) { + expected = node.get_preferred_output_fmt(); + } } if (allow_new_shape_infer && node.get_preferred_input_fmt() != format::any) { @@ -1862,6 +1866,19 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt) << " For index : " << idx << std::endl; } + } else if (node.is_type() || node.is_type()) { + for (size_t idx = 0 ; idx < node.get_dependencies().size() ; idx++) { + if (node.get_dependency(idx).is_constant()) + continue; + node.set_preferred_input_fmt(idx, cldnn::format::bfyx); + + if (node.get_preferred_output_fmt() == format::any) { + for (size_t usr = 0; usr < std::max(1, node.get_users().size()); usr++) + node.set_preferred_output_fmt(usr, cldnn::format::bfyx); + } + GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(cldnn::format::bfyx) << " --> " << fmt_to_str(cldnn::format::bfyx) + << " For index : " << idx << std::endl; + } } return; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index eff3f4d80b1ed4..08910a78fc6797 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -331,14 +331,7 @@ network::network(program::ptr program, const ExecutionConfig& config, stream::pt if (is_dynamic()) { GPU_DEBUG_DEFINE_MEM_LOGGER("dynamic_network_initialization"); - _kernels_cache = std::unique_ptr(new kernels_cache(program->get_engine(), - program->get_config(), - program->get_id(), - program->get_task_executor(), - kernel_selector::KernelBase::get_db().get_batch_header_str())); - _impls_cache = std::unique_ptr(new ImplementationsCache(_impls_cache_capacity)); _in_mem_kernels_cache = std::unique_ptr(new KernelsCache(_in_mem_kernels_cache_capacity)); - _compilation_context = ICompilationContext::create(program->get_engine(), program->get_config(), program->get_id()); } } @@ -471,8 +464,8 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st } network::~network() { - if (_compilation_context) - 
_compilation_context->cancel(); + if (_program != nullptr) + _program->cancel_compilation_context(); _memory_pool->clear_pool_for_network(net_id); GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 9f7f4d5c42b445..2eb6b3ec55a776 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -321,10 +321,9 @@ bool primitive_inst::update_impl() { // Update param if fake_alignment is available auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params); auto impl_key = get_impl_key(updated_params); - auto& cache = get_network().get_implementations_cache(); + auto& cache = get_network().get_program()->get_implementations_cache(); bool has_cached_impl = false; { - std::lock_guard lock(get_network().get_impl_cache_mutex()); has_cached_impl = cache.has(impl_key); if (has_cached_impl) { _impl = cache.get(impl_key)->clone(); @@ -337,11 +336,13 @@ bool primitive_inst::update_impl() { } if (!has_cached_impl) { if (_dynamic_impl) { - auto& compilation_context = get_network().get_compilation_context(); - compilation_context.push_task(impl_key, [this, updated_params, impl_key](kernels_cache& kc) { - auto& cache = get_network().get_implementations_cache(); + auto& compilation_context = get_network().get_program()->get_compilation_context(); + compilation_context.push_task(impl_key, [this, &compilation_context, updated_params, impl_key]() { + if (compilation_context.is_stopped()) + return; + auto _program = get_network().get_program(); + auto& cache = _program->get_implementations_cache(); { - std::lock_guard lock(get_network().get_impl_cache_mutex()); // Check existense in the cache one more time as several iterations of model execution could happens and multiple compilation // tasks created for same shapes if (cache.has(impl_key)) @@ -349,13 +350,8 @@ bool primitive_inst::update_impl() { } auto impl = _node->type()->choose_impl(*_node, updated_params); - auto kernel_ids = kc.add_kernels_source(impl->get_kernels_source()); - impl->set_kernel_ids(kernel_ids); - kc.compile(); - impl->init_kernels(kc); - kc.reset(); - - std::lock_guard lock(get_network().get_impl_cache_mutex()); + auto kernels = _program->get_kernels_cache().compile(impl->get_kernels_source()); + impl->set_kernels(kernels); cache.add(impl_key, impl->clone()); }); _impl = _dynamic_impl->clone(); @@ -364,13 +360,9 @@ bool primitive_inst::update_impl() { update_shape_info(*_impl_params); } else { _impl = _node->type()->choose_impl(*_node, updated_params); - auto& kernels_cache = get_network().get_kernels_cache(); - auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source()); - _impl->set_kernel_ids(kernel_ids); - kernels_cache.compile(); - _impl->init_kernels(kernels_cache); - kernels_cache.reset(); - std::lock_guard lock(get_network().get_impl_cache_mutex()); + auto& kernels_cache = get_network().get_program()->get_kernels_cache(); + auto kernels = kernels_cache.compile(_impl->get_kernels_source()); + _impl->set_kernels(kernels); cache.add(impl_key, _impl->clone()); auto new_impl_str = _impl != nullptr ? 
_impl->get_kernel_name() : "nullptr"; @@ -526,7 +518,7 @@ void primitive_inst::rebuild_exec_deps( primitive_inst::primitive_inst(network& network) : _network(network) , _node(nullptr) - , _impl_params(nullptr) + , _impl_params(make_unique()) , _impl(nullptr) , _dynamic_impl(nullptr) , _outputs({memory::ptr()}) @@ -707,12 +699,11 @@ event::ptr primitive_inst::update_weights() { } else { GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string() << " to " << expected_layout.to_short_string() << std::endl; - auto& kernels_cache = get_network().get_kernels_cache(); - auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false); - kernels_cache.compile(); - kernel = kernels_cache.get_kernel(kernel_id); + auto& kernels_cache = get_network().get_program()->get_kernels_cache(); + auto kernels = kernels_cache.compile({weights_params.clKernel->code.kernelString}); + OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue"); + kernel = kernels.begin()->second; cache.add(kernel_key, kernel); - kernels_cache.reset(); } auto& stream = get_network().get_stream(); @@ -1158,8 +1149,6 @@ int32_t primitive_inst::get_index_in_deps(memory::cptr arg) const { } void primitive_inst::load(cldnn::BinaryInputBuffer& ib) { - _impl_params.release(); - _impl_params = make_unique(); _impl_params->load(ib); ib.setKernlImplParams(_impl_params.get()); diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 1462f80eb18a26..db9c4436d0f27c 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -18,6 +18,7 @@ #include "program_dump_graph.h" #include "sliding_window_utils.hpp" #include "program_helpers.h" +#include "compilation_context.hpp" #include "matrix_nms_inst.h" #include "roi_pooling_inst.h" @@ -109,17 +110,11 @@ program::program(engine& engine_ref, processing_order(), is_body_program(is_body_program), is_subgroup_local_block_io_supported(-1) { + _config.apply_user_properties(_engine.get_device_info()); init_primitives(); - set_options(); - query_local_block_io_supported(); - _task_executor = make_task_executor(_config); - GPU_DEBUG_INFO << "Program config\n" << config.to_string(); - - pm = std::unique_ptr(new pass_manager(*this)); + init_program(); prepare_nodes(topology); - _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_header_str())); program_node::reset_unique_id(); if (no_optimizations) { @@ -141,15 +136,9 @@ program::program(engine& engine_ref, _task_executor(task_executor), processing_order(), is_subgroup_local_block_io_supported(-1) { + _config.apply_user_properties(_engine.get_device_info()); init_primitives(); - set_options(); - query_local_block_io_supported(); - - _task_executor = make_task_executor(_config); - - _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_header_str())); - pm = std::unique_ptr(new pass_manager(*this)); + init_program(); prepare_nodes(nodes); build_program(is_internal); calc_nodes_hash(); @@ -160,11 +149,35 @@ program::program(engine& engine) _stream(_engine.create_stream({})), _config(), processing_order(), - is_subgroup_local_block_io_supported(-1) { } + is_subgroup_local_block_io_supported(-1) { + _config.apply_user_properties(_engine.get_device_info()); + } + program::~program() { 
query_local_block_io_supported(); } +void program::init_program() { + set_options(); + query_local_block_io_supported(); + + pm = std::unique_ptr(new pass_manager(*this)); + + _task_executor = make_task_executor(_config); + _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, + kernel_selector::KernelBase::get_db().get_batch_header_str())); + + _compilation_context = ICompilationContext::create(make_task_executor_config(_config, + "Task executor config for CompilationContext in GPU plugin")); + + _impls_cache = cldnn::make_unique(_impls_cache_capacity); + // Remove items of compilation context's internal queue when some impl is popped in kernels_cache + // compilation context's queue check duplication of inserted task + _impls_cache->set_remove_item_callback([this](std::pair>& item) { + get_compilation_context().remove_keys({item.first}); + }); +} + void program::init_primitives() { static bool is_initialized = false; if (!is_initialized) { @@ -198,8 +211,8 @@ static void adjust_num_cores(InferenceEngine::CPUStreamsExecutor::Config& config config._streams = std::min(config._streams, num_cores); } -std::shared_ptr program::make_task_executor(const ExecutionConfig& config) const { - InferenceEngine::CPUStreamsExecutor::Config task_executor_config("CPU Tasks executor for GPU plugin", 1); +InferenceEngine::CPUStreamsExecutor::Config program::make_task_executor_config(const ExecutionConfig& config, std::string tags) const { + InferenceEngine::CPUStreamsExecutor::Config task_executor_config(tags, 1); task_executor_config._streams = config.get_property(ov::compilation_num_threads); auto priority = config.get_property(ov::intel_gpu::hint::host_task_priority); switch (priority) { @@ -211,6 +224,11 @@ std::shared_ptr program::make_task_executor adjust_num_cores(task_executor_config); + return task_executor_config; +} + +std::shared_ptr program::make_task_executor(const ExecutionConfig& config) const { + InferenceEngine::CPUStreamsExecutor::Config task_executor_config = make_task_executor_config(config, "CPU Tasks executor for GPU plugin"); return std::make_shared(task_executor_config); } @@ -1713,3 +1731,8 @@ std::pair program::get_estimated_device_mem_usage() { void program::remove_kernel(kernel_id id) { _kernels_cache->remove_kernel(id); } + +void program::cancel_compilation_context() { + if (_compilation_context != nullptr) + _compilation_context->cancel(); +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl index 3dc30c7a88ffa9..d7c86c5bed361c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/fetch_weights.cl @@ -4,27 +4,28 @@ #include "common.cl" -#define GET_FILTER_OS_IS_YX_ISV16_OSV16_INDEX(prefix, o, i, y, x, sub_group_size) \ - CAT(prefix, _OFFSET) + \ - ((o) % (sub_group_size)) + \ - (sub_group_size)*( \ - (x)*(sub_group_size)*CAT(prefix, _X_PITCH) + \ - (y)*(sub_group_size)*CAT(prefix, _Y_PITCH) + \ - ((i) % (sub_group_size)) + \ - ((i) / (sub_group_size))*(sub_group_size)*CAT(prefix, _IFM_PITCH) + \ - ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ +#define GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(prefix, o, i, y, x, osv, isv) \ + get_os_is_zyx_isv_osv_index( \ + o, i, 0, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + 1, \ + CAT(prefix, _IFM_NUM), \ + 
CAT(prefix, _OFM_NUM), \ + osv, \ + isv \ ) -#define GET_FILTER_OS_IS_ZYX_ISV16_OSV16_INDEX(prefix, o, i, z, y, x, sub_group_size) \ - CAT(prefix, _OFFSET) + \ - ((o) % (sub_group_size)) + \ - (sub_group_size)*( \ - (x)*(sub_group_size)*CAT(prefix, _X_PITCH) + \ - (y)*(sub_group_size)*CAT(prefix, _Y_PITCH) + \ - (z)*(sub_group_size)*CAT(prefix, _Z_PITCH) + \ - ((i) % (sub_group_size)) + \ - ((i) / (sub_group_size))*(sub_group_size)*CAT(prefix, _IFM_PITCH) + \ - ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ +#define GET_FILTER_OS_IS_ZYX_ISV_OSV_INDEX(prefix, o, i, z, y, x, osv, isv) \ + get_os_is_zyx_isv_osv_index( \ + o, i, z, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _SIZE_Z), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + osv, \ + isv \ ) #define GET_FILTER_IS_OS_ZYX_ISV16_OSV16_INDEX(prefix, o, i, z, y, x, sub_group_size) \ @@ -85,6 +86,32 @@ CAT(prefix, _OFFSET) \ ) +inline uint get_os_is_zyx_isv_osv_index(uint o, uint i, uint z, uint y, uint x, + uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size) +{ + const uint isv = i % isv_size; + const uint osv = o % osv_size; + const uint is = i / isv_size; + const uint os = o / osv_size; + + const uint x_pitch = osv_size * isv_size; + const uint y_pitch = x_pitch * x_size; + const uint z_pitch = y_pitch * y_size; + const uint is_pitch = z_pitch * z_size; + const uint os_pitch = is_pitch * ((i_size + isv_size - 1) / isv_size); + + const uint output_offset = + osv + + isv * osv_size + + x * x_pitch + + y * y_pitch + + z * z_pitch + + is * is_pitch + + os * os_pitch; + + return output_offset; +} + inline uint get_os_is_zyx_osv_isv_index(uint o, uint i, uint z, uint y, uint x, uint x_size, uint y_size, uint z_size, uint i_size, uint o_size, uint osv_size, uint isv_size) { @@ -329,7 +356,7 @@ inline uint get_os_zyxi_osv16_index(uint o, uint i, uint z, uint y, uint x, uint #define GET_FILTER_INDEX_5D_SAFE(prefix, g, o, i, z, y, x) GET_FILTER_GOIZYX_SAFE(prefix, g, o, i, z, y, x) -#define GET_FILTER_OS_IYX_OSV8_INDEX(prefix, o, i, y, x, sub_group_size) \ +#define GET_FILTER_OS_IYX_OSV_INDEX(prefix, o, i, y, x, sub_group_size) \ CAT(prefix, _OFFSET) + \ ((o) % (sub_group_size)) + \ (sub_group_size)*( \ @@ -339,7 +366,7 @@ inline uint get_os_zyxi_osv16_index(uint o, uint i, uint z, uint y, uint x, uint ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ ) -#define GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(prefix, o, i, y, x, sub_group_size) \ +#define GET_FILTER_OS_IYX_OSV_ROTATE_180_INDEX(prefix, o, i, y, x, sub_group_size) \ CAT(prefix, _OFFSET) + \ ((o) % (sub_group_size)) + \ (sub_group_size)*( \ @@ -1495,16 +1522,6 @@ inline uint get_os_i_yxs_osv_yxsv4_index(uint o, uint i, uint y, uint x, uint i_ CAT(prefix, _SIZE_Y), \ 4) -#define GET_FILTER_OS_IYX_OSV32__AI32_INDEX(prefix, o, i, y, x, sub_group_size) \ - CAT(prefix, _OFFSET) + \ - ((o) % (sub_group_size)) + \ - (sub_group_size)*( \ - (x)*CAT(prefix, _X_PITCH) + \ - (y)*CAT(prefix, _Y_PITCH) + \ - (i)*CAT(prefix, _IFM_PITCH) + \ - ((o) / (sub_group_size))*CAT(prefix, _OFM_PITCH) \ - ) - #define GET_FILTER_G_OS_IYX_OSV16(prefix, g, o, i, y, x, sub_group_size) \ CAT(prefix, _OFFSET) + \ (g * CAT(prefix, _GROUPS_PITCH)) + \ diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl index 582c2f6c6c74df..147ab43e837ee0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl +++ 
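The new get_os_is_zyx_isv_osv_index helper generalizes the old ISV16_OSV16 macros to arbitrary osv/isv block sizes. A host-side copy of the arithmetic with one worked value can serve as a sanity check; the sizes below (3x3 kernel, 32 IFM, 64 OFM, 16x16 blocking) are illustrative only, and the unused o_size argument of the original is omitted:

// Host-side rendition of the offset arithmetic, for a quick sanity check.
#include <cassert>
#include <cstdint>

uint32_t os_is_zyx_isv_osv_index(uint32_t o, uint32_t i, uint32_t z, uint32_t y, uint32_t x,
                                 uint32_t x_size, uint32_t y_size, uint32_t z_size,
                                 uint32_t i_size, uint32_t osv, uint32_t isv) {
    const uint32_t x_pitch  = osv * isv;                              // 256 for 16x16 blocking
    const uint32_t y_pitch  = x_pitch * x_size;
    const uint32_t z_pitch  = y_pitch * y_size;
    const uint32_t is_pitch = z_pitch * z_size;
    const uint32_t os_pitch = is_pitch * ((i_size + isv - 1) / isv);  // ceil(IFM / isv) slices
    return (o % osv) + (i % isv) * osv
         + x * x_pitch + y * y_pitch + z * z_pitch
         + (i / isv) * is_pitch + (o / osv) * os_pitch;
}

int main() {
    // o=17, i=5, z=0, y=1, x=2 -> 1 + 5*16 + 2*256 + 1*768 + 0 + 0 + 1*4608 = 5969
    assert(os_is_zyx_isv_osv_index(17, 5, 0, 1, 2, 3, 3, 1, 32, 16, 16) == 5969);
}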
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl @@ -25,19 +25,20 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x return GET_FILTER_INDEX_5D(INPUT0, 0, o, i, z, y, x); #elif defined INPUT0_LAYOUT_OS_IYX_OSV16 || \ defined INPUT0_LAYOUT_OS_I_OSV16 || \ - defined INPUT0_LAYOUT_OS_I_OSV8__AI8 || \ defined INPUT0_LAYOUT_OS_I_OSV16__AI8 - return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 16); +#elif defined INPUT0_LAYOUT_OS_I_OSV8__AI8 + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 8); #elif defined INPUT0_LAYOUT_IYX_OSV32 - return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 32); #elif defined INPUT0_LAYOUT_OS_IYX_OSV32__AI32 - return GET_FILTER_OS_IYX_OSV32__AI32_INDEX(OUTPUT, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 32); #elif defined INPUT0_LAYOUT_O_IS_YX_ISV16 return GET_FILTER_O_IS_YX_ISV16_INDEX(INPUT0, o, i, y, x, 16); #elif defined INPUT0_LAYOUT_IYX_OSV64 - return GET_FILTER_OS_IYX_OSV8_INDEX(INPUT0, o, i, y, x, 64); + return GET_FILTER_OS_IYX_OSV_INDEX(INPUT0, o, i, y, x, 64); #elif defined INPUT0_LAYOUT_OS_IYX_OSV16_ROTATE_180 - return GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_ROTATE_180_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_I_YXS_OS_YXSV2_OSV16 return GET_FILTER_I_YXS_OS_YXSV2_OSV_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_IY_XS_OS_XSV2_OSV16__AO32 || defined OUTPUT_LAYOUT_IY_XS_OS_XSV2_OSV8__AO32 @@ -61,11 +62,11 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x #elif defined INPUT0_LAYOUT_OS_IS_Y_X8_OSV8_ISV4_SWIZZLED_BY_4 return GET_FILTER_OS_IS_Y_X8_OSV8_ISV4_SWIZZLED_BY_4(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_OS_IS_YX_ISV16_OSV16 - return GET_FILTER_OS_IS_YX_ISV16_OSV16_INDEX(INPUT0, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(INPUT0, o, i, y, x, 16, 16); #elif defined INPUT0_LAYOUT_OIYX_O16 return GET_FILTER_OIYX_O16(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISV16_OSV16 - return GET_FILTER_OS_IS_ZYX_ISV16_OSV16_INDEX(INPUT0, o, i, z, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_ZYX_ISV_OSV_INDEX(INPUT0, o, i, z, y, x, 16, 16); #elif defined INPUT0_LAYOUT_IS_OS_ZYX_ISV16_OSV16 return GET_FILTER_IS_OS_ZYX_ISV16_OSV16_INDEX(INPUT0, o, i, z, y, x, SUB_GROUP_SIZE); #elif defined INPUT0_LAYOUT_IS_OS_YX_ISV16_OSV16 @@ -219,19 +220,20 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint return GET_FILTER_INDEX_5D(OUTPUT, 0, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV16 || \ defined OUTPUT_LAYOUT_OS_I_OSV16 || \ - defined OUTPUT_LAYOUT_OS_I_OSV8__AI8 || \ defined OUTPUT_LAYOUT_OS_I_OSV16__AI8 - return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 16); +#elif defined OUTPUT_LAYOUT_OS_I_OSV8__AI8 + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 8); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV32 - return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 32); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV32__AI32 - return GET_FILTER_OS_IYX_OSV32__AI32_INDEX(OUTPUT, o, i, y, x, 32); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 32); #elif defined 
OUTPUT_LAYOUT_OS_IYX_OSV64 - return GET_FILTER_OS_IYX_OSV8_INDEX(OUTPUT, o, i, y, x, 64); + return GET_FILTER_OS_IYX_OSV_INDEX(OUTPUT, o, i, y, x, 64); #elif defined OUTPUT_LAYOUT_O_IS_YX_ISV16 return GET_FILTER_O_IS_YX_ISV16_INDEX(OUTPUT, o, i, y, x, 16); #elif defined OUTPUT_LAYOUT_OS_IYX_OSV16_ROTATE_180 - return GET_FILTER_OS_IYX_OSV8_ROTATE_180_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IYX_OSV_ROTATE_180_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_I_YXS_OS_YXSV2_OSV16 return GET_FILTER_I_YXS_OS_YXSV2_OSV_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_IY_XS_OS_XSV2_OSV16__AO32 || defined OUTPUT_LAYOUT_IY_XS_OS_XSV2_OSV8__AO32 @@ -313,11 +315,11 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4 return GET_FILTER_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_YX_ISV16_OSV16 - return GET_FILTER_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, o, i, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_YX_ISV_OSV_INDEX(OUTPUT, o, i, y, x, 16, 16); #elif defined OUTPUT_LAYOUT_OS_YXI_OSV16 return GET_FILTER_OS_YXI_OSV16(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISV16_OSV16 - return GET_FILTER_OS_IS_ZYX_ISV16_OSV16_INDEX(OUTPUT, o, i, z, y, x, SUB_GROUP_SIZE); + return GET_FILTER_OS_IS_ZYX_ISV_OSV_INDEX(OUTPUT, o, i, z, y, x, 16, 16); #elif defined OUTPUT_LAYOUT_IS_OS_ZYX_ISV16_OSV16 return GET_FILTER_IS_OS_ZYX_ISV16_OSV16_INDEX(OUTPUT, o, i, z, y, x, SUB_GROUP_SIZE); #elif defined OUTPUT_LAYOUT_IS_OS_YX_ISV16_OSV16 diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index b2370fa7951287..c699b379984c94 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -6,6 +6,7 @@ #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/graph/serialization/string_serializer.hpp" #include "intel_gpu/graph/serialization/utils.hpp" +#include "intel_gpu/graph/serialization/vector_serializer.hpp" #include "intel_gpu/plugin/graph.hpp" #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/plugin/infer_request.hpp" @@ -96,11 +97,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote std::string name; std::string precision; std::string layout; + InferenceEngine::SizeVector dims; ib >> name; ib >> precision; ib >> layout; + ib >> dims; DataPtr input = std::make_shared(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout)); + input->setDims(dims); InputInfo::Ptr infoNew = std::make_shared(); infoNew->setInputData(input); inputs.emplace(std::make_pair(name, infoNew)); @@ -115,11 +119,14 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote std::string name; std::string precision; std::string layout; + InferenceEngine::SizeVector dims; ib >> name; ib >> precision; ib >> layout; + ib >> dims; DataPtr output = std::make_shared(name, Precision::FromStr(precision), cldnn::serial_util::layout_from_string(layout)); + output->setDims(dims); outputs.emplace(std::make_pair(name, output)); } @@ -234,6 +241,9 @@ CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::Remote ib.seekg(pos); auto graph = std::make_shared(ib, context_impl, m_config, n); m_graphs.push_back(graph); + if (n == 0) { + 
ib.setNetwork(graph->GetNetwork().get()); + } } } @@ -317,14 +327,6 @@ IInferRequestInternal::Ptr CompiledModel::CreateInferRequest() { _callbackExecutor); } -bool CompiledModel::is_serializable() { - // Dynamic model serialization is not yet supported. - if (m_graphs[0]->GetNetwork()->is_dynamic()) - return false; - - return true; -} - // Cache blob format: // [ ConstInputsDataMap / ConstOutputsDataMap ] // [ ov::Node::Input/ ov::Node::Output ] @@ -334,9 +336,6 @@ void CompiledModel::Export(std::ostream& networkModel) { if (m_graphs.empty()) IE_THROW(NetworkNotLoaded); - if (!is_serializable()) - return; - cldnn::BinaryOutputBuffer ob(networkModel); // InputsInfo and OutputsInfo for CNNNetwork @@ -350,6 +349,7 @@ void CompiledModel::Export(std::ostream& networkModel) { std::stringstream ss; ss << in.second->getInputData()->getLayout(); ob << ss.str(); + ob << in.second->getTensorDesc().getDims(); } ob << GetOutputsInfo().size(); @@ -361,6 +361,7 @@ void CompiledModel::Export(std::ostream& networkModel) { std::stringstream ss; ss << out.second->getLayout(); ob << ss.str(); + ob << out.second->getTensorDesc().getDims(); } } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 54630c5384aa40..aaced7fdc61dc6 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -142,12 +142,6 @@ Plugin::Plugin() : m_default_contexts({}) { m_default_contexts.insert({device.first, ctx}); } } - - if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) { - if (env_p[0] == '1') { - isModelCachingEnabled = true; - } - } } auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) { @@ -204,6 +198,9 @@ IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const InferenceEngine { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::LoadExeNetworkImpl::CreateExeNetwork"); CompiledModel::Ptr exeNetwork = std::make_shared(transformedNetwork, context, config); + if (exeNetwork->m_graphs[0]->GetNetwork()->is_dynamic()) { + isModelCachingEnabled = false; + } update_memory_statistics(context->get_impl()); return exeNetwork; } diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp index 7ca5f1acb3c0cd..6aaa2f7385df4c 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp @@ -54,7 +54,7 @@ std::string reorder_options(const std::string& org_options) { } // namespace namespace cldnn { - +std::atomic kernels_cache::_kernel_idx{0}; std::mutex kernels_cache::_mutex; std::string kernels_cache::get_cache_path() const { @@ -70,10 +70,8 @@ std::string kernels_cache::get_cache_path() const { } bool kernels_cache::is_cache_enabled() const { - if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) { - if (env_p[0] == '1') { - return false; - } + if (!_config.get_property(ov::intel_gpu::allow_new_shape_infer)) { + return false; } return !_config.get_property(ov::cache_dir).empty(); @@ -191,7 +189,7 @@ static std::vector getProgramBinaries(cl::Program program) { } // TODO: This build_batch method should be backend specific -void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch) { +void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch, std::map& compiled_kernels) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::build_batch"); auto& cl_build_engine = 
dynamic_cast(build_engine); @@ -288,7 +286,7 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& cl_context context = cl_build_engine.get_cl_context().get(); kernel::ptr kernel = kernels_factory::create(_engine, context, kern, entry_point); const auto& kmap = std::make_pair(k_id->second, kernel); - _kernels.insert(kmap); + compiled_kernels.insert(kmap); } else { throw std::runtime_error("Could not find entry point"); } @@ -393,7 +391,7 @@ void kernels_cache::build_all() { auto& batch = batches[idx]; tasks.push_back([this, &_build_engine, &batch, &exception] { try { - build_batch(_build_engine, batch); + build_batch(_build_engine, batch, _kernels); } catch(...) { exception = std::current_exception(); } @@ -407,7 +405,7 @@ void kernels_cache::build_all() { } } else { for (size_t idx = 0; idx < batches.size(); idx++) { - build_batch(_build_engine, batches[idx]); + build_batch(_build_engine, batches[idx], _kernels); } } @@ -438,10 +436,7 @@ std::vector kernels_cache::add_kernels_source(std::vector lock(_mutex); auto kernel_string = kernel_sources[i]; - // we need unique id in order to avoid conflict across topologies. - const auto kernel_num = _kernels.size() + (_kernel_idx++); - kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num); - + kernel_id id = gen_kernel_id(kernel_string->entry_point); auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program); assert(_kernels.find(id) == _kernels.end()); @@ -459,37 +454,10 @@ void kernels_cache::add_kernels(const std::vector& kernel_ids, cons for (size_t i = 0; i < kernel_ids.size(); i++) { const auto& kmap = std::make_pair(kernel_ids[i], kernels[i]); _kernels.insert(kmap); + _kernel_idx++; } } -void kernels_cache::compile() { - OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildAll"); - - std::unique_ptr _build_engine = nullptr; - if (_engine.type() == engine_types::ocl) { - _build_engine = std::unique_ptr(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl)); - } - - // create batches - std::vector batches; - get_program_source(_kernels_code, &batches); - - // build batches - for (size_t idx = 0; idx < batches.size(); idx++) { - build_batch(*_build_engine, batches[idx]); - } - - _kernels_code.clear(); - _pending_compilation = false; -#if defined(__unix__) && !defined(__ANDROID__) - // NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system thought they are freed. - // (It is at least 500 MB when we perform parallel compilation) - // It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory. - // Also, this is not happening in Windows. - // So, added malloc_trim for linux build until we figure out a better solution. 
- malloc_trim(0); -#endif -} void kernels_cache::save(BinaryOutputBuffer& ob) const { OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type"); @@ -572,6 +540,7 @@ void kernels_cache::load(BinaryInputBuffer& ib) { cl_context cl_context = build_engine->get_cl_context().get(); kernel::ptr kernel = kernels_factory::create(_engine, cl_context, cl_kernel, entry_point); _kernels.insert({k_id->second, kernel}); + _kernel_idx++; } } } @@ -584,4 +553,41 @@ void kernels_cache::load(BinaryInputBuffer& ib) { } } +std::map kernels_cache::compile(std::vector> kernel_sources, + bool dump_custom_program) { + OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::Compile_ThreadSafe"); + kernels_code t_kernels_code; + + // Get kernels code from kernel sources + for (size_t idx = 0; idx < kernel_sources.size(); ++idx) { + auto kernel_string = kernel_sources[idx]; + kernel_id id = gen_kernel_id(kernel_string->entry_point); + t_kernels_code.emplace(kernel_string, id, dump_custom_program); + } + + ocl::ocl_engine& _build_engine = downcast(_engine); + + // Create batches + std::vector batches; + get_program_source(t_kernels_code, &batches); + + std::map output_kernels; + // Build batches + for (size_t idx = 0; idx < batches.size(); ++idx) { + build_batch(_build_engine, batches[idx], output_kernels); + } + + t_kernels_code.clear(); +#if defined(__unix__) && !defined(__ANDROID__) + // NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system thought they are freed. + // (It is at least 500 MB when we perform parallel compilation) + // It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory. + // Also, this is not happening in Windows. + // So, added malloc_trim for linux build until we figure out a better solution. + malloc_trim(0); +#endif + + return output_kernels; +} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp index aa2e1187a3b5f5..79f9ad625d3197 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp @@ -22,6 +22,7 @@ #include "ocl/ocl_engine.hpp" namespace cldnn { + class kernels_cache { public: using source_code = std::vector; @@ -81,18 +82,23 @@ class kernels_cache { ExecutionConfig _config; uint32_t _prog_id = 0; kernels_code _kernels_code; - size_t _kernel_idx = 0; + static std::atomic _kernel_idx; std::atomic _pending_compilation{false}; std::map _kernels; std::vector batch_header_str; void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; - void build_batch(const engine& build_engine, const batch_program& batch); + void build_batch(const engine& build_engine, const batch_program& batch, std::map& compiled_kernels); std::string get_cache_path() const; bool is_cache_enabled() const; size_t get_max_kernels_per_batch() const; + inline std::string gen_kernel_id(std::string entry_point) { + // we need unique id in order to avoid conflict across topologies. 
+ return entry_point + "_" + std::to_string((_kernel_idx++)); + } + public: explicit kernels_cache(engine& engine, const ExecutionConfig& config, @@ -116,9 +122,9 @@ class kernels_cache { } std::vector add_kernels_source(std::vector> kernel_sources, bool dump_custom_program = false); void add_kernels(const std::vector& kernel_ids, const std::vector& kernels); - void compile(); void save(BinaryOutputBuffer& ob) const; void load(BinaryInputBuffer& ib); + std::map compile(std::vector> kernel_sources, bool dump_custom_program = false); }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp index 50b928dc25ef98..f4c9e20844abe7 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp @@ -211,6 +211,10 @@ clEnqueueMemFillINTEL_fn)( #define CL_DEVICE_UUID_KHR 0x106A +#endif // cl_khr_device_uuid + +#ifndef OV_GPU_USE_OPENCL_HPP + // for C++ wrappers using uuid_array = std::array; @@ -220,7 +224,7 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_UUID_KHR, uuid_array) } // namespace detail } // namespace cl -#endif // cl_khr_device_uuid +#endif // OV_GPU_USE_OPENCL_HPP /*************************************************************** * cl_intel_device_attribute_query diff --git a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp index 59950f81cc9296..d88a740f441c80 100644 --- a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp @@ -1717,14 +1717,14 @@ TEST_P(conv_swap_xy_with_eltwise_diff_sizes, basic) { // in_shape; out_shape; eltw_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; #define CASE_CONV_ELTW_FP16_SWAP_XY_1 { 1, 16, 1, 5 }, { 1, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx #define CASE_CONV_ELTW_FP16_SWAP_XY_2 { 1, 16, 1, 5 }, { 1, 32, 1, 7 }, { 1, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx -#define CASE_CONV_ELTW_FP32_SWAP_XY_1 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_SWAP_XY_2 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 3, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP16_SWAP_XY_3 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx +#define CASE_CONV_ELTW_FP16_SWAP_XY_4 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 3, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_swap_xy_with_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_1, 3, 3, 4 }, - conv_eltw_test_params{ 
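gen_kernel_id in kernels_cache now derives ids from a static std::atomic counter instead of the previous per-instance _kernel_idx combined with _kernels.size(), which keeps ids unique when several threads call the new compile() concurrently. A stripped-down sketch of that scheme; the kernel name in main is just an example string:

// Sketch of the atomic id scheme; not the plugin's class, just the idea.
#include <atomic>
#include <cstddef>
#include <iostream>
#include <string>

static std::atomic<size_t> kernel_idx{0};

std::string gen_kernel_id(const std::string& entry_point) {
    // Entry point names can repeat across topologies; the counter disambiguates them.
    return entry_point + "_" + std::to_string(kernel_idx++);
}

int main() {
    std::cout << gen_kernel_id("fully_connected_gpu_bf_tiled") << "\n";  // ..._0
    std::cout << gen_kernel_id("fully_connected_gpu_bf_tiled") << "\n";  // ..._1
}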
CASE_CONV_ELTW_FP16_SWAP_XY_2, 3, 3, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_SWAP_XY_1, 3, 3, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_SWAP_XY_2, 3, 3, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_1, 3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_2, 3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_3, 3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_4, 3, 2, 4 }, })); class conv_scale_activation_eltwise_fp32_quantize_i8 : public ConvEltwTest {}; diff --git a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp index e6becbacb9e6c6..0b3ebc78d1ada9 100644 --- a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp @@ -151,6 +151,9 @@ class FullyConnectedFusingTestOneDNN : public BaseFusingTest{ + fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 }, +})); + class fc_int8_quantize_u8 : public FullyConnectedFusingTest {}; TEST_P(fc_int8_quantize_u8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input", get_input_layout(p)), @@ -272,7 +307,7 @@ TEST_P(fc_int8_quantize_u8, basic) { execute(p); } -INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesIn(std::vector{ +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_quantize_u8, ::testing::ValuesIn(std::vector{ fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 }, fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 }, fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 }, @@ -283,6 +318,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesI class fc_int8_eltwise_quantize_i8 : public FullyConnectedFusingTest {}; TEST_P(fc_int8_eltwise_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input", get_input_layout(p)), @@ -315,6 +353,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_eltwise_quantize_i8, ::testing::Va class fc_int8_eltwise_activation_quantize_i8 : public FullyConnectedFusingTest {}; TEST_P(fc_int8_eltwise_activation_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input", get_input_layout(p)), diff --git a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp index 96f37db8fba43d..78a5781e93bf85 100644 --- a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp +++ b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp @@ -34,6 +34,10 @@ class BaseFusingTest : public ::testing::TestWithParam { cfg_fused.set_property(ov::intel_gpu::optimize_data(true)); cfg_not_fused.set_property(ov::intel_gpu::optimize_data(false)); cfg_not_fused.set_property(ov::intel_gpu::allow_static_input_reorder(true)); + if (engine.get_device_info().supports_immad) { + cfg_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); + 
cfg_not_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); + } } void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) { @@ -74,20 +78,15 @@ class BaseFusingTest : public ::testing::TestWithParam { ASSERT_EQ(outputs_ref.size(), outputs_fused.size()); ASSERT_EQ(outputs_ref.size(), size_t(1)); - auto output_not_fused_prim = outputs_ref.begin()->second.get_memory(); - auto output_fused_prim = outputs_fused.begin()->second.get_memory(); - if (output_not_fused_prim->get_layout().data_type == data_types::f32) { - cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); - cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); - for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { - ASSERT_NEAR(ref[i], output_ptr[i], tolerance) << "i = " << i; - } - } else { - cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); - cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); - for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { - ASSERT_NEAR(half_to_float(ref[i]), half_to_float(output_ptr[i]), tolerance) << "i = " << i; - } + auto val_ref=get_output_values_to_float(not_fused, outputs_ref.begin()->first); + auto val_opt=get_output_values_to_float(fused, outputs_fused.begin()->first); + ASSERT_EQ(val_ref.size(), val_opt.size()); + for (size_t i = 0; i < val_ref.size(); i++) { + ASSERT_NEAR(val_ref[i], val_opt[i], tolerance) + << "tolerance = " << tolerance + << "\ni = " << i + << "\nref[i] = " << val_ref[i] + << "\nopt[i] = " << val_opt[i]; } } diff --git a/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp index 33dd5009be0367..40bb22589ccb84 100644 --- a/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp @@ -74,11 +74,6 @@ class GemmFusingTest : public ::BaseFusingTest { } layout get_per_channel_layout(gemm_test_params& p) { - // WA: per channel binary post-operation is not supported for onednn gemm. Use single value for such case. - if (engine.get_device_info().supports_immad){ - std::cout << "per_channel layout for onednn gemm not supported." 
<< std::endl; - return layout{p.default_type, p.default_format, tensor{1, 1, 1, 1}}; - } return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shapes.at(0).feature[0], 1, 1 } }; } @@ -129,6 +124,9 @@ class GemmFusingTest : public ::BaseFusingTest { class gemm_3in_quantize_i8 : public GemmFusingTest {}; TEST_P(gemm_3in_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input0", get_input_layout(p, 0)), @@ -279,6 +277,9 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_scale, ::testing::ValuesIn(std::v class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; TEST_P(gemm_2in_act_scale_quantize_i8, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); create_topologies( input_layout("input0", get_input_layout(p, 0)), diff --git a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp index 2c9d7d87750bba..736392016e2730 100644 --- a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp @@ -120,6 +120,9 @@ class ReduceFusingTest : public ::BaseFusingTest { class reduce_eltwise_activation_quantize : public ReduceFusingTest {}; TEST_P(reduce_eltwise_activation_quantize, basic) { + // TODO: Fix me, refer PR(#15873) + if (engine.get_device_info().supports_immad) + return; auto p = GetParam(); update_out_shape(p); create_topologies( diff --git a/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp index cad2927bbbfabf..3aa5d30d3c780e 100644 --- a/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/strided_slice_fusion_test.cpp @@ -21,6 +21,7 @@ struct strided_slice_test_params { data_types input_type; format input_format; size_t expected_fused_primitives; + size_t expected_fused_primitives_onednn; size_t expected_not_fused_primitives; std::vector> activation_func_list; }; @@ -64,6 +65,8 @@ TEST_P(strided_slice_activation, basic) { std::vector strides_data = { 1, 1, 1, 1 }; auto p = GetParam(); + if (engine.get_device_info().supports_immad) + p.expected_fused_primitives = p.expected_fused_primitives_onednn; create_topologies( input_layout("input", get_input_layout(p)), strided_slice("strided_slice", input_info("input"), begin_data, end_data, strides_data, {}, {}, {}, {}, {}, { 1, 8, 1, 1 }) @@ -84,7 +87,7 @@ TEST_P(strided_slice_activation, basic) { } INSTANTIATE_TEST_SUITE_P(fusings_gpu, strided_slice_activation, ::testing::ValuesIn(std::vector{ - strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 4, {{ activation_func::clamp, { } }, { activation_func::exp, { } }} }, - strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 3, {{ activation_func::logistic, { } } } }, - strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 3, {{ activation_func::hyperbolic_tan, { } } } }, + strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 2, 4, {{ activation_func::clamp, { } }, { activation_func::exp, { } }} }, + strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 2, 3, {{ activation_func::logistic, { } } } }, + strided_slice_test_params{ CASE_STRIDED_SLICE_F16_1, 2, 3, 3, {{ activation_func::hyperbolic_tan, { } } } }, })); diff --git a/src/plugins/intel_gpu/tests/passes/remove_redundant_reorders_tests.cpp 
b/src/plugins/intel_gpu/tests/passes/remove_redundant_reorders_tests.cpp new file mode 100644 index 00000000000000..b2c62a5506b302 --- /dev/null +++ b/src/plugins/intel_gpu/tests/passes/remove_redundant_reorders_tests.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include "intel_gpu/runtime/engine.hpp" + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "data_inst.h" +#include "convolution_inst.h" +#include "reorder_inst.h" +#include "softmax_inst.h" + +#include "pass_manager.h" +#include "to_string_utils.h" + +#include "program_wrapper.h" + +#include + +using namespace cldnn; +using namespace ::tests; + +TEST(remove_redundant_reorders, remove_dep_dynamic) { + // Topology: + // convolution -> reorder -> softmax + // + // Expectation: + // The preferred format of convolution should be selected as b_fs_yx_fsv16 (reorder_inputs) + // A new reorder that converts to bfyx should be inserted after convolution (reorder_inputs) + // In reorders, output format of dependency reorder should be saved as output_format of original reorder (remove_redundant_reorders) + + auto& engine = get_test_engine(); + auto input_layout_dynamic = layout{ov::PartialShape{1, 3, ov::Dimension::dynamic(), ov::Dimension::dynamic()}, + data_types::f16, format::bfyx}; + auto input = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 3, 224, 224 } }); + auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 64, 3, 7, 7 } }); + + topology topology; + topology.add(data("weights", weights)); + topology.add(input_layout("input", input_layout_dynamic)); + topology.add(convolution("conv", input_info("input"), { "weights" })); + topology.add(reorder("reorder", input_info("conv"), format::any, data_types::f32)); + topology.add(softmax("softmax", input_info("reorder"), 1)); + + ExecutionConfig config; + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + network.set_input_data("input", input); + + network.execute(); + + auto prog = network.get_program(); + ASSERT_NE(prog, nullptr); + auto& softmax_node = prog->get_node("softmax"); + auto softmax_layout = softmax_node.get_output_layout(); + + ASSERT_EQ(softmax_layout.format.value, format::bfyx); +} diff --git a/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp index 052f7b206b9527..22ff33c3fd5c0c 100644 --- a/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/lru_caches_gpu_test.cpp @@ -23,7 +23,7 @@ TEST(lru_cache, basic_data_type) input_values.push_back(std::make_pair(i, i + 10)); } - ASSERT_EQ(ca.get_lru_element(), int()); + ASSERT_EQ(ca.get_lru_element().second, int()); std::vector expected_hitted = {false, false, false, false, true, true, false}; for (size_t i = 0; i < input_values.size(); i++) { @@ -95,7 +95,7 @@ TEST(lru_cache, custom_data_type) { std::vector expected_hitted = {false, false, false, false, true, true, true, false}; - ASSERT_EQ(ca.get_lru_element(), std::shared_ptr()); + ASSERT_EQ(ca.get_lru_element().second, std::shared_ptr()); for (size_t i = 0; i < inputs.size(); i++) { auto& in = inputs[i]; std::shared_ptr p_data; diff --git a/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp b/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp index
b8f9396d137931..e71a2b32bc53e3 100644 --- a/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp +++ b/src/plugins/intel_gpu/tests/test_utils/test_utils.cpp @@ -287,7 +287,12 @@ std::vector> generic_test::generate_generic_test_pa } std::shared_ptr create_test_engine() { - return cldnn::engine::create(engine_types::ocl, runtime_types::ocl); + auto ret = cldnn::engine::create(engine_types::ocl, runtime_types::ocl); +#ifdef ENABLE_ONEDNN_FOR_GPU + if(ret->get_device_info().supports_immad) + ret->create_onednn_engine({}); +#endif + return ret; } cldnn::engine& get_test_engine() { diff --git a/src/plugins/intel_gpu/tests/test_utils/test_utils.h b/src/plugins/intel_gpu/tests/test_utils/test_utils.h index 6993939d3e8c1d..d46935aee5384d 100644 --- a/src/plugins/intel_gpu/tests/test_utils/test_utils.h +++ b/src/plugins/intel_gpu/tests/test_utils/test_utils.h @@ -589,6 +589,26 @@ std::vector get_output_values_to_float(network& net, const primitive_id& ret.push_back(mem[i]); return ret; } + +inline std::vector get_output_values_to_float(network& net, const primitive_id& output_id, size_t max_cnt = std::numeric_limits::max()) { + switch(net.get_output_layout(output_id).data_type){ + case data_types::f16: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::f32: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::i8: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::u8: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::i32: + return get_output_values_to_float(net, output_id, max_cnt); + case data_types::i64: + return get_output_values_to_float(net, output_id, max_cnt); + default: + IE_THROW() << "Unknown output data_type"; + } +} + double default_tolerance(data_types dt); // inline void print_bin_blob(cldnn::memory& mem, std::string name) // { diff --git a/src/plugins/template/backend/CMakeLists.txt b/src/plugins/template/backend/CMakeLists.txt index 320fec922ecbae..04b16c14885a93 100644 --- a/src/plugins/template/backend/CMakeLists.txt +++ b/src/plugins/template/backend/CMakeLists.txt @@ -28,7 +28,6 @@ add_library(openvino::interpreter_backend ALIAS interpreter_backend) if(CMAKE_COMPILER_IS_GNUCXX) ie_add_compiler_flags(-Wno-missing-declarations) - ie_add_compiler_flags(-Wno-sign-compare) endif() ie_faster_build(interpreter_backend UNITY) diff --git a/src/plugins/template/src/async_infer_request.cpp b/src/plugins/template/src/async_infer_request.cpp index 74d3cfae77a10c..f92f259ad7bd2f 100644 --- a/src/plugins/template/src/async_infer_request.cpp +++ b/src/plugins/template/src/async_infer_request.cpp @@ -9,10 +9,11 @@ #include "template_itt.hpp" // ! 
[async_infer_request:ctor] -TemplatePlugin::AsyncInferRequest::AsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& wait_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor) +TemplatePlugin::AsyncInferRequest::AsyncInferRequest( + const std::shared_ptr& request, + const std::shared_ptr& task_executor, + const std::shared_ptr& wait_executor, + const std::shared_ptr& callback_executor) : ov::IAsyncInferRequest(request, task_executor, callback_executor), m_wait_executor(wait_executor) { // In current implementation we have CPU only tasks and no needs in 2 executors diff --git a/src/plugins/template/src/async_infer_request.hpp b/src/plugins/template/src/async_infer_request.hpp index 6a049a373b4981..a15d7ee388a5a5 100644 --- a/src/plugins/template/src/async_infer_request.hpp +++ b/src/plugins/template/src/async_infer_request.hpp @@ -16,14 +16,14 @@ namespace TemplatePlugin { class AsyncInferRequest : public ov::IAsyncInferRequest { public: AsyncInferRequest(const std::shared_ptr& request, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, - const InferenceEngine::ITaskExecutor::Ptr& wait_executor, - const InferenceEngine::ITaskExecutor::Ptr& callback_executor); + const std::shared_ptr& task_executor, + const std::shared_ptr& wait_executor, + const std::shared_ptr& callback_executor); ~AsyncInferRequest(); private: - InferenceEngine::ITaskExecutor::Ptr m_wait_executor; + std::shared_ptr m_wait_executor; }; // ! [async_infer_request:header] diff --git a/src/plugins/template/src/compiled_model.cpp b/src/plugins/template/src/compiled_model.cpp index 6ea87593020f2f..d08a8352af47d4 100644 --- a/src/plugins/template/src/compiled_model.cpp +++ b/src/plugins/template/src/compiled_model.cpp @@ -79,7 +79,7 @@ void fill_output_info(const ov::Output& output, InferenceEngine::DataP // ! [executable_network:ctor_cnnnetwork] TemplatePlugin::CompiledModel::CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, + const std::shared_ptr& task_executor, const Configuration& cfg) : ov::ICompiledModel(model, plugin, task_executor), // Disable default threads creation _cfg(cfg), diff --git a/src/plugins/template/src/compiled_model.hpp b/src/plugins/template/src/compiled_model.hpp index 82e4455cf18582..a138f3f6b68008 100644 --- a/src/plugins/template/src/compiled_model.hpp +++ b/src/plugins/template/src/compiled_model.hpp @@ -24,7 +24,7 @@ class CompiledModel : public ov::ICompiledModel { public: CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, - const InferenceEngine::ITaskExecutor::Ptr& task_executor, + const std::shared_ptr& task_executor, const Configuration& cfg); // Methods from a base class ov::ICompiledModel diff --git a/src/plugins/template/src/plugin.cpp b/src/plugins/template/src/plugin.cpp index 41cc097551e79b..5c3aa091d4ae5b 100644 --- a/src/plugins/template/src/plugin.cpp +++ b/src/plugins/template/src/plugin.cpp @@ -35,7 +35,7 @@ Plugin::Plugin() { _backend = ngraph::runtime::Backend::create(); // create default stream executor with a given name - _waitExecutor = get_executor_manager()->getIdleCPUStreamsExecutor({wait_executor_name}); + _waitExecutor = get_executor_manager()->get_idle_cpu_streams_executor({wait_executor_name}); } // ! 
[plugin:ctor] @@ -91,12 +91,12 @@ std::shared_ptr TemplatePlugin::Plugin::compile_model(const auto fullConfig = Configuration{properties, _cfg}; auto streamsExecutorConfig = - InferenceEngine::IStreamsExecutor::Config::MakeDefaultMultiThreaded(fullConfig._streamsExecutorConfig); + ov::threading::IStreamsExecutor::Config::make_default_multi_threaded(fullConfig._streamsExecutorConfig); streamsExecutorConfig._name = stream_executor_name; auto compiled_model = std::make_shared(model->clone(), shared_from_this(), - get_executor_manager()->getIdleCPUStreamsExecutor(streamsExecutorConfig), + get_executor_manager()->get_idle_cpu_streams_executor(streamsExecutorConfig), fullConfig); return compiled_model; } @@ -136,7 +136,7 @@ std::shared_ptr TemplatePlugin::Plugin::import_model(std::is auto compiled_model = std::make_shared(ov_model, shared_from_this(), - get_executor_manager()->getIdleCPUStreamsExecutor(streamsExecutorConfig), + get_executor_manager()->get_idle_cpu_streams_executor(streamsExecutorConfig), fullConfig); return compiled_model; } diff --git a/src/plugins/template/src/plugin.hpp b/src/plugins/template/src/plugin.hpp index 1c45317522bc6b..aa5b9077312244 100644 --- a/src/plugins/template/src/plugin.hpp +++ b/src/plugins/template/src/plugin.hpp @@ -8,6 +8,7 @@ #include "compiled_model.hpp" #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/iplugin.hpp" +#include "openvino/runtime/threading/itask_executor.hpp" #include "template_config.hpp" //! [plugin:header] @@ -50,7 +51,7 @@ class Plugin : public ov::IPlugin { std::shared_ptr _backend; Configuration _cfg; - InferenceEngine::ITaskExecutor::Ptr _waitExecutor; + std::shared_ptr _waitExecutor; }; } // namespace TemplatePlugin diff --git a/src/plugins/template/src/template_config.cpp b/src/plugins/template/src/template_config.cpp index 582b1c6d0589bb..16ae7092842798 100644 --- a/src/plugins/template/src/template_config.cpp +++ b/src/plugins/template/src/template_config.cpp @@ -16,16 +16,17 @@ Configuration::Configuration() {} Configuration::Configuration(const ConfigMap& config, const Configuration& defaultCfg, bool throwOnUnsupported) { *this = defaultCfg; // If plugin needs to use InferenceEngine::StreamsExecutor it should be able to process its configuration - auto streamExecutorConfigKeys = _streamsExecutorConfig.SupportedKeys(); + auto streamExecutorConfigKeys = + _streamsExecutorConfig.get_property(ov::supported_properties.name()).as>(); for (auto&& c : config) { const auto& key = c.first; const auto& value = c.second; if (ov::template_plugin::throughput_streams == key) { - _streamsExecutorConfig.SetConfig(CONFIG_KEY(CPU_THROUGHPUT_STREAMS), value.as()); + _streamsExecutorConfig.set_property(CONFIG_KEY(CPU_THROUGHPUT_STREAMS), value); } else if (streamExecutorConfigKeys.end() != std::find(std::begin(streamExecutorConfigKeys), std::end(streamExecutorConfigKeys), key)) { - _streamsExecutorConfig.SetConfig(key, value.as()); + _streamsExecutorConfig.set_property(key, value); } else if (CONFIG_KEY(DEVICE_ID) == key) { deviceId = std::stoi(value.as()); if (deviceId > 0) { @@ -42,11 +43,12 @@ Configuration::Configuration(const ConfigMap& config, const Configuration& defau } } -InferenceEngine::Parameter Configuration::Get(const std::string& name) const { - auto streamExecutorConfigKeys = _streamsExecutorConfig.SupportedKeys(); +ov::Any Configuration::Get(const std::string& name) const { + auto streamExecutorConfigKeys = + _streamsExecutorConfig.get_property(ov::supported_properties.name()).as>(); if 
((streamExecutorConfigKeys.end() != std::find(std::begin(streamExecutorConfigKeys), std::end(streamExecutorConfigKeys), name))) { - return _streamsExecutorConfig.GetConfig(name); + return _streamsExecutorConfig.get_property(name); } else if (name == CONFIG_KEY(DEVICE_ID)) { return {std::to_string(deviceId)}; } else if (name == CONFIG_KEY(PERF_COUNT)) { @@ -54,7 +56,7 @@ InferenceEngine::Parameter Configuration::Get(const std::string& name) const { } else if (name == ov::template_plugin::throughput_streams || name == CONFIG_KEY(CPU_THROUGHPUT_STREAMS)) { return {std::to_string(_streamsExecutorConfig._streams)}; } else if (name == CONFIG_KEY(CPU_BIND_THREAD)) { - return const_cast(_streamsExecutorConfig).GetConfig(name); + return _streamsExecutorConfig.get_property(name); } else if (name == CONFIG_KEY(CPU_THREADS_NUM)) { return {std::to_string(_streamsExecutorConfig._threads)}; } else if (name == CONFIG_KEY_INTERNAL(CPU_THREADS_PER_STREAM)) { diff --git a/src/plugins/template/src/template_config.hpp b/src/plugins/template/src/template_config.hpp index 74b578546e7241..203633123977f1 100644 --- a/src/plugins/template/src/template_config.hpp +++ b/src/plugins/template/src/template_config.hpp @@ -4,11 +4,11 @@ #pragma once -#include #include -#include #include -#include + +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" namespace TemplatePlugin { @@ -26,13 +26,13 @@ struct Configuration { const Configuration& defaultCfg = {}, const bool throwOnUnsupported = true); - InferenceEngine::Parameter Get(const std::string& name) const; + ov::Any Get(const std::string& name) const; // Plugin configuration parameters int deviceId = 0; bool perfCount = true; - InferenceEngine::IStreamsExecutor::Config _streamsExecutorConfig; + ov::threading::IStreamsExecutor::Config _streamsExecutorConfig; ov::hint::PerformanceMode performance_mode = ov::hint::PerformanceMode::UNDEFINED; }; // ! 
[configuration:header] diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp index f178723e025e86..64468aa0cece75 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/caching_tests.cpp @@ -48,15 +48,15 @@ namespace { INSTANTIATE_TEST_SUITE_P(smoke_KernelCachingSupportCase_GPU, CompiledKernelsCacheTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), - ::testing::Values(std::make_pair(ov::AnyMap{}, "cl_cache"))), + ::testing::Values(std::make_pair(ov::AnyMap{}, "blob"))), CompiledKernelsCacheTest::getTestCaseName); auto autoConfigs = []() { return std::vector>{ - std::make_pair(ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_GPU)}}, "cl_cache"), + std::make_pair(ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_GPU)}}, "blob"), std::make_pair( ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_GPU, CommonTestUtils::DEVICE_CPU)}}, - "blob,cl_cache"), + "blob"), std::make_pair( ov::AnyMap{{ov::device::priorities(CommonTestUtils::DEVICE_CPU, CommonTestUtils::DEVICE_GPU)}}, "blob")}; diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp index 1c10cea5ffe8ca..92dc383ad49c82 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/plugin/caching_tests.cpp @@ -46,7 +46,7 @@ namespace { INSTANTIATE_TEST_SUITE_P(smoke_KernelCachingSupportCase_GPU, LoadNetworkCompiledKernelsCacheTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_GPU), - ::testing::Values(std::make_pair(std::map(), "cl_cache"))), + ::testing::Values(std::make_pair(std::map(), "blob"))), LoadNetworkCompiledKernelsCacheTest::getTestCaseName); typedef std::map conftype; @@ -54,10 +54,10 @@ namespace { return std::vector>{ std::make_pair(conftype{{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, CommonTestUtils::DEVICE_GPU}}, - "cl_cache"), + "blob"), std::make_pair(conftype{{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, (std::string(CommonTestUtils::DEVICE_GPU) + "," + CommonTestUtils::DEVICE_CPU)}}, - "blob,cl_cache"), + "blob"), std::make_pair(conftype{{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, (std::string(CommonTestUtils::DEVICE_CPU) + "," + CommonTestUtils::DEVICE_GPU)}}, "blob")}; diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp index 176e1e7dd35cc5..cd11d6a444501e 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp @@ -118,7 +118,7 @@ std::vector disabledTestPatterns() { R"(.*smoke_select_CompareWithRefsNumpy_dynamic_range.*)", R"(.*CachingSupportCase.*LoadNetworkCacheTestBase.*CompareWithRefImpl.*)", #if defined(_WIN32) || defined(_WIN64) - R"(.*Auto_KernelCachingSupportCase.*CanCreateCacheDirAndDumpBinariesUnicodePath.*)", + R"(.*KernelCachingSupportCase.*CanCreateCacheDirAndDumpBinariesUnicodePath.*)", #endif 
R"(.*CachingSupportCase.*GPU.*CompileModelCacheTestBase.*CompareWithRefImpl.*)", // Currently 1D convolution has an issue diff --git a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp index f27d9691ba9e50..7e38844f08b9ca 100644 --- a/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp +++ b/src/tests/functional/plugin/gpu/single_layer_tests/dynamic/split.cpp @@ -16,7 +16,7 @@ namespace GPULayerTestsDefinitions { typedef std::tuple< size_t, // Num splits - size_t, // Axis + int64_t, // Axis ElementType, // Net precision InputShape, // Input shapes std::vector // Used outputs indices @@ -52,7 +52,8 @@ class SplitLayerGPUDynamicTest : public testing::WithParamInterface outIndices; ElementType netPrecision; @@ -127,7 +128,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_SplitsCheck6D, SplitLayerGPUDynamicTest, SplitLayerGPUDynamicTest::getTestCaseName); typedef std::tuple< - size_t, // Axis + int64_t, // Axis std::vector, // SplitLength ElementType, // Net precision InputShape // Input shapes @@ -138,7 +139,7 @@ class VariadicSplitLayerGPUDynamicTest : public testing::WithParamInterface obj) { std::ostringstream result; - size_t axis; + int64_t axis; std::vector splitLength; ElementType netPrecision; InputShape inputShape; @@ -159,7 +160,7 @@ class VariadicSplitLayerGPUDynamicTest : public testing::WithParamInterface splitLength; ElementType netPrecision; diff --git a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/io_tensor.cpp b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/io_tensor.cpp index 80cde0537bc852..ac38d51cd04875 100644 --- a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/io_tensor.cpp +++ b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/io_tensor.cpp @@ -20,7 +20,12 @@ void OVInferRequestIOTensorTest::SetUp() { // Skip test according to plugin specific disabledTestPatterns() (if any) SKIP_IF_CURRENT_TEST_IS_DISABLED() OVInferRequestTests::SetUp(); - req = execNet.create_infer_request(); + try { + req = execNet.create_infer_request(); + } catch (const std::exception& ex) { + FAIL() << "Can't Create Infer Requiest in SetUp \nException [" << ex.what() << "]" + << std::endl; + } input = execNet.input(); output = execNet.output(); } @@ -32,11 +37,6 @@ void OVInferRequestIOTensorTest::TearDown() { OVInferRequestTests::TearDown(); } -TEST_P(OVInferRequestIOTensorTest, Cancreate_infer_request) { - ov::InferRequest req; - OV_ASSERT_NO_THROW(req = execNet.create_infer_request()); -} - TEST_P(OVInferRequestIOTensorTest, failToSetNullptrForInput) { ASSERT_THROW(req.set_tensor(input, {}), ov::Exception); } @@ -46,7 +46,7 @@ TEST_P(OVInferRequestIOTensorTest, failToSetNullptrForOutput) { ASSERT_THROW(req.set_tensor(output, {}), ov::Exception); } -TEST_P(OVInferRequestIOTensorTest, getAfterSetInputDoNotChangeInput) { +TEST_P(OVInferRequestIOTensorTest, canSetAndGetInput) { auto tensor = utils::create_and_fill_tensor(input.get_element_type(), input.get_shape()); OV_ASSERT_NO_THROW(req.set_tensor(input, tensor)); ov::Tensor actual_tensor; @@ -59,7 +59,7 @@ TEST_P(OVInferRequestIOTensorTest, getAfterSetInputDoNotChangeInput) { ASSERT_EQ(input.get_shape(), actual_tensor.get_shape()); } -TEST_P(OVInferRequestIOTensorTest, getAfterSetInputDoNotChangeOutput) { +TEST_P(OVInferRequestIOTensorTest, canSetAndGetOutput) { auto tensor = utils::create_and_fill_tensor(output.get_element_type(), output.get_shape()); req.set_tensor(output, 
tensor); auto actual_tensor = req.get_tensor(output); @@ -133,22 +133,6 @@ TEST_P(OVInferRequestIOTensorTest, secondCallGetOutputAfterInferSync) { ASSERT_EQ(tensor1.data(), tensor2.data()); } -TEST_P(OVInferRequestIOTensorTest, canSetInputTensorForInferRequest) { - auto input_tensor = utils::create_and_fill_tensor(input.get_element_type(), input.get_shape()); - OV_ASSERT_NO_THROW(req.set_tensor(input, input_tensor)); - ov::Tensor actual_tensor; - OV_ASSERT_NO_THROW(actual_tensor = req.get_tensor(input)); - ASSERT_EQ(input_tensor.data(), actual_tensor.data()); -} - -TEST_P(OVInferRequestIOTensorTest, canSetOutputBlobForInferRequest) { - auto output_tensor = utils::create_and_fill_tensor(output.get_element_type(), output.get_shape()); - OV_ASSERT_NO_THROW(req.set_tensor(output, output_tensor)); - ov::Tensor actual_tensor; - OV_ASSERT_NO_THROW(actual_tensor = req.get_tensor(output)); - ASSERT_EQ(output_tensor.data(), actual_tensor.data()); -} - TEST_P(OVInferRequestIOTensorTest, canInferWithSetInOutBlobs) { auto input_tensor = utils::create_and_fill_tensor(input.get_element_type(), input.get_shape()); OV_ASSERT_NO_THROW(req.set_tensor(input, input_tensor)); diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp index 80e5910d336be6..5b19b6453425ba 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/variadic_split.hpp @@ -16,7 +16,7 @@ namespace LayerTestsDefinitions { typedef std::tuple< std::vector, // Num splits - size_t, // Axis + int64_t, // Axis InferenceEngine::Precision, // Net precision InferenceEngine::Precision, // Input precision InferenceEngine::Precision, // Output precision diff --git a/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp b/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp index b04a11e72467b7..b812895cc46acd 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/variadic_split.cpp @@ -7,7 +7,7 @@ namespace LayerTestsDefinitions { std::string VariadicSplitLayerTest::getTestCaseName(const testing::TestParamInfo& obj) { - size_t axis; + int64_t axis; std::vector numSplits; InferenceEngine::Precision netPrecision; InferenceEngine::Precision inPrc, outPrc; @@ -30,7 +30,7 @@ namespace LayerTestsDefinitions { } void VariadicSplitLayerTest::SetUp() { - size_t axis; + int64_t axis; std::vector inputShape, numSplits; InferenceEngine::Precision netPrecision; std::tie(numSplits, axis, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShape, targetDevice) = this->GetParam(); diff --git a/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp b/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp index b59b29970ebb76..7d05cd29501295 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/variadic_split_pad.cpp @@ -8,7 +8,7 @@ namespace SubgraphTestsDefinitions { std::string VariadicSplitPad::getTestCaseName(const testing::TestParamInfo &obj) { InferenceEngine::SizeVector inputShape; - size_t axis; + int64_t axis; std::vector numSplits, connectIndexes; std::vector padsBegin, 
padsEnd; ngraph::helpers::PadMode padMode; @@ -31,7 +31,7 @@ std::string VariadicSplitPad::getTestCaseName(const testing::TestParamInfo numSplits, connectIndexes; std::vector padBegin, padEnd; ngraph::helpers::PadMode padMode; diff --git a/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp b/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp index 714b7d5d767a50..8dfe4b1038c794 100644 --- a/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp +++ b/src/tests/ie_test_utils/common_test_utils/test_assertions.hpp @@ -73,14 +73,16 @@ inline bool strDoesnotContain(const std::string & str, const std::string & subst } \ } -#define OV_EXPECT_THROW(statement, exception, exception_what_matcher) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - FAIL() << "Expected exception " << OV_PP_TOSTRING(exception); \ - } catch (const exception& ex) { \ - EXPECT_THAT(ex.what(), exception_what_matcher); \ - } catch (...) { \ - FAIL() << "Unknown exception"; \ +#define OV_EXPECT_THROW(statement, exp_exception, exception_what_matcher) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + FAIL() << "Expected exception " << OV_PP_TOSTRING(exp_exception); \ + } catch (const exp_exception& ex) { \ + EXPECT_THAT(ex.what(), exception_what_matcher); \ + } catch (const std::exception& e) { \ + FAIL() << "Unexpected exception " << e.what(); \ + } catch (...) { \ + FAIL() << "Unknown exception"; \ } inline void compare_blob(InferenceEngine::Blob::Ptr lhs, InferenceEngine::Blob::Ptr rhs) { diff --git a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py index cface67efbb0cd..6319621cacb7a8 100644 --- a/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py +++ b/src/tests/ie_test_utils/functional_test_utils/layer_tests_summary/run_parallel.py @@ -400,7 +400,12 @@ def __save_log(logs_dir, dir, test_name): test_log = list() dir = None test_cnt_expected = test_cnt_real_saved_now = test_cnt_real_saved_before = 0 - for line in log_file.readlines(): + try: + lines = log_file.readlines() + except: + lines = log.read_text(encoding='ascii', errors='ignore').split('\n') + + for line in lines: if constants.GTEST_FILTER in line: line = line[line.find(constants.GTEST_FILTER):] test_cnt_expected = line.count(':') @@ -493,4 +498,4 @@ def __save_log(logs_dir, dir, test_name): logger.error("Run is not successful") sys.exit(-1) else: - logger.info("Run is successful") \ No newline at end of file + logger.info("Run is successful") diff --git a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp index 4cc6d4a31a3ce1..9a72ae31b321dc 100644 --- a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp @@ -250,7 +250,7 @@ std::shared_ptr makeSplit(const ngraph::Output &in, std::shared_ptr makeVariadicSplit(const ngraph::Output &in, const std::vector numSplits, - size_t axis); + int64_t axis); std::shared_ptr makeActivation(const ngraph::Output &in, const element::Type &type, diff --git a/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp b/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp index 49348b914d42aa..c6e7b99644e82a 100644 --- 
a/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp +++ b/src/tests/ngraph_helpers/ngraph_functions/src/variadic_split.cpp @@ -11,9 +11,9 @@ namespace ngraph { namespace builder { std::shared_ptr makeVariadicSplit(const ngraph::Output &in, const std::vector numSplits, - size_t axis) { - auto splitAxisOp = std::make_shared(element::u64, ngraph::Shape{}, - std::vector{axis}); + int64_t axis) { + auto splitAxisOp = std::make_shared(element::i64, ngraph::Shape{}, + std::vector{axis}); auto numSplit = std::make_shared(element::u64, ngraph::Shape{numSplits.size()}, numSplits); auto VariadicSplitNode = std::make_shared(in, splitAxisOp, numSplit); diff --git a/tests/layer_tests/common/tflite_layer_test_class.py b/tests/layer_tests/common/tflite_layer_test_class.py index fc459b5028fa7a..d1d89263b3a559 100644 --- a/tests/layer_tests/common/tflite_layer_test_class.py +++ b/tests/layer_tests/common/tflite_layer_test_class.py @@ -7,6 +7,7 @@ from common.layer_test_class import CommonLayerTest from common.utils.tflite_utils import get_tflite_results, get_tensors_from_graph + class TFLiteLayerTest(CommonLayerTest): model_path = None inputs = None diff --git a/tests/layer_tests/common/utils/tflite_utils.py b/tests/layer_tests/common/utils/tflite_utils.py index 3c700c54becee8..7edd77d667cc2f 100644 --- a/tests/layer_tests/common/utils/tflite_utils.py +++ b/tests/layer_tests/common/utils/tflite_utils.py @@ -1,9 +1,60 @@ import os import tensorflow as tf +import numpy as np from common.utils.tf_utils import summarize_graph, transpose_nhwc_to_nchw +def make_positive_array(inputs_dict): + for input in inputs_dict.keys(): + inputs_dict[input] = np.random.randint(1, 10, inputs_dict[input]).astype(np.float32) + return inputs_dict + + +def short_range(inputs_dict): + for input in inputs_dict.keys(): + inputs_dict[input] = np.random.randint(-1, 1, inputs_dict[input]).astype(np.float32) + return inputs_dict + + +def make_boolean_array(inputs_dict): + for input in inputs_dict.keys(): + inputs_dict[input] = np.random.randint(0, 1, inputs_dict[input]) > 1 + return inputs_dict + + +data_generators = { + 'positive': make_positive_array, + 'short_range': short_range, + 'boolean': make_boolean_array, +} + + +def activation_helper(input_node, activation_name, name): + if activation_name is None: + return input_node + else: + return activation_name(input_node, name=name) + + +additional_test_params = [ + [ + {'axis': None}, + {'axis': -1} + ], + [ + {'activation': None}, + {'activation': tf.nn.relu}, + {'activation': tf.nn.relu6}, + # skip tanh and signbit since tflite doesn't fuse such activations + # https://github.com/tensorflow/tensorflow/blob/77d8c333405a080c57850c45531dbbf077b2bd0e/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td#L86:L89 + # {'activation': tf.math.tanh}, + # {'activation': lambda x, name: tf.identity(tf.experimental.numpy.signbit(x), name=name)}, + {'activation': lambda x, name: tf.math.minimum(tf.math.maximum(-1., x), 1., name=name)} + ] +] + + def save_pb_to_tflite(pb_model): graph_summary = summarize_graph(pb_model) inputs = [k for k in graph_summary['inputs'].keys()] @@ -67,3 +118,4 @@ def get_tensors_from_graph(graph, ops: list): tensors.append(op_out_tensor) return tensors + diff --git a/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py b/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py index aef0fae1086cc0..55825f35dbe75c 100644 --- a/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py +++ 
b/tests/layer_tests/mo_python_api_tests/test_mo_convert_pytorch.py @@ -8,11 +8,23 @@ import openvino.runtime as ov import pytest import torch +import unittest from openvino.runtime import PartialShape, Dimension, Model, Type from common.mo_convert_test_class import CommonMOConvertTest +class MyTorchOp(torch.autograd.Function): + @staticmethod + def symbolic(g, in_positions): + return g.op("MyTorchOp", in_positions) + + @staticmethod + def forward(self, in_positions): + out_pos = in_positions.reshape(-1) + return out_pos + 0.5 + + def make_pt_model_one_input(): from torch import nn class NeuralNetwork(nn.Module): @@ -735,3 +747,30 @@ def test_mo_import_from_memory(self, create_model, ie_device, precision, ir_vers if mo_params is not None: test_params.update(mo_params) self._test_by_ref_graph(temp_dir, test_params, graph_ref, compare_tensor_names=False) + + +def create_pt_model_with_custom_op(): + # + # Create PyTorch model with custom operation + # + import torch.nn as nn + + class MyModel(nn.Module): + def __init__(self): + super(MyModel, self).__init__() + self.my_op = MyTorchOp() + + def forward(self, x): + return self.my_op.apply(x) + + return MyModel() + + +class ConvertONNXFallthroughTest(unittest.TestCase): + def test_onnx_fallthrough(self): + from openvino.tools.mo import convert_model + pytorch_model = create_pt_model_with_custom_op() + + # Check that ONNX conversion passed, so ONNX frontend raises error message of unsupported op. + with self.assertRaisesRegex(RuntimeError, ".*OpenVINO does not support the following ONNX operations: MyTorchOp.*"): + convert_model(pytorch_model, input_shape=[1, 2, 3], use_legacy_frontend=True) diff --git a/tests/layer_tests/pytorch_tests/test_einsum.py b/tests/layer_tests/pytorch_tests/test_einsum.py new file mode 100644 index 00000000000000..37a52540d6852e --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_einsum.py @@ -0,0 +1,103 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from pytorch_layer_test_class import PytorchLayerTest + + +class TestEinsumBatchMatMul(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(5, 2, 3).astype(np.float32), np.random.randn(5, 3, 4).astype(np.float32),) + + def create_model(self): + import torch + + class EinsumModelBatchMatmul(torch.nn.Module): + def forward(self, x, y): + eqn = "bij, bjk -> bik" + return torch.einsum(eqn, x, y) + + ref_net = None + + return EinsumModelBatchMatmul(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + def test_einsum_batch_matmul(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version) + + +class TestEinsumBatchDiagonal(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(3, 5, 5).astype(np.float32),) + + def create_model(self): + import torch + + class EinsumModelBatchDiagonal(torch.nn.Module): + def forward(self, x): + eqn = "kii -> ki" + return torch.einsum(eqn, x) + + ref_net = None + + return EinsumModelBatchDiagonal(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.xfail(reason='OpenVINO CPU plugin does not support einsum diagonal') + def test_einsum_batch_diagonal(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version, dynamic_shapes=False) + + +class TestEinsumInnerProd(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return 
(np.random.randn(5).astype(np.float32), np.random.randn(5).astype(np.float32)) + + def create_model(self): + import torch + + class EinsumModelInnerProd(torch.nn.Module): + def forward(self, x, y): + eqn = "i,i" + return torch.einsum(eqn, x, y) + + ref_net = None + + return EinsumModelInnerProd(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + def test_einsum_inner_prod(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version) + + +class TestEinsumTranspose(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + + return (np.random.randn(3, 5).astype(np.float32),) + + def create_model(self): + import torch + + class EinsumModelTranspose(torch.nn.Module): + def forward(self, x): + eqn = "ij->ji" + return torch.einsum(eqn, x) + + ref_net = None + + return EinsumModelTranspose(), ref_net, "aten::einsum" + + @pytest.mark.nightly + @pytest.mark.precommit + def test_einsum_transpose(self, ie_device, precision, ir_version): + self._test(*self.create_model(), ie_device, precision, ir_version) \ No newline at end of file diff --git a/tests/layer_tests/pytorch_tests/test_index.py b/tests/layer_tests/pytorch_tests/test_index.py new file mode 100644 index 00000000000000..967ef4c98afb6e --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_index.py @@ -0,0 +1,73 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import numpy as np + +from pytorch_layer_test_class import PytorchLayerTest + + +class TestIndex(PytorchLayerTest): + def _prepare_input(self, input_shape, idx): + import numpy as np + return (np.random.randn(*input_shape).astype(np.float32), idx) + + def create_model(self, model="list"): + import torch + + class aten_index_list(torch.nn.Module): + + def forward(self, x, idx): + return x[idx] + + class aten_index_getitem(torch.nn.Module): + + def forward(self, x, idx): + return x.__getitem__(idx) + + + class aten_index_list_bool(torch.nn.Module): + + def forward(self, x, idx): + return x[idx.to(torch.bool)] + + class aten_index_getitem_bool(torch.nn.Module): + + def forward(self, x, idx): + return x.__getitem__(idx.to(torch.bool)) + cases = { + "list": aten_index_list, + "getitem": aten_index_getitem, + "list_with_bool": aten_index_list_bool, + "getitem_with_bool": aten_index_getitem_bool + } + + aten_index = cases[model] + + ref_net = None + + return aten_index(), ref_net, "aten::index" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("case", ["list", "getitem"]) + @pytest.mark.parametrize(("input_shape", "idx"), [ + ((1,), np.array(0).astype(int)), + ([2, 3], np.array(-1).astype(int)), + ([4, 5, 6], np.array((1, 2)).astype(int)), + ([7, 8, 9], np.array((-1, 2, -3)).astype(int)), + ([2, 2, 3, 4], np.array((1,)).astype(int))]) + def test_index(self, input_shape, idx, case, ie_device, precision, ir_version): + self._test(*self.create_model(case), ie_device, precision, ir_version, kwargs_to_prepare_input={"input_shape": input_shape, "idx": idx}) + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize("case", ["getitem_with_bool", "list_with_bool"]) + @pytest.mark.parametrize(("input_shape", "idx"), [ + ((1, 2), np.array([[1, 0]]).astype(bool)), + ((2, 2, 5), np.zeros([2, 2, 5]).astype(bool)), + ((2, 2, 5), np.ones([2, 2, 5]).astype(bool)), + ((2, 2, 5), np.random.rand(2, 2, 5) > 0) + ]) + def test_index_bool(self, input_shape, idx, case, ie_device, precision, ir_version): + 
self._test(*self.create_model(case), ie_device, precision, ir_version, kwargs_to_prepare_input={"input_shape": input_shape, "idx": idx}) \ No newline at end of file diff --git a/tests/layer_tests/pytorch_tests/test_roi_align.py b/tests/layer_tests/pytorch_tests/test_roi_align.py new file mode 100644 index 00000000000000..fb03c51b0914e0 --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_roi_align.py @@ -0,0 +1,58 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import numpy as np +import pytest +import torch + +from pytorch_layer_test_class import PytorchLayerTest +from torchvision.ops import roi_align + + +class TestROIAlign(PytorchLayerTest): + def _prepare_input(self): + return (self.input_tensor, self.boxes) + + def create_model(self, output_size, spatial_scale, sampling_ratio, aligned): + + class torchvision_roi_align(torch.nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio, aligned): + super().__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + def forward(self, input_tensor, rois): + return roi_align( + input_tensor, + rois.to(dtype=input_tensor.dtype), + self.output_size, + self.spatial_scale, + self.sampling_ratio, + self.aligned, + ) + + ref_net = None + + return (torchvision_roi_align(output_size, spatial_scale, sampling_ratio, aligned), + ref_net, "torchvision::roi_align") + + @pytest.mark.parametrize('input_tensor', (np.random.randn(4, 5, 6, 7).astype(np.float32),)) + @pytest.mark.parametrize('boxes', (np.array([[1, 2, 2, 3, 3]]).astype(np.float32), + np.array([[0, 1, 2, 5, 4], + [2, 1, 2, 5, 4], + [3, 1, 2, 5, 4]]).astype(np.float32))) + @pytest.mark.parametrize('output_size', ((4, 5), (3, 2), 3)) + @pytest.mark.parametrize('spatial_scale', (0.5, 1.0)) + @pytest.mark.parametrize('sampling_ratio', (0, 1)) + @pytest.mark.parametrize('aligned', (True, False)) + @pytest.mark.nightly + @pytest.mark.precommit + def test_roi_align(self, ie_device, precision, ir_version, input_tensor, boxes, output_size, + spatial_scale, sampling_ratio, aligned): + self.input_tensor = input_tensor + self.boxes = boxes + self._test(*self.create_model(output_size, spatial_scale, sampling_ratio, aligned), + ie_device, precision, ir_version, trace_model=True) diff --git a/tests/layer_tests/pytorch_tests/test_upsample.py b/tests/layer_tests/pytorch_tests/test_upsample.py index a5ea7df4157cd1..d1874f6c0d07b5 100644 --- a/tests/layer_tests/pytorch_tests/test_upsample.py +++ b/tests/layer_tests/pytorch_tests/test_upsample.py @@ -6,10 +6,50 @@ from pytorch_layer_test_class import PytorchLayerTest +class TestUpsample1D(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3, 224).astype(np.float32),) + + def create_model(self, size, scale, mode): + import torch + import torch.nn.functional as F + + class aten_upsample(torch.nn.Module): + def __init__(self, size, scale, mode): + super().__init__() + self.size = size + self.scale = scale + self.mode = mode + + def forward(self, x): + return F.interpolate(x, self.size, scale_factor=self.scale, mode=self.mode) + + ref_net = None + + return aten_upsample(size, scale, mode), ref_net, F"aten::upsample_{mode}1d" + + @pytest.mark.parametrize("mode,size,scale", [ + ('nearest', 300, None), + ('nearest', 200, None), + ('nearest', None, 2.5), + ('nearest', None, 0.75), + ('linear', 300, None), + ('linear', 200, None), + ('linear', None, 2.5,), + ('linear', None, 
0.75), + ]) + @pytest.mark.nightly + @pytest.mark.precommit + def test_upsample1d(self, mode, size, scale, ie_device, precision, ir_version): + self._test(*self.create_model(size, scale, mode), ie_device, + precision, ir_version, trace_model=True) + + class TestUpsample2D(PytorchLayerTest): def _prepare_input(self): import numpy as np - return (np.zeros((1, 3, 224, 224)).astype(np.float32),) + return (np.random.randn(1, 3, 200, 200).astype(np.float32),) def create_model(self, size, scale, mode): import torch @@ -31,25 +71,70 @@ def forward(self, x): @pytest.mark.parametrize("mode,size,scale", [ ('nearest', 300, None), - ('nearest', 200, None), - ('nearest', (128, 480), None), - ('nearest', None, 2.5,), + ('nearest', 150, None), + ('nearest', (300, 400), None), + ('nearest', None, 2.5), ('nearest', None, 0.75), - ('nearest', None, (1.2, 0.8)), + ('nearest', None, (1.5, 2)), ('bilinear', 300, None), - ('bilinear', 200, None), - ('bilinear', (128, 480), None), + ('bilinear', 150, None), + ('bilinear', (400, 480), None), ('bilinear', None, 2.5,), ('bilinear', None, 0.75), - ('bilinear', None, (1.2, 0.8)), + ('bilinear', None, (1.2, 1.3)), ('bicubic', 300, None), - ('bicubic', 200, None), - ('bicubic', (128, 480), None), + ('bicubic', 150, None), + ('bicubic', (400, 480), None), ('bicubic', None, 2.5,), ('bicubic', None, 0.75), - ('bicubic', None, (1.2, 0.8))] - ) + ('bicubic', None, (1.2, 1.3)) + ]) + @pytest.mark.nightly + @pytest.mark.precommit + def test_upsample2d(self, mode, size, scale, ie_device, precision, ir_version): + self._test(*self.create_model(size, scale, mode), ie_device, + precision, ir_version, trace_model=True, **{"custom_eps": 1e-3}) + + +class TestUpsample3D(PytorchLayerTest): + def _prepare_input(self): + import numpy as np + return (np.random.randn(1, 3, 100, 100, 100).astype(np.float32),) + + def create_model(self, size, scale, mode): + import torch + import torch.nn.functional as F + + class aten_upsample(torch.nn.Module): + def __init__(self, size, scale, mode): + super().__init__() + self.size = size + self.scale = scale + self.mode = mode + + def forward(self, x): + return F.interpolate(x, self.size, scale_factor=self.scale, mode=self.mode) + + ref_net = None + + return aten_upsample(size, scale, mode), ref_net, F"aten::upsample_{mode}3d" + + @pytest.mark.parametrize("mode,size,scale", [ + ('nearest', 200, None), + ('nearest', 150, None), + ('nearest', (150, 200, 250), None), + ('nearest', None, 2.5), + ('nearest', None, 0.75), + ('nearest', None, (1.5, 2, 2.5)), + ('trilinear', 200, None), + ('trilinear', 150, None), + ('trilinear', (200, 240, 210), None), + ('trilinear', None, 2.5,), + ('trilinear', None, 0.75), + ('trilinear', None, (1.2, 1.1, 1.5)), + ]) @pytest.mark.nightly @pytest.mark.precommit - def test_upsample(self, mode, size, scale, ie_device, precision, ir_version): - self._test(*self.create_model(size, scale, mode), ie_device, precision, ir_version, trace_model=True) + def test_upsample3d(self, mode, size, scale, ie_device, precision, ir_version): + self._test(*self.create_model(size, scale, mode), ie_device, + precision, ir_version, trace_model=True, **{"custom_eps": 1e-3}) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Binary.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Binary.py new file mode 100644 index 00000000000000..22b9be44017af0 --- /dev/null +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Binary.py @@ -0,0 +1,62 @@ +import itertools + +import pytest +import tensorflow as tf + +from 
common.tflite_layer_test_class import TFLiteLayerTest +from tensorflow_lite_tests.test_tfl_Unary import data_generators + +test_ops = [ + {'op_name': 'EQUAL', 'op_func': tf.math.equal}, + {'op_name': 'FLOOR_MOD', 'op_func': tf.math.floormod}, + {'op_name': 'GREATER', 'op_func': tf.math.greater}, + {'op_name': 'GREATER_EQUAL', 'op_func': tf.math.greater_equal}, + {'op_name': 'LESS', 'op_func': tf.math.less}, + {'op_name': 'LESS_EQUAL', 'op_func': tf.math.less_equal}, + {'op_name': 'LOGICAL_AND', 'op_func': tf.math.logical_and, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'LOGICAL_OR', 'op_func': tf.math.logical_or, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'MAXIMUM', 'op_func': tf.math.maximum}, + {'op_name': 'MINIMUM', 'op_func': tf.math.minimum}, + {'op_name': 'NOT_EQUAL', 'op_func': tf.math.not_equal}, + {'op_name': 'POW', 'op_func': tf.math.pow, 'kwargs_to_prepare_input': 'positive'}, + {'op_name': 'SQUARED_DIFFERENCE', 'op_func': tf.math.squared_difference}, +] + +test_params = [ + {'shape': [2, 10, 10, 3]}, + {'shape': [2, 10]} +] + +test_data = list(itertools.product(test_ops, test_params)) +for i, (parameters, shapes) in enumerate(test_data): + parameters.update(shapes) + test_data[i] = parameters.copy() + + +class TestTFLiteBinaryLayerTest(TFLiteLayerTest): + inputs = ["Input_0", "Input_1"] + outputs = ["BinaryOperation"] + + def _prepare_input(self, inputs_dict, generator=None): + if generator is None: + return super()._prepare_input(inputs_dict) + return data_generators[generator](inputs_dict) + + def make_model(self, params): + assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape'})) == 3, \ + 'Unexpected parameters for test: ' + ','.join(params.keys()) + self.allowed_ops = [params['op_name']] + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + place_holder0 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryLayerTest.inputs[0]) + place_holder1 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryLayerTest.inputs[1]) + params['op_func'](place_holder0, place_holder1, name=TestTFLiteBinaryLayerTest.outputs[0]) + net = sess.graph_def + return net + + @pytest.mark.parametrize("params", test_data) + @pytest.mark.nightly + def test_binary(self, params, ie_device, precision, temp_dir): + self._test(ie_device, precision, temp_dir, params) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_BinaryWithActivation.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_BinaryWithActivation.py new file mode 100644 index 00000000000000..e48fd1501910db --- /dev/null +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_BinaryWithActivation.py @@ -0,0 +1,63 @@ +import itertools + +import tensorflow as tf +import pytest + +from common.tflite_layer_test_class import TFLiteLayerTest +from common.utils.tflite_utils import data_generators, additional_test_params, activation_helper + +test_ops = [ + {'op_name': 'ADD', 'op_func': tf.math.add}, + {'op_name': 'DIV', 'op_func': tf.math.divide, 'kwargs_to_prepare_input': 'positive'}, + {'op_name': 'MUL', 'op_func': tf.math.multiply}, + {'op_name': 'SUB', 'op_func': tf.math.subtract}, +] + +test_params = [ + {'shape': [2, 10, 10, 3]}, + {'shape': [2, 10]} +] + + +test_data = list(itertools.product(test_ops, test_params)) +for i, (parameters, shapes) in enumerate(test_data): + parameters.update(shapes) + test_data[i] = parameters.copy() + +test_data = 
list(itertools.product(test_data, additional_test_params[1])) +for i, (parameters, additional_test_params[1]) in enumerate(test_data): + parameters.update(additional_test_params[1]) + test_data[i] = parameters.copy() + + +class TestTFLiteBinaryWithActivationLayerTest(TFLiteLayerTest): + inputs = ["Input_0", "Input_1"] + outputs = ["BinaryOperation"] + + def _prepare_input(self, inputs_dict, generator=None): + if generator is None: + return super()._prepare_input(inputs_dict) + return data_generators[generator](inputs_dict) + + def make_model(self, params): + assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape', 'activation'})) == 4, \ + 'Unexpected parameters for test: ' + ','.join(params.keys()) + self.allowed_ops = [params['op_name']] + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + in0 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryWithActivationLayerTest.inputs[0]) + in1 = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteBinaryWithActivationLayerTest.inputs[1]) + bin_op_name = TestTFLiteBinaryWithActivationLayerTest.outputs[0] if not params['activation'] else \ + TestTFLiteBinaryWithActivationLayerTest.outputs[0] + "/op" + op = params['op_func'](in0, in1, name=bin_op_name) + op = activation_helper(op, params['activation'], TestTFLiteBinaryWithActivationLayerTest.outputs[0]) + + net = sess.graph_def + return net + + @pytest.mark.parametrize("params", test_data) + @pytest.mark.nightly + def test_binary(self, params, ie_device, precision, temp_dir): + self._test(ie_device, precision, temp_dir, params) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Reduce.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Reduce.py new file mode 100644 index 00000000000000..71488370596907 --- /dev/null +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Reduce.py @@ -0,0 +1,61 @@ +import itertools + +import pytest +import tensorflow as tf + +from common.tflite_layer_test_class import TFLiteLayerTest +from common.utils.tflite_utils import data_generators, additional_test_params + +test_ops = [ + {'op_name': 'MEAN', 'op_func': tf.math.reduce_mean}, + {'op_name': 'REDUCE_ALL', 'op_func': tf.math.reduce_all, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'REDUCE_ANY', 'op_func': tf.math.reduce_any, 'kwargs_to_prepare_input': 'boolean', 'dtype': tf.bool}, + {'op_name': 'REDUCE_MAX', 'op_func': tf.math.reduce_max}, + {'op_name': 'REDUCE_MIN', 'op_func': tf.math.reduce_min}, + {'op_name': 'REDUCE_PROD', 'op_func': tf.math.reduce_prod, 'kwargs_to_prepare_input': 'short_range'}, + {'op_name': 'SUM', 'op_func': tf.math.reduce_sum}, +] + +test_params = [ + {'shape': [2, 10, 10, 3]}, + {'shape': [2, 10]} +] + + +test_data = list(itertools.product(test_ops, test_params)) +for i, (parameters, shapes) in enumerate(test_data): + parameters.update(shapes) + test_data[i] = parameters.copy() + + +test_data = list(itertools.product(test_data, additional_test_params[0])) +for i, (parameters, additional_test_params[0]) in enumerate(test_data): + parameters.update(additional_test_params[0]) + test_data[i] = parameters.copy() + + +class TestTFLiteReduceLayerTest(TFLiteLayerTest): + inputs = ["Input"] + outputs = ["ReduceOperation"] + + def _prepare_input(self, inputs_dict, generator=None): + if generator is None: + return super()._prepare_input(inputs_dict) + return data_generators[generator](inputs_dict) + + def make_model(self, params): + 
assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape', 'axis'})) == 4, \ + 'Unexpected parameters for test: ' + ','.join(params.keys()) + self.allowed_ops = [params['op_name']] + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + place_holder = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteReduceLayerTest.inputs[0]) + params['op_func'](place_holder, axis=params['axis'], name=TestTFLiteReduceLayerTest.outputs[0]) + net = sess.graph_def + return net + + @pytest.mark.parametrize("params", test_data) + @pytest.mark.nightly + def test_reduce(self, params, ie_device, precision, temp_dir): + self._test(ie_device, precision, temp_dir, params) diff --git a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py index 0c7e9f849e198b..b782202d112b79 100644 --- a/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py +++ b/tests/layer_tests/tensorflow_lite_tests/test_tfl_Unary.py @@ -7,27 +7,11 @@ import pytest import tensorflow as tf from common.tflite_layer_test_class import TFLiteLayerTest +from common.utils.tflite_utils import data_generators np.random.seed(42) -def make_positive_array(inputs_dict): - for input in inputs_dict.keys(): - inputs_dict[input] = np.random.randint(1, 10, inputs_dict[input]).astype(np.float32) - return inputs_dict - - -def make_boolean_array(inputs_dict): - for input in inputs_dict.keys(): - inputs_dict[input] = np.random.randint(0, 1, inputs_dict[input]) > 1 - return inputs_dict - - -data_generators = { - 'positive': make_positive_array, - 'boolean': make_boolean_array, -} - test_ops = [ {'op_name': 'ABS', 'op_func': tf.math.abs}, {'op_name': 'CAST', 'op_func': partial(tf.cast, dtype=tf.int32)}, @@ -82,14 +66,14 @@ def _prepare_input(self, inputs_dict, generator=None): return super()._prepare_input(inputs_dict) return data_generators[generator](inputs_dict) - def make_model(self, params): assert len(set(params.keys()).intersection({'op_name', 'op_func', 'shape'})) == 3, \ 'Unexpected parameters for test: ' + ','.join(params.keys()) self.allowed_ops = [params['op_name']] tf.compat.v1.reset_default_graph() with tf.compat.v1.Session() as sess: - place_holder = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], name=TestTFLiteUnaryLayerTest.inputs[0]) + place_holder = tf.compat.v1.placeholder(params.get('dtype', tf.float32), params['shape'], + name=TestTFLiteUnaryLayerTest.inputs[0]) params['op_func'](place_holder, name=TestTFLiteUnaryLayerTest.outputs[0]) net = sess.graph_def return net diff --git a/tests/layer_tests/tensorflow_tests/test_tf_FakeQuantWithMinMaxVars.py b/tests/layer_tests/tensorflow_tests/test_tf_FakeQuantWithMinMaxVars.py new file mode 100644 index 00000000000000..73a5dcb832ae1d --- /dev/null +++ b/tests/layer_tests/tensorflow_tests/test_tf_FakeQuantWithMinMaxVars.py @@ -0,0 +1,60 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import tensorflow as tf +from common.tf_layer_test_class import CommonTFLayerTest + + +class TestFakeQuantWithMinMaxVars(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + # generate elements so that the input tensor may contain repeating elements + assert 'inputs' in inputs_info, "Test error: inputs_info must contain `input`" + inputs_shape = inputs_info['inputs'] + inputs_data = {} + inputs_data['inputs'] = np.random.randint(-10, 10, inputs_shape).astype(np.float32) + 
return inputs_data + + def create_fake_quant_with_min_max_vars_net(self, inputs_shape, min_value, max_value, num_bits, narrow_range, + fake_quant_op): + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + inputs = tf.compat.v1.placeholder(tf.float32, inputs_shape, 'inputs') + min = tf.constant(min_value, dtype=tf.float32) + max = tf.constant(max_value, dtype=tf.float32) + fake_quant_op(inputs=inputs, min=min, max=max, num_bits=num_bits, + narrow_range=narrow_range) + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + return tf_net, None + + test_basic = [ + # test FakeQuantWithMinMaxVars + dict(inputs_shape=[2, 6, 4], min_value=-3, max_value=4, num_bits=None, narrow_range=None, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + dict(inputs_shape=[3, 2, 1, 5], min_value=-4, max_value=5, num_bits=14, narrow_range=True, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + dict(inputs_shape=[3, 2, 4], min_value=2, max_value=4, num_bits=10, narrow_range=False, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + dict(inputs_shape=[1, 2, 3], min_value=-6, max_value=-3, num_bits=8, narrow_range=True, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVars), + + # test FakeQuantWithMinMaxVarsPerChannel + pytest.param(dict(inputs_shape=[2, 6, 4], min_value=[-4, -3, -5, -8], max_value=[4, 7, 9, 5], num_bits=None, + narrow_range=None, + fake_quant_op=tf.raw_ops.FakeQuantWithMinMaxVarsPerChannel), + marks=pytest.mark.xfail(reason="104822")) + + ] + + @pytest.mark.parametrize("params", test_basic) + @pytest.mark.precommit_tf_fe + @pytest.mark.nightly + def test_fake_quant_with_min_max_vars_basic(self, params, ie_device, precision, ir_version, temp_dir, + use_new_frontend, + use_old_api): + self._test(*self.create_fake_quant_with_min_max_vars_net(**params), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_new_frontend=use_new_frontend, use_old_api=use_old_api) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_Identity.py b/tests/layer_tests/tensorflow_tests/test_tf_Identity.py index 24fd85c30dd6a1..2e18d134d22fb4 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_Identity.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_Identity.py @@ -2,86 +2,37 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from common.layer_test_class import check_ir_version +import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest -from common.utils.tf_utils import permute_nchw_to_nhwc - -from unit_tests.utils.graph import build_graph class TestIdentity(CommonTFLayerTest): - def create_identity_net(self, shape, ir_version, use_new_frontend): - """ - Tensorflow net IR net - - Input->Identity->ReLU => Input->ReLU - - """ - - import tensorflow as tf - + def create_identity_net(self, input_shape, identity_op): tf.compat.v1.reset_default_graph() # Create the graph and model with tf.compat.v1.Session() as sess: - tf_x_shape = shape.copy() - - tf_x_shape = permute_nchw_to_nhwc(tf_x_shape, use_new_frontend) - - x = tf.compat.v1.placeholder(tf.float32, tf_x_shape, 'Input') - id = tf.identity(x, name="Operation") - tf.nn.relu(id, name='Operation') + input = tf.compat.v1.placeholder(tf.float32, input_shape, 'input') + relu = tf.raw_ops.Relu(features=input) + identity_op(input=relu, name="identity") tf.compat.v1.global_variables_initializer() tf_net = sess.graph_def - # - # Create reference IR net - # Please, specify 'type': 'Input' for input node - # Moreover, do not forget to validate ALL layer attributes!!! 
- # - - ref_net = None - - if check_ir_version(10, None, ir_version) and not use_new_frontend: - nodes_attributes = { - 'inputX': {'kind': 'op', 'type': 'Parameter'}, - 'inputX_data': {'shape': shape, 'kind': 'data'}, - 'ReLU': {'kind': 'op', 'type': 'ReLU'}, - 'ReLU_data': {'shape': shape, 'kind': 'data'}, - 'result': {'kind': 'op', 'type': 'Result'} - } - ref_net = build_graph(nodes_attributes, - [('inputX', 'inputX_data'), - ('inputX_data', 'ReLU'), - ('ReLU', 'ReLU_data'), - ('ReLU_data', 'result') - ]) - - return tf_net, ref_net - - test_data_precommit = [dict(shape=[1, 3, 50, 100, 224])] - - @pytest.mark.parametrize("params", test_data_precommit) - @pytest.mark.precommit - def test_identity_precommit(self, params, ie_device, precision, ir_version, temp_dir, - use_new_frontend, use_old_api): - self._test(*self.create_identity_net(**params, ir_version=ir_version, - use_new_frontend=use_new_frontend), - ie_device, precision, ir_version, temp_dir=temp_dir, - use_new_frontend=use_new_frontend, use_old_api=use_old_api) + return tf_net, None - test_data = [dict(shape=[1]), - pytest.param(dict(shape=[1, 224]), marks=pytest.mark.precommit_tf_fe), - dict(shape=[1, 3, 224]), - dict(shape=[1, 3, 100, 224]), - dict(shape=[1, 3, 50, 100, 224])] + test_data_basic = [ + dict(input_shape=[2], identity_op=tf.raw_ops.Identity), + dict(input_shape=[2, 3], identity_op=tf.raw_ops.PreventGradient), + dict(input_shape=[], identity_op=tf.raw_ops.Snapshot), + dict(input_shape=[1, 2, 3], identity_op=tf.raw_ops.StopGradient) + ] - @pytest.mark.parametrize("params", test_data) + @pytest.mark.parametrize("params", test_data_basic) + @pytest.mark.precommit_tf_fe @pytest.mark.nightly - def test_identity(self, params, ie_device, precision, ir_version, temp_dir, use_new_frontend, - use_old_api): - self._test(*self.create_identity_net(**params, ir_version=ir_version, - use_new_frontend=use_new_frontend), + def test_identity_basic(self, params, ie_device, precision, ir_version, temp_dir, + use_new_frontend, use_old_api): + self._test(*self.create_identity_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_new_frontend=use_new_frontend, use_old_api=use_old_api) diff --git a/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py b/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py index a86409b9a48204..29939d7d988bed 100644 --- a/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py +++ b/tools/mo/openvino/tools/mo/moc_frontend/pytorch_frontend_utils.py @@ -131,6 +131,7 @@ def convert_pytorch_to_onnx(model, input_shape, opset_version, example_inputs, o torch.onnx.export(model, inputs, model_onnx, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH, **additional_params) return model_onnx diff --git a/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py b/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py index 08effa43a4c5c6..d1c0115bf0f711 100644 --- a/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py +++ b/tools/mo/unit_tests/moc_tf_fe/conversion_basic_models.py @@ -309,3 +309,41 @@ def test_conversion_model_oneshot_iterator_use_legacy_frontend(self): def test_conversion_model_oneshot_iterator_default(self): self.basic("model_oneshot_iterator.pbtxt", None, None, None, None, None, None, True, True, False, False) + + @generate( + *[ + ( + "in2{f32}->[0.0 0.0 0.0 0.0]", + {"in1": np.array([[1.0, 2.0], [3.0, 4.0]])}, + np.array([[1.0, 2.0], [3.0, 4.0]]), + np.float32, + ), + ( + "in2->[1.0 15.0 15.5 1.0]", + {"in1": 
np.array([[2.0, 4.0], [12.0, 8.0]])}, + np.array([[3.0, 19.0], [27.5, 9.0]]), + np.float32, + ), + ], + ) + def test_conversion_model_with_non_standard_extension(self, input_freezing_value, inputs, expected, + dtype): + self.basic("model_fp32.frozen", input_freezing_value, inputs, dtype, expected, only_conversion=False, + input_model_is_text=False, use_new_frontend=True, + use_legacy_frontend=False) + + def test_conversion_fake_model(self): + with self.assertRaisesRegex(Exception, + "Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."): + self.basic("fake.pb", None, None, None, None, + only_conversion=True, input_model_is_text=False, use_new_frontend=True, + use_legacy_frontend=False) + + def test_conversion_dir_model(self): + with self.assertRaisesRegex(Exception, + "Internal error or inconsistent input model: the frontend supports " + "only frozen binary protobuf format."): + self.basic(".", None, None, None, None, + only_conversion=True, input_model_is_text=False, use_new_frontend=True, + use_legacy_frontend=False) diff --git a/tools/mo/unit_tests/moc_tf_fe/test_models/fake.pb b/tools/mo/unit_tests/moc_tf_fe/test_models/fake.pb new file mode 100644 index 00000000000000..ae05864994afaf --- /dev/null +++ b/tools/mo/unit_tests/moc_tf_fe/test_models/fake.pb @@ -0,0 +1,2 @@ +dcfsdcdsdcs +cscscsc \ No newline at end of file diff --git a/tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen b/tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen new file mode 100644 index 00000000000000..3343e4106f837c --- /dev/null +++ b/tools/mo/unit_tests/moc_tf_fe/test_models/model_fp32.frozen @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a33c91148b5e72ca03608c7d2ee18229ee4b610344dadd6896efeb6ac7b93e0 +size 141
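
Editor's note on the shared TFLite test helpers: the new suites import data_generators (and, in two of them, additional_test_params and activation_helper) from common.utils.tflite_utils, while the old per-file generators are deleted from test_tfl_Unary.py. The sketch below reconstructs what the shared module plausibly provides; only 'positive' and 'boolean' come from the removed code, the 'short_range' generator is an assumption inferred from the REDUCE_PROD test entry, and additional_test_params / activation_helper are presumably defined in the same module but are not sketched here. Note that the removed make_boolean_array compared np.random.randint(0, 1, ...) > 1, which always yields all-False arrays; the sketch uses randint(0, 2, ...) > 0 instead.

# Hypothetical sketch of common/utils/tflite_utils.py contents (not the PR's code).
import numpy as np


def make_positive_array(inputs_dict):
    # copied from the generator removed from test_tfl_Unary.py: shape dict -> positive float data
    for input_name in inputs_dict.keys():
        inputs_dict[input_name] = np.random.randint(1, 10, inputs_dict[input_name]).astype(np.float32)
    return inputs_dict


def make_boolean_array(inputs_dict):
    # fixed variant of the removed helper, producing a mix of True and False values
    for input_name in inputs_dict.keys():
        inputs_dict[input_name] = np.random.randint(0, 2, inputs_dict[input_name]) > 0
    return inputs_dict


def make_short_range_array(inputs_dict):
    # assumed generator for REDUCE_PROD: small-magnitude values to avoid overflow
    for input_name in inputs_dict.keys():
        inputs_dict[input_name] = np.random.randint(-1, 2, inputs_dict[input_name]).astype(np.float32)
    return inputs_dict


data_generators = {
    'positive': make_positive_array,
    'boolean': make_boolean_array,
    'short_range': make_short_range_array,
}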
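
The three TFLite suites also repeat the same itertools.product(...) plus update/copy loop to flatten op, shape, and extra parameters into single dicts (rewritten above so that the loop no longer rebinds additional_test_params[...] itself). If the pattern keeps spreading, a small shared helper would remove the duplication; merge_test_params below is a suggested name, not an existing utility in the repository.

import itertools


def merge_test_params(*param_lists):
    """Cartesian product of several lists of parameter dicts, flattened into single dicts."""
    merged = []
    for combination in itertools.product(*param_lists):
        entry = {}
        for part in combination:
            entry.update(part)
        merged.append(entry)
    return merged


# usage in test_tfl_Reduce.py would then reduce to one line:
# test_data = merge_test_params(test_ops, test_params, additional_test_params[0])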
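
For reviewers of test_tf_FakeQuantWithMinMaxVars.py: FakeQuantWithMinMaxVars quantizes its input to num_bits levels and dequantizes it back, after nudging [min, max] so that zero is exactly representable; the attribute defaults are assumed to be num_bits=8 and narrow_range=False when the test passes None. The NumPy sketch below is an approximate reference model written from the documented nudging scheme, not code from this PR or from the TF/OpenVINO sources, and is meant only as a reading aid for what the new tests exercise.

import numpy as np


def fake_quant_reference(x, min_value, max_value, num_bits=8, narrow_range=False):
    # nudge min/max so that zero maps exactly onto one of the quantized levels
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = float(2 ** num_bits - 1)
    scale = (max_value - min_value) / (quant_max - quant_min)
    zero_point = np.clip(np.round(quant_min - min_value / scale), quant_min, quant_max)
    nudged_min = (quant_min - zero_point) * scale
    nudged_max = (quant_max - zero_point) * scale
    # clamp, quantize to the grid, and dequantize back to float
    clamped = np.clip(x, nudged_min, nudged_max)
    return np.round((clamped - nudged_min) / scale) * scale + nudged_min


print(fake_quant_reference(np.array([-3.5, -1.0, 0.0, 2.7, 5.0], dtype=np.float32),
                           min_value=-3.0, max_value=4.0))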
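
The new MO tests in conversion_basic_models.py exercise the --input freezing syntax on model_fp32.frozen: "in2{f32}->[0.0 0.0 0.0 0.0]" freezes in2 (as f32) to zeros, and "in2->[1.0 15.0 15.5 1.0]" freezes it to [[1, 15], [15.5, 1]] for a 2x2 input. Both expected arrays are consistent with model_fp32 computing an elementwise in1 + in2 (an assumption read off the test data, not verified against the binary model); a quick NumPy check:

import numpy as np

in1 = np.array([[2.0, 4.0], [12.0, 8.0]], dtype=np.float32)
in2 = np.array([1.0, 15.0, 15.5, 1.0], dtype=np.float32).reshape(2, 2)  # frozen value of the second input

print(in1 + in2)  # [[ 3.  19. ]
                  #  [27.5  9. ]]  -> matches the expected array in the test;
                  # the zero-frozen case likewise returns in1 unchanged.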