diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 98a53622a0b6da..6aed320376c21e 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -1,14 +1,14 @@ name: Windows (VS 2022, Python 3.11) on: workflow_dispatch: - pull_request: - paths-ignore: - - '**/docs/**' - - 'docs/**' - - '**/**.md' - - '**.md' - - '**/layer_tests_summary/**' - - '**/conformance/**' +# pull_request: +# paths-ignore: +# - '**/docs/**' +# - 'docs/**' +# - '**/**.md' +# - '**.md' +# - '**/layer_tests_summary/**' +# - '**/conformance/**' push: paths-ignore: - '**/docs/**' diff --git a/docs/benchmarks/performance_benchmarks.md b/docs/articles_en/about_openvino/performance_benchmarks.md similarity index 100% rename from docs/benchmarks/performance_benchmarks.md rename to docs/articles_en/about_openvino/performance_benchmarks.md diff --git a/docs/MO_DG/prepare_model/Getting_performance_numbers.md b/docs/articles_en/about_openvino/performance_benchmarks/Getting_performance_numbers.md similarity index 100% rename from docs/MO_DG/prepare_model/Getting_performance_numbers.md rename to docs/articles_en/about_openvino/performance_benchmarks/Getting_performance_numbers.md diff --git a/docs/benchmarks/performance_benchmarks_faq.md b/docs/articles_en/about_openvino/performance_benchmarks/performance_benchmarks_faq.md similarity index 100% rename from docs/benchmarks/performance_benchmarks_faq.md rename to docs/articles_en/about_openvino/performance_benchmarks/performance_benchmarks_faq.md diff --git a/docs/benchmarks/performance_int8_vs_fp32.md b/docs/articles_en/about_openvino/performance_benchmarks/performance_int8_vs_fp32.md similarity index 100% rename from docs/benchmarks/performance_int8_vs_fp32.md rename to docs/articles_en/about_openvino/performance_benchmarks/performance_int8_vs_fp32.md diff --git a/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api.md b/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api.md index dce063b625fd24..86ef2bdda17d22 100644 --- a/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api.md +++ b/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api.md @@ -2,8 +2,6 @@ @sphinxdirective -.. _deep learning model optimizer: - .. toctree:: :maxdepth: 1 :hidden: diff --git a/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api/setting_input_shapes.md b/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api/setting_input_shapes.md index e962dc8babb472..3de90266af87db 100644 --- a/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api/setting_input_shapes.md +++ b/docs/articles_en/documentation/openvino_legacy_features/mo_ovc_transition/legacy_conversion_api/setting_input_shapes.md @@ -8,9 +8,6 @@ With model conversion API you can increase your model's efficiency by providing :description: Learn how to increase the efficiency of a model with MO by providing an additional shape definition with the input_shape and static_shape parameters. -.. 
_when_to_specify_input_shapes: - - Specifying input_shape parameter ################################ diff --git a/docs/articles_en/openvino_workflow.md b/docs/articles_en/openvino_workflow.md index 3d617a35155a66..5453c03426cd3f 100644 --- a/docs/articles_en/openvino_workflow.md +++ b/docs/articles_en/openvino_workflow.md @@ -20,29 +20,39 @@ pytorch_2_0_torch_compile +.. image:: ./_static/images/model_conversion_diagram.svg + :alt: model conversion diagram + +OpenVINO offers multiple workflows, depending on the use case and personal or project preferences. +The diagram above is only a rough representation of the available options, but this section will +give you a detailed view of how you can go from preparing your model, through optimizing it, +to executing inference, and deploying your solution. + + | :doc:`Model Preparation ` -| With model conversion API guide, you will learn to convert pre-trained models for use with OpenVINO™. You can use your own models or choose some from a broad selection in online databases, such as `TensorFlow Hub `__, `Hugging Face `__, `Torchvision models `__.. +| Learn how to convert pre-trained models to OpenVINO IR, using different approaches for more convenience or higher performance. + | :doc:`Model Optimization and Compression ` -| In this section you will find out how to optimize a model to achieve better inference performance. It describes multiple optimization methods for both the training and post-training stages. +| Find out how to optimize a model to achieve better inference performance, utilizing multiple optimization methods for both in-training compression and post-training quantization. + | :doc:`Running Inference ` -| This section explains describes how to run inference which is the most basic form of deployment and the quickest way of launching inference. +| See how to run inference with OpenVINO, which is the most basic form of deployment, and the quickest way of running a deep learning model. +| :doc:`Deployment Option 1. Using OpenVINO Runtime ` +| Deploy a model locally, reading the file directly from your application and utilizing resources available to the system. +| Deployment on a local system uses the steps described in the section on running inference. -Once you have a model that meets both OpenVINO™ and your requirements, you can choose how to deploy it with your application. +| :doc:`Deployment Option 2. Using Model Server ` +| Deploy a model remotely, connecting your application to an inference server and utilizing external resources, with no impact on the app's performance. +| Deployment on OpenVINO Model Server is quick and does not require any additional steps described in the section on running inference. -| :doc:`Option 1. Deployment via OpenVINO Runtime ` -| Local deployment uses OpenVINO Runtime that is called from, and linked to, the application directly. -| It utilizes resources available to the system and provides the quickest way of launching inference. -| Deployment on a local system requires performing the steps from the running inference section. +| :doc:`Deployment Option 3. Using torch.compile for PyTorch 2.0 ` +| Deploy a PyTorch model using OpenVINO in a PyTorch-native application. -| :doc:`Option 2. Deployment via Model Server ` -| Deployment via OpenVINO Model Server allows the application to connect to the inference server set up remotely. -| This way inference can use external resources instead of those available to the application itself. 
-| Deployment on a model server can be done quickly and without performing any additional steps described in the running inference section. @endsphinxdirective \ No newline at end of file diff --git a/docs/articles_en/openvino_workflow/model_introduction.md b/docs/articles_en/openvino_workflow/model_introduction.md deleted file mode 100644 index c11beb17e67764..00000000000000 --- a/docs/articles_en/openvino_workflow/model_introduction.md +++ /dev/null @@ -1,258 +0,0 @@ -# Model Preparation {#openvino_docs_model_processing_introduction} - -@sphinxdirective - -.. meta:: - :description: Preparing models for OpenVINO Runtime. Learn about the methods - used to read, convert and compile models from different frameworks. - -.. toctree:: - :maxdepth: 1 - :hidden: - - Supported_Model_Formats - openvino_docs_OV_Converter_UG_Conversion_Options - openvino_docs_OV_Converter_UG_prepare_model_convert_model_Converting_Model - - -Every deep learning workflow begins with obtaining a model. You can choose to prepare -a custom one, use a ready-made solution and adjust it to your needs, or even download -and run a pre-trained network from an online database, such as -`TensorFlow Hub `__, `Hugging Face `__, -or `Torchvision models `__. - -If your selected model is in one of the :doc:`OpenVINO™ supported model formats `, -you can use it directly, without the need to save as OpenVINO IR -(`openvino.Model `__ - -`ov.Model `__). -For this purpose, you can use ``openvino.Core.read_model`` and ``openvino.Core.compile_model`` -methods, so that conversion is performed automatically before inference, for -maximum convenience. Note that for PyTorch models, Python API -is the only conversion option. TensorFlow may present additional considerations -:doc:`TensorFlow Frontend Capabilities and Limitations `. - - -For better performance and more optimization options, OpenVINO also offers a conversion -API with two possible approaches: the Python API functions (``openvino.convert_model`` -and ``openvino.save_model``) and the ``ovc`` command line tool, which are described in detail in this article. - -.. note:: - - Model conversion API prior to OpenVINO 2023.1 is considered deprecated. - Both existing and new projects are recommended to transition to the new - solutions, keeping in mind that they are not fully backwards compatible - with ``openvino.tools.mo.convert_model`` or the ``mo`` CLI tool. - For more details, see the :doc:`Model Conversion API Transition Guide `. - - - - -Convert a Model in Python: ``convert_model`` -############################################## - -You can use the Model conversion API in Python with the ``openvino.convert_model`` function. This function converts a model from its original framework representation, for example PyTorch or TensorFlow, to the object of type ``openvino.Model``. The resulting ``openvino.Model`` can be compiled with ``openvino.compile_model`` and inferred in the same application (Python script or Jupyter Notebook) or saved into a file using``openvino.save_model`` for future use. Below, there are examples of how to use the ``openvino.convert_model`` with models from popular public repositories: - - -.. tab-set:: - - .. tab-item:: Torchvision - - .. 
code-block:: py - :force: - - import openvino as ov - import torch - from torchvision.models import resnet50 - - model = resnet50(weights='DEFAULT') - - # prepare input_data - input_data = torch.rand(1, 3, 224, 224) - - ov_model = ov.convert_model(model, example_input=input_data) - - ###### Option 1: Save to OpenVINO IR: - - # save model to OpenVINO IR for later use - ov.save_model(ov_model, 'model.xml') - - ###### Option 2: Compile and infer with OpenVINO: - - # compile model - compiled_model = ov.compile_model(ov_model) - - # run inference - result = compiled_model(input_data) - - .. tab-item:: Hugging Face Transformers - - .. code-block:: py - - from transformers import BertTokenizer, BertModel - - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertModel.from_pretrained("bert-base-uncased") - text = "Replace me by any text you'd like." - encoded_input = tokenizer(text, return_tensors='pt') - - import openvino as ov - ov_model = ov.convert_model(model, example_input={**encoded_input}) - - ###### Option 1: Save to OpenVINO IR: - - # save model to OpenVINO IR for later use - ov.save_model(ov_model, 'model.xml') - - ###### Option 2: Compile and infer with OpenVINO: - - # compile model - compiled_model = ov.compile_model(ov_model) - - # prepare input_data using HF tokenizer or your own tokenizer - # encoded_input is reused here for simplicity - - # run inference - result = compiled_model({**encoded_input}) - - .. tab-item:: Keras Applications - - .. code-block:: py - - import tensorflow as tf - import openvino as ov - - tf_model = tf.keras.applications.ResNet50(weights="imagenet") - ov_model = ov.convert_model(tf_model) - - ###### Option 1: Save to OpenVINO IR: - - # save model to OpenVINO IR for later use - ov.save_model(ov_model, 'model.xml') - - ###### Option 2: Compile and infer with OpenVINO: - - # compile model - compiled_model = ov.compile_model(ov_model) - - # prepare input_data - import numpy as np - input_data = np.random.rand(1, 224, 224, 3) - - # run inference - result = compiled_model(input_data) - - .. tab-item:: TensorFlow Hub - - .. code-block:: py - - import tensorflow as tf - import tensorflow_hub as hub - import openvino as ov - - model = tf.keras.Sequential([ - hub.KerasLayer("https://tfhub.dev/google/imagenet/mobilenet_v1_100_224/classification/5") - ]) - - # Check model page for information about input shape: https://tfhub.dev/google/imagenet/mobilenet_v1_100_224/classification/5 - model.build([None, 224, 224, 3]) - - model.save('mobilenet_v1_100_224') # use a temporary directory - ov_model = ov.convert_model('mobilenet_v1_100_224') - - ###### Option 1: Save to OpenVINO IR: - - ov.save_model(ov_model, 'model.xml') - - ###### Option 2: Compile and infer with OpenVINO: - - compiled_model = ov.compile_model(ov_model) - - # prepare input_data - import numpy as np - input_data = np.random.rand(1, 224, 224, 3) - - # run inference - result = compiled_model(input_data) - - .. tab-item:: ONNX Model Hub - - .. 
code-block:: py - - import onnx - - model = onnx.hub.load("resnet50") - onnx.save(model, 'resnet50.onnx') # use a temporary file for model - - import openvino as ov - ov_model = ov.convert_model('resnet50.onnx') - - ###### Option 1: Save to OpenVINO IR: - - # save model to OpenVINO IR for later use - ov.save_model(ov_model, 'model.xml') - - ###### Option 2: Compile and infer with OpenVINO: - - # compile model - compiled_model = ov.compile_model(ov_model) - - # prepare input_data - import numpy as np - input_data = np.random.rand(1, 3, 224, 224) - - # run inference - result = compiled_model(input_data) - -In Option 1, where the ``openvino.save_model`` function is used, an OpenVINO model is serialized in the file system as two files with ``.xml`` and ``.bin`` extensions. This pair of files is called OpenVINO Intermediate Representation format (OpenVINO IR, or just IR) and useful for efficient model deployment. OpenVINO IR can be loaded into another application for inference using the ``openvino.Core.read_model`` function. For more details, refer to the :doc:`OpenVINO™ Runtime documentation `. - -Option 2, where ``openvino.compile_model`` is used, provides a convenient way to quickly switch from framework-based code to OpenVINO-based code in your existing Python inference application. In this case, the converted model is not saved to IR. Instead, the model is compiled and used for inference within the same application. - -Option 1 separates model conversion and model inference into two different applications. This approach is useful for deployment scenarios requiring fewer extra dependencies and faster model loading in the end inference application. - -For example, converting a PyTorch model to OpenVINO usually demands the ``torch`` Python module and Python. This process can take extra time and memory. But, after the converted model is saved as OpenVINO IR with ``openvino.save_model``, it can be loaded in a separate application without requiring the ``torch`` dependency and the time-consuming conversion. The inference application can be written in other languages supported by OpenVINO, for example, in C++, and Python installation is not necessary for it to run. - -Before saving the model to OpenVINO IR, consider applying :doc:`Post-training Optimization ` to enable more efficient inference and smaller model size. - -The figure below illustrates the typical workflow for deploying a trained deep-learning model. - -.. image:: ./_static/images/model_conversion_diagram.svg - :alt: model conversion diagram - -Convert a Model in CLI: ``ovc`` -############################### - -Another option for model conversion is to use ``ovc`` command-line tool, which stands for OpenVINO Model Converter. The tool combines both ``openvino.convert_model`` and ``openvino.save_model`` functionalities. It is convenient to use when the original model is ready for inference and is in one of the supported file formats: ONNX, TensorFlow, TensorFlow Lite, or PaddlePaddle. As a result, ``ovc`` produces an OpenVINO IR, consisting of ``.xml`` and ``.bin`` files, which needs to be read with the ``openvino.Core.read_model`` method. You can compile and infer the ``ov.Model`` later with :doc:`OpenVINO™ Runtime ` - -.. note:: - PyTorch models cannot be converted with ``ovc``, use ``openvino.convert_model`` instead. - -The results of both ``ovc`` and ``openvino.convert_model``/``openvino.save_model`` conversion methods are the same. You can choose either of them based on your convenience. 
Note that there should not be any differences in the results of model conversion if the same set of parameters is used and the model is saved into OpenVINO IR. - - - - -Additional Resources -#################### - -The following articles describe in details how to obtain and prepare your model depending on the source model type: - -* :doc:`Convert different model formats to the ov.Model format `. -* :doc:`Review all available conversion parameters `. - -To achieve the best model inference performance and more compact OpenVINO IR representation follow: - -* :doc:`Post-training optimization ` -* :doc:`Model inference in OpenVINO Runtime ` - -If you are using legacy conversion API (``mo`` or ``openvino.tools.mo.convert_model``), please refer to the following materials: - -* :doc:`Transition from legacy mo and ov.tools.mo.convert_model ` -* :doc:`Legacy Model Conversion API ` - - - - -.. api/ie_python_api/_autosummary/openvino.Model.html is a broken link for some reason - need to investigate python api article generation - - -@endsphinxdirective diff --git a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats.md b/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats.md deleted file mode 100644 index 6ff5e620f10d78..00000000000000 --- a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats.md +++ /dev/null @@ -1,33 +0,0 @@ -# Supported Model Formats {#Supported_Model_Formats} - -@sphinxdirective - -.. toctree:: - :maxdepth: 1 - :hidden: - - openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_PyTorch - openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_TensorFlow - openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_ONNX - openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_TensorFlow_Lite - openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_Paddle - - -**OpenVINO IR (Intermediate Representation)** - the proprietary format of OpenVINO™, benefiting from the full extent of its features. The result of running ``ovc`` CLI tool or ``openvino.save_model`` is OpenVINO IR. All other supported formats can be converted to the IR, refer to the following articles for details on conversion: - -* :doc:`How to convert PyTorch ` -* :doc:`How to convert ONNX ` -* :doc:`How to convert TensorFlow ` -* :doc:`How to convert TensorFlow Lite ` -* :doc:`How to convert PaddlePaddle ` - -To choose the best workflow for your application, read the :doc:`Model Preparation section ` - -Refer to the list of all supported conversion options in :doc:`Conversion Parameters ` - -Additional Resources -#################### - -* :doc:`Transition guide from the legacy to new conversion API ` - -@endsphinxdirective diff --git a/docs/articles_en/openvino_workflow/model_preparation.md b/docs/articles_en/openvino_workflow/model_preparation.md new file mode 100644 index 00000000000000..dbc34640a992cc --- /dev/null +++ b/docs/articles_en/openvino_workflow/model_preparation.md @@ -0,0 +1,263 @@ +# Model Preparation {#openvino_docs_model_processing_introduction} + +@sphinxdirective + +.. meta:: + :description: Preparing models for OpenVINO Runtime. Learn about the methods + used to read, convert and compile models from different frameworks. + +.. 
toctree::
+   :maxdepth: 1
+   :hidden:
+
+
+   Conversion Parameters 
+   Setting Input Shapes 
+   Convert from PyTorch 
+   Convert from TensorFlow 
+   Convert from ONNX 
+   Convert from TensorFlow_Lite 
+   Convert from PaddlePaddle 
+   Supported_Model_Formats 
+
+
+To start working with OpenVINO, you need to obtain a model in one of the
+:doc:`supported model formats `. The easiest way
+to do so is to download a pre-trained network from an online database, such as
+`TensorFlow Hub `__,
+`Hugging Face `__, or
+`Torchvision models `__.
+
+The OpenVINO workflow starts with converting the selected model to its
+proprietary format, :doc:`OpenVINO IR `
+(`openvino.Model `__ -
+`ov.Model `__).
+Although in most cases it can be done automatically, under the hood, explicit
+conversion may enable more optimization options and better performance.
+Explicit conversion is done in one of two ways:
+
+   * the Python API functions (``openvino.convert_model`` and ``openvino.save_model``)
+   * the ``ovc`` command line tool.
+
+   .. note::
+
+      Model conversion API prior to OpenVINO 2023.1 is considered deprecated.
+      Existing and new projects are recommended to transition to the new
+      solutions, keeping in mind that they are not fully backwards compatible
+      with ``openvino.tools.mo.convert_model`` or the ``mo`` CLI tool.
+      For more details, see the :doc:`Model Conversion API Transition Guide `.
+
+
+Convert a Model in Python: ``convert_model``
+##############################################
+
+The Model conversion API in Python uses the ``openvino.convert_model`` function,
+turning a given model into the ``openvino.Model`` object. The object may be used
+further, compiled and inferred, or saved to a drive as :doc:`OpenVINO IR `
+(``openvino.save_model`` produces a set of ``.xml`` and ``.bin`` files).
+
+See how to use ``openvino.convert_model`` with models from some of the most popular
+public repositories:
+
+.. tab-set::
+
+   .. tab-item:: Torchvision
+
+      .. code-block:: py
+         :force:
+
+         import openvino as ov
+         import torch
+         from torchvision.models import resnet50
+
+         model = resnet50(weights='DEFAULT')
+
+         # prepare input_data
+         input_data = torch.rand(1, 3, 224, 224)
+
+         ov_model = ov.convert_model(model, example_input=input_data)
+
+         ###### Option 1: Save to OpenVINO IR:
+
+         # save model to OpenVINO IR for later use
+         ov.save_model(ov_model, 'model.xml')
+
+         ###### Option 2: Compile and infer with OpenVINO:
+
+         # compile model
+         compiled_model = ov.compile_model(ov_model)
+
+         # run inference
+         result = compiled_model(input_data)
+
+   .. tab-item:: Hugging Face Transformers
+
+      .. code-block:: py
+
+         from transformers import BertTokenizer, BertModel
+
+         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+         model = BertModel.from_pretrained("bert-base-uncased")
+         text = "Replace me by any text you'd like."
+         encoded_input = tokenizer(text, return_tensors='pt')
+
+         import openvino as ov
+         ov_model = ov.convert_model(model, example_input={**encoded_input})
+
+         ###### Option 1: Save to OpenVINO IR:
+
+         # save model to OpenVINO IR for later use
+         ov.save_model(ov_model, 'model.xml')
+
+         ###### Option 2: Compile and infer with OpenVINO:
+
+         # compile model
+         compiled_model = ov.compile_model(ov_model)
+
+         # prepare input_data using HF tokenizer or your own tokenizer
+         # encoded_input is reused here for simplicity
+
+         # run inference
+         result = compiled_model({**encoded_input})
+
+   .. tab-item:: Keras Applications
+
+      .. 
code-block:: py + + import tensorflow as tf + import openvino as ov + + tf_model = tf.keras.applications.ResNet50(weights="imagenet") + ov_model = ov.convert_model(tf_model) + + ###### Option 1: Save to OpenVINO IR: + + # save model to OpenVINO IR for later use + ov.save_model(ov_model, 'model.xml') + + ###### Option 2: Compile and infer with OpenVINO: + + # compile model + compiled_model = ov.compile_model(ov_model) + + # prepare input_data + import numpy as np + input_data = np.random.rand(1, 224, 224, 3) + + # run inference + result = compiled_model(input_data) + + .. tab-item:: TensorFlow Hub + + .. code-block:: py + + import tensorflow as tf + import tensorflow_hub as hub + import openvino as ov + + model = tf.keras.Sequential([ + hub.KerasLayer("https://tfhub.dev/google/imagenet/mobilenet_v1_100_224/classification/5") + ]) + + # Check model page for information about input shape: https://tfhub.dev/google/imagenet/mobilenet_v1_100_224/classification/5 + model.build([None, 224, 224, 3]) + + model.save('mobilenet_v1_100_224') # use a temporary directory + ov_model = ov.convert_model('mobilenet_v1_100_224') + + ###### Option 1: Save to OpenVINO IR: + + ov.save_model(ov_model, 'model.xml') + + ###### Option 2: Compile and infer with OpenVINO: + + compiled_model = ov.compile_model(ov_model) + + # prepare input_data + import numpy as np + input_data = np.random.rand(1, 224, 224, 3) + + # run inference + result = compiled_model(input_data) + + .. tab-item:: ONNX Model Hub + + .. code-block:: py + + import onnx + + model = onnx.hub.load("resnet50") + onnx.save(model, 'resnet50.onnx') # use a temporary file for model + + import openvino as ov + ov_model = ov.convert_model('resnet50.onnx') + + ###### Option 1: Save to OpenVINO IR: + + # save model to OpenVINO IR for later use + ov.save_model(ov_model, 'model.xml') + + ###### Option 2: Compile and infer with OpenVINO: + + # compile model + compiled_model = ov.compile_model(ov_model) + + # prepare input_data + import numpy as np + input_data = np.random.rand(1, 3, 224, 224) + + # run inference + result = compiled_model(input_data) + + +* Saving the model, **Option 1**, is used as a separate step, outside of deployment. + The file it provides is then used in the final software solution, resulting in + maximum performance due to fewer dependencies and faster model loading. + +* Compiling the model, **Option 2**, provides a convenient way to quickly switch from + framework-based code to OpenVINO-based code in your existing Python inference application. + The converted model is not saved to IR but compiled and used for inference within the same application. + +Before saving the model to OpenVINO IR, consider :doc:`Post-training Optimization ` +to achieve more efficient inference and smaller model size. + + +Convert a Model in CLI: ``ovc`` +############################### + +``ovc`` is a command-line model converter, combining the ``openvino.convert_model`` +and ``openvino.save_model`` functionalities, providing the exact same results, if the same set of +parameters is used for saving into OpenVINO IR. It converts files from one of the +:doc:`supported model formats ` to :doc:`OpenVINO IR `, which can then be read, compiled, +and run by the final inference application. + +.. note:: + PyTorch models cannot be converted with ``ovc``, use ``openvino.convert_model`` instead. 
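To make the two-step flow concrete, here is a minimal sketch of the inference side, assuming ``ovc`` was already run on a model and produced ``model.xml`` with ``model.bin`` next to it; the file names and the 1x3x224x224 input shape are illustrative assumptions:

.. code-block:: py

   import numpy as np
   import openvino as ov

   core = ov.Core()

   # read the IR produced earlier by `ovc` (model.xml with model.bin beside it)
   model = core.read_model("model.xml")

   # compile for a device; "AUTO" lets OpenVINO pick an available one
   compiled_model = core.compile_model(model, "AUTO")

   # run inference on dummy data shaped like the assumed model input
   input_data = np.random.rand(1, 3, 224, 224).astype(np.float32)
   result = compiled_model(input_data)

Note that this application no longer needs the original framework installed, which is the main benefit of the separate ``ovc``-based conversion step described above.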
+ + + +Additional Resources +#################### + +The following articles describe in detail how to obtain and prepare your model depending on the source model type: + +* :doc:`Convert different model formats to the ov.Model format `. +* :doc:`Review all available conversion parameters `. + +To achieve the best model inference performance and more compact OpenVINO IR representation follow: + +* :doc:`Post-training optimization ` +* :doc:`Model inference in OpenVINO Runtime ` + +If you are still using the legacy conversion API (``mo`` or ``openvino.tools.mo.convert_model``), please refer to the following materials: + +* :doc:`Transition from legacy mo and ov.tools.mo.convert_model ` +* :doc:`Legacy Model Conversion API ` + + + + +.. need to investigate python api article generation - api/ie_python_api/_autosummary/openvino.Model.html does not exist, api/ie_python_api/_autosummary/openvino.runtime.Model.html does. + + +@endsphinxdirective diff --git a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_ONNX.md b/docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_ONNX.md similarity index 100% rename from docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_ONNX.md rename to docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_ONNX.md diff --git a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_Paddle.md b/docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_Paddle.md similarity index 100% rename from docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_Paddle.md rename to docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_Paddle.md diff --git a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_PyTorch.md b/docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_PyTorch.md similarity index 100% rename from docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_PyTorch.md rename to docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_PyTorch.md diff --git a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_TensorFlow.md b/docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_TensorFlow.md similarity index 99% rename from docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_TensorFlow.md rename to docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_TensorFlow.md index bec51f537cd541..e74b45cbc82a91 100644 --- a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_TensorFlow.md +++ b/docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_TensorFlow.md @@ -3,8 +3,8 @@ @sphinxdirective .. meta:: - :description: Learn how to convert a model from a - TensorFlow format to the OpenVINO Model. + :description: Learn how to convert a model from a TensorFlow format to the OpenVINO Model. + This page provides general instructions on how to run model conversion from a TensorFlow format to the OpenVINO IR format. The instructions are different depending on whether your model was created with TensorFlow v1.X or TensorFlow v2.X. 
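As a quick illustration of the conversion flow this page describes, the sketch below converts a TensorFlow 2.X SavedModel to OpenVINO IR; the directory name ``my_saved_model`` is an assumption for the example:

.. code-block:: py

   import openvino as ov

   # "my_saved_model" is a placeholder for a TensorFlow 2.X SavedModel directory
   ov_model = ov.convert_model("my_saved_model")

   # serialize to OpenVINO IR (model.xml + model.bin) for later use
   ov.save_model(ov_model, "model.xml")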
diff --git a/docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_TensorFlow_Lite.md b/docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_TensorFlow_Lite.md
similarity index 100%
rename from docs/articles_en/openvino_workflow/model_introduction/supported_model_formats/Convert_Model_From_TensorFlow_Lite.md
rename to docs/articles_en/openvino_workflow/model_preparation/Convert_Model_From_TensorFlow_Lite.md
diff --git a/docs/articles_en/openvino_workflow/model_introduction/Deep_Learning_Model_Optimizer_DevGuide.md b/docs/articles_en/openvino_workflow/model_preparation/conversion_parameters.md
similarity index 99%
rename from docs/articles_en/openvino_workflow/model_introduction/Deep_Learning_Model_Optimizer_DevGuide.md
rename to docs/articles_en/openvino_workflow/model_preparation/conversion_parameters.md
index 94b01c4299179d..ca562ab46ac948 100644
--- a/docs/articles_en/openvino_workflow/model_introduction/Deep_Learning_Model_Optimizer_DevGuide.md
+++ b/docs/articles_en/openvino_workflow/model_preparation/conversion_parameters.md
@@ -2,8 +2,6 @@
 
 @sphinxdirective
 
-.. _deep learning model optimizer:
-
 .. meta::
    :description: Model Conversion API provides several parameters to adjust model conversion.
 
diff --git a/docs/articles_en/openvino_workflow/model_introduction/Converting_Model.md b/docs/articles_en/openvino_workflow/model_preparation/setting_input_shapes.md
similarity index 82%
rename from docs/articles_en/openvino_workflow/model_introduction/Converting_Model.md
rename to docs/articles_en/openvino_workflow/model_preparation/setting_input_shapes.md
index 24fa33c17f4a94..0a4b25000904d3 100644
--- a/docs/articles_en/openvino_workflow/model_introduction/Converting_Model.md
+++ b/docs/articles_en/openvino_workflow/model_preparation/setting_input_shapes.md
@@ -1,26 +1,25 @@
 # Setting Input Shapes {#openvino_docs_OV_Converter_UG_prepare_model_convert_model_Converting_Model}
 
-With model conversion API you can increase your model's efficiency by providing an additional shape definition using the ``input`` parameter.
-
 @sphinxdirective
 
 .. meta::
    :description: Learn how to increase the efficiency of a model by providing an additional shape definition with the ``input`` parameter of ``openvino.convert_model`` and ``ovc``.
 
-.. _when_to_specify_input_shapes:
-Specifying Shapes in the ``input`` Parameter
-#####################################################
+``openvino.convert_model`` supports conversion of models with dynamic input shapes that
+contain undefined dimensions. However, if the shape of data is not going to change from
+one inference request to another, it is recommended to **set up static shapes**
+(all dimensions are fully defined) for the inputs, using the ``input`` parameter.
+Doing so at the model preparation stage, not at runtime, can be beneficial in terms of
+performance and memory consumption.
 
-``openvino.convert_model`` supports conversion of models with dynamic input shapes that contain undefined dimensions.
-However, if the shape of data is not going to change from one inference request to another,
-it is recommended to set up static shapes (when all dimensions are fully defined) for the inputs.
-Doing it at this stage, instead of during inference in runtime, can be beneficial in terms of performance and memory consumption.
-To set up static shapes, model conversion API provides the ``input`` parameter.
-For more information on changing input shapes in runtime, refer to the :doc:`Changing input shapes ` guide.
-To learn more about dynamic shapes in runtime, refer to the :doc:`Dynamic Shapes ` guide.
+For more information on changing input shapes in runtime, refer to the
+:doc:`Changing input shapes ` guide.
+To learn more about dynamic shapes in runtime, refer to the
+:doc:`Dynamic Shapes ` guide.
 
-The OpenVINO Runtime API may present certain limitations in inferring models with undefined dimensions on some hardware. See the :doc:`Features support matrix ` for reference.
+The OpenVINO Runtime API may present certain limitations in inferring models with undefined dimensions on some hardware.
+See the :doc:`Feature support matrix ` for reference.
 In this case, the ``input`` parameter and the :doc:`reshape method ` can help to resolve undefined dimensions.
 
 For example, run model conversion for the TensorFlow MobileNet model with the single input
@@ -138,4 +137,4 @@ In practice, not every model is designed in a way that allows change of input sh
 For more information about shape follow the :doc:`inference troubleshooting `
 and :ref:`ways to relax shape inference flow ` guides.
 
-@endsphinxdirective
+@endsphinxdirective
\ No newline at end of file
diff --git a/docs/articles_en/openvino_workflow/model_preparation/supported_model_formats.md b/docs/articles_en/openvino_workflow/model_preparation/supported_model_formats.md
new file mode 100644
index 00000000000000..7688568e218f04
--- /dev/null
+++ b/docs/articles_en/openvino_workflow/model_preparation/supported_model_formats.md
@@ -0,0 +1,607 @@
+# Supported Model Formats {#Supported_Model_Formats}
+
+@sphinxdirective
+
+.. meta::
+   :description: Learn about supported model formats and the methods used to convert, read, and compile them in OpenVINO™.
+
+
+| **OpenVINO IR (Intermediate Representation)**
+| The proprietary format of OpenVINO™, benefiting from the full extent of its features.
+  It is obtained by :doc:`converting a model `
+  from one of the remaining supported formats using the Python model conversion API or the
+  OpenVINO Converter.
+| Consider storing your model in this format to minimize first-inference latency,
+  perform model optimizations, and save space on your drive, in some cases.
+
+
+| **PyTorch, TensorFlow, TensorFlow Lite, ONNX, and PaddlePaddle**
+| These supported model formats can be read, compiled, and converted to OpenVINO IR,
+  either automatically or explicitly.
+
+
+In the Python API, these options are provided as three separate methods:
+``read_model()``, ``compile_model()``, and ``convert_model()``.
+
+The ``convert_model()`` method enables you to perform additional adjustments
+to the model, such as setting shapes, changing model input types or layouts,
+cutting parts of the model, freezing inputs, etc. For a detailed description
+of the conversion process, see the
+:doc:`model conversion guide `.
+
+
+
+
+
+Note that for PyTorch models, the Python API
+is the only conversion option.
+
+TensorFlow may present additional considerations; see
+:doc:`TensorFlow Frontend Capabilities and Limitations `.
+
+
+
+
+
+
+
+
+
+
+Here are code examples of how to use these methods with different model formats:
+
+.. tab-set::
+
+   .. tab-item:: PyTorch
+      :sync: torch
+
+      .. tab-set::
+
+         .. tab-item:: Python
+            :sync: py
+
+            * The ``convert_model()`` method:
+
+              This is the only method applicable to PyTorch models.
+
+              .. 
dropdown:: List of supported formats: + + * **Python objects**: + + * ``torch.nn.Module`` + * ``torch.jit.ScriptModule`` + * ``torch.jit.ScriptFunction`` + + .. code-block:: py + :force: + + model = torchvision.models.resnet50(weights='DEFAULT') + ov_model = convert_model(model) + compiled_model = core.compile_model(ov_model, "AUTO") + + For more details on conversion, refer to the + :doc:`guide ` + and an example `tutorial `__ + on this topic. + + .. tab-item:: TensorFlow + :sync: tf + + .. tab-set:: + + .. tab-item:: Python + :sync: py + + * The ``convert_model()`` method: + + When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. + + .. dropdown:: List of supported formats: + + * **Files**: + + * SavedModel - ```` or ``.pb`` + * Checkpoint - ``.pb`` or ``.pbtxt`` + * MetaGraph - ``.meta`` + + * **Python objects**: + + * ``tf.keras.Model`` + * ``tf.keras.layers.Layer`` + * ``tf.Module`` + * ``tf.compat.v1.Graph`` + * ``tf.compat.v1.GraphDef`` + * ``tf.function`` + * ``tf.compat.v1.session`` + * ``tf.train.checkpoint`` + + .. code-block:: py + :force: + + ov_model = convert_model("saved_model.pb") + compiled_model = core.compile_model(ov_model, "AUTO") + + For more details on conversion, refer to the + :doc:`guide ` + and an example `tutorial `__ + on this topic. + + * The ``read_model()`` and ``compile_model()`` methods: + + .. dropdown:: List of supported formats: + + * **Files**: + + * SavedModel - ```` or ``.pb`` + * Checkpoint - ``.pb`` or ``.pbtxt`` + * MetaGraph - ``.meta`` + + .. code-block:: py + :force: + + ov_model = read_model("saved_model.pb") + compiled_model = core.compile_model(ov_model, "AUTO") + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + For TensorFlow format, see :doc:`TensorFlow Frontend Capabilities and Limitations `. + + .. tab-item:: C++ + :sync: cpp + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * SavedModel - ```` or ``.pb`` + * Checkpoint - ``.pb`` or ``.pbtxt`` + * MetaGraph - ``.meta`` + + .. code-block:: cpp + + ov::CompiledModel compiled_model = core.compile_model("saved_model.pb", "AUTO"); + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: C + :sync: c + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * SavedModel - ```` or ``.pb`` + * Checkpoint - ``.pb`` or ``.pbtxt`` + * MetaGraph - ``.meta`` + + .. code-block:: c + + ov_compiled_model_t* compiled_model = NULL; + ov_core_compile_model_from_file(core, "saved_model.pb", "AUTO", 0, &compiled_model); + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: CLI + :sync: cli + + You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. + + .. code-block:: sh + + mo --input_model .pb + + For details on the conversion, refer to the + :doc:`article `. + + .. tab-item:: TensorFlow Lite + :sync: tflite + + .. tab-set:: + + .. 
tab-item:: Python + :sync: py + + * The ``convert_model()`` method: + + When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.tflite`` + + .. code-block:: py + :force: + + ov_model = convert_model(".tflite") + compiled_model = core.compile_model(ov_model, "AUTO") + + For more details on conversion, refer to the + :doc:`guide ` + and an example `tutorial `__ + on this topic. + + + * The ``read_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.tflite`` + + .. code-block:: py + :force: + + ov_model = read_model(".tflite") + compiled_model = core.compile_model(ov_model, "AUTO") + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.tflite`` + + .. code-block:: py + :force: + + compiled_model = core.compile_model(".tflite", "AUTO") + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + + .. tab-item:: C++ + :sync: cpp + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.tflite`` + + .. code-block:: cpp + + ov::CompiledModel compiled_model = core.compile_model(".tflite", "AUTO"); + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: C + :sync: c + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.tflite`` + + .. code-block:: c + + ov_compiled_model_t* compiled_model = NULL; + ov_core_compile_model_from_file(core, ".tflite", "AUTO", 0, &compiled_model); + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: CLI + :sync: cli + + * The ``convert_model()`` method: + + You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.tflite`` + + .. code-block:: sh + + mo --input_model .tflite + + For details on the conversion, refer to the + :doc:`article `. + + .. tab-item:: ONNX + :sync: onnx + + .. tab-set:: + + .. tab-item:: Python + :sync: py + + * The ``convert_model()`` method: + + When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.onnx`` + + .. code-block:: py + :force: + + ov_model = convert_model(".onnx") + compiled_model = core.compile_model(ov_model, "AUTO") + + For more details on conversion, refer to the + :doc:`guide ` + and an example `tutorial `__ + on this topic. + + + * The ``read_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.onnx`` + + .. 
code-block:: py + :force: + + ov_model = read_model(".onnx") + compiled_model = core.compile_model(ov_model, "AUTO") + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.onnx`` + + .. code-block:: py + :force: + + compiled_model = core.compile_model(".onnx", "AUTO") + + For a guide on how to run inference, see how to :doc:`Integrate OpenVINO™ with Your Application `. + + + .. tab-item:: C++ + :sync: cpp + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.onnx`` + + .. code-block:: cpp + + ov::CompiledModel compiled_model = core.compile_model(".onnx", "AUTO"); + + For a guide on how to run inference, see how to :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: C + :sync: c + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.onnx`` + + .. code-block:: c + + ov_compiled_model_t* compiled_model = NULL; + ov_core_compile_model_from_file(core, ".onnx", "AUTO", 0, &compiled_model); + + For details on the conversion, refer to the :doc:`article ` + + .. tab-item:: CLI + :sync: cli + + * The ``convert_model()`` method: + + You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.onnx`` + + .. code-block:: sh + + mo --input_model .onnx + + For details on the conversion, refer to the + :doc:`article ` + + .. tab-item:: PaddlePaddle + :sync: pdpd + + .. tab-set:: + + .. tab-item:: Python + :sync: py + + * The ``convert_model()`` method: + + When you use the ``convert_model()`` method, you have more control and you can specify additional adjustments for ``ov.Model``. The ``read_model()`` and ``compile_model()`` methods are easier to use, however, they do not have such capabilities. With ``ov.Model`` you can choose to optimize, compile and run inference on it or serialize it into a file for subsequent use. + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.pdmodel`` + + * **Python objects**: + + * ``paddle.hapi.model.Model`` + * ``paddle.fluid.dygraph.layers.Layer`` + * ``paddle.fluid.executor.Executor`` + + .. code-block:: py + :force: + + ov_model = convert_model(".pdmodel") + compiled_model = core.compile_model(ov_model, "AUTO") + + For more details on conversion, refer to the + :doc:`guide ` + and an example `tutorial `__ + on this topic. + + * The ``read_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.pdmodel`` + + .. code-block:: py + :force: + + ov_model = read_model(".pdmodel") + compiled_model = core.compile_model(ov_model, "AUTO") + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.pdmodel`` + + .. code-block:: py + :force: + + compiled_model = core.compile_model(".pdmodel", "AUTO") + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: C++ + :sync: cpp + + * The ``compile_model()`` method: + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.pdmodel`` + + .. code-block:: cpp + + ov::CompiledModel compiled_model = core.compile_model(".pdmodel", "AUTO"); + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: C + :sync: c + + * The ``compile_model()`` method: + + .. 
dropdown:: List of supported formats: + + * **Files**: + + * ``.pdmodel`` + + .. code-block:: c + + ov_compiled_model_t* compiled_model = NULL; + ov_core_compile_model_from_file(core, ".pdmodel", "AUTO", 0, &compiled_model); + + For a guide on how to run inference, see how to + :doc:`Integrate OpenVINO™ with Your Application `. + + .. tab-item:: CLI + :sync: cli + + * The ``convert_model()`` method: + + You can use ``mo`` command-line tool to convert a model to IR. The obtained IR can then be read by ``read_model()`` and inferred. + + .. dropdown:: List of supported formats: + + * **Files**: + + * ``.pdmodel`` + + .. code-block:: sh + + mo --input_model .pdmodel + + For details on the conversion, refer to the + :doc:`article `. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +* :doc:`How to convert PyTorch ` +* :doc:`How to convert ONNX ` +* :doc:`How to convert TensorFlow ` +* :doc:`How to convert TensorFlow Lite ` +* :doc:`How to convert PaddlePaddle ` + +To choose the best workflow for your application, read the :doc:`Model Preparation section ` + +Refer to the list of all supported conversion options in :doc:`Conversion Parameters ` + +Additional Resources +#################### + +* :doc:`Transition guide from the legacy to new conversion API ` + +@endsphinxdirective diff --git a/docs/articles_en/openvino_workflow/openvino_intro/ShapeInference.md b/docs/articles_en/openvino_workflow/openvino_intro/ShapeInference.md index b6e95b74ca8421..c6e4fd8b6882e7 100644 --- a/docs/articles_en/openvino_workflow/openvino_intro/ShapeInference.md +++ b/docs/articles_en/openvino_workflow/openvino_intro/ShapeInference.md @@ -9,7 +9,8 @@ troubleshooting_reshape_errors .. meta:: - :description: OpenVINO™ allows changing model input shape during the runtime when the provided input has a different size than the model's input shape. + :description: OpenVINO™ allows changing model input shape during the runtime when the provided + input has a different size than the model's input shape. OpenVINO™ enables you to change model input shape during the application runtime. @@ -24,7 +25,7 @@ The following instructions are for cases where you need to change the model inpu The reshape method -++++++++++++++++++++ +######################## The reshape method is used as ``ov::Model::reshape`` in C++ and `Model.reshape `__ @@ -166,7 +167,7 @@ You can find the usage scenarios of the ``reshape`` method in nor the ``reshape`` method. The set_batch method -++++++++++++++++++++ +######################## The meaning of the model batch may vary depending on the model design. To change the batch dimension of the model, :ref:`set the layout ` and call the ``set_batch`` method. diff --git a/docs/articles_en/openvino_workflow/openvino_intro.md b/docs/articles_en/openvino_workflow/running_inference_with_openvino.md similarity index 58% rename from docs/articles_en/openvino_workflow/openvino_intro.md rename to docs/articles_en/openvino_workflow/running_inference_with_openvino.md index 2937189c136a01..20dfce6162042d 100644 --- a/docs/articles_en/openvino_workflow/openvino_intro.md +++ b/docs/articles_en/openvino_workflow/running_inference_with_openvino.md @@ -1,9 +1,7 @@ -# Inference with OpenVINO Runtime {#openvino_docs_OV_UG_OV_Runtime_User_Guide} +# Running Inference with OpenVINO™ {#openvino_docs_OV_UG_OV_Runtime_User_Guide} @sphinxdirective -.. _deep learning openvino runtime: - .. toctree:: :maxdepth: 1 :hidden: @@ -17,12 +15,21 @@ Optimize Inference .. 
meta::
-   :description: OpenVINO Runtime is an API comprised of a set of C++ libraries
-                 with C and Python bindings and it delivers inference solutions
-                 on different platforms.
+   :description: Learn how to run inference using OpenVINO.
+
+OpenVINO Runtime is a set of C++ libraries with C and Python bindings providing a common API
+to deploy inference on the platform of your choice. You can run any of the
+:doc:`supported model formats ` directly or convert the model
+and save it to the :doc:`OpenVINO IR ` format, for maximum performance.
 
-OpenVINO Runtime is a set of C++ libraries with C and Python bindings providing a common API to deliver inference solutions on the platform of your choice. Use the OpenVINO Runtime API to read PyTorch, TensorFlow, TensorFlow Lite, ONNX, and PaddlePaddle models and execute them on preferred devices. OpenVINO gives you the option to use these models directly or convert them to the OpenVINO IR (Intermediate Representation) format explicitly, for maximum performance.
+Why is OpenVINO IR inference faster? Even if you run a supported model directly, it is
+converted before inference. It may happen automatically, under the hood, for maximum convenience,
+but it is not suited for the most performance-oriented use cases. For example, converting PyTorch
+usually requires Python and the ``torch`` module, which take extra time and memory, on top of the
+conversion process itself. If OpenVINO IR is used instead, it does not require any conversion,
+nor the additional dependencies, as the inference application can be written in C or C++.
+OpenVINO IR provides by far the best first-inference latency scores.
 
 .. note::
 
@@ -33,7 +40,7 @@ OpenVINO Runtime is a set of C++ libraries with C and Python bindings providing
    Note that TensorFlow models can be run using the :doc:`torch.compile feature `,
    as well as the standard ways of :doc:`converting TensorFlow `
-   or reading them directly.
+   or running their inference.
 
 OpenVINO Runtime uses a plugin architecture. Its plugins are software components that contain complete implementation for inference on a particular Intel® hardware device: CPU, GPU, GNA, etc. Each plugin implements the unified API and provides additional hardware-specific APIs for configuring devices or API interoperability between OpenVINO Runtime and underlying plugin backend.
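To illustrate the plugin architecture in practice, the short sketch below lists the device plugins available on a machine and compiles a model for an explicit device instead of ``AUTO``; the IR path is a placeholder:

.. code-block:: py

   import openvino as ov

   core = ov.Core()

   # each entry corresponds to an available device plugin, e.g. ['CPU', 'GPU']
   print(core.available_devices)

   # compile the same model for an explicit plugin rather than letting "AUTO" choose
   model = core.read_model("model.xml")  # placeholder IR path
   compiled_on_cpu = core.compile_model(model, "CPU")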
diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 10b82b2b75ffcb..8276d5a3188970 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -27,7 +27,7 @@ elseif(AARCH64) if(APPLE) # Apple M1 / M2 is assumed set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8.2-a) - add_definitions(-DOV_CPU_ARM_ENABLE_FP16) + add_compile_definitions(OV_CPU_ARM_ENABLE_FP16) else() set(OV_CPU_ARM_TARGET_ARCH_DEFAULT arm64-v8a) endif() diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp index c57694a072f9e7..6d4b6a72e5819a 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/single_layer_tests/pooling.cpp @@ -4,209 +4,184 @@ #include -#include "single_layer_tests/pooling.hpp" +#include "single_op_tests/pooling.hpp" #include "common_test_utils/test_constants.hpp" -using namespace LayerTestsDefinitions; - namespace { -// Common params -const std::vector inputPrecisions = { - InferenceEngine::Precision::FP32, -// InferenceEngine::Precision::FP16, // "[NOT_IMPLEMENTED] Input image format FP16 is not supported yet... - InferenceEngine::Precision::U8, -// InferenceEngine::Precision::I8 // Too much cases -}; +using ov::test::PoolingLayerTest; +using ov::test::MaxPoolingV8LayerTest; +using ov::test::utils::PoolingTypes; +using ov::test::poolSpecificParams; -const std::vector netPrecisions = { - InferenceEngine::Precision::FP16 +const std::vector model_types = { + ov::element::f16 }; const std::vector> kernels = {{3, 3}, {3, 5}}; -const std::vector> kernel3D = {{2, 2, 2}}; +const std::vector> kernel_3d = {{2, 2, 2}}; const std::vector> strides = {{1, 1}, {1, 2}, {2, 1}, {2, 2}}; -const std::vector> strides3D = {{1, 1, 1}, - {2, 2, 2}}; -const std::vector> stridess3D = {{2, 2, 2}}; -const std::vector> padBegins = {{0, 0}, - {0, 2}}; -const std::vector> padBegins3D = {{0, 0, 0}}; -const std::vector> padEnds = {{0, 0}, - {0, 2}}; -const std::vector> padEnds3D = {{0, 0, 0}}; -const std::vector roundingTypes = {ngraph::op::RoundingType::CEIL, - ngraph::op::RoundingType::FLOOR}; + +const std::vector> strides_3d = {{1, 1, 1}, + {2, 2, 2}}; + +const std::vector> pad_begins = {{0, 0}, + {0, 2}}; + +const std::vector> pad_begins_3d = {{0, 0, 0}}; + +const std::vector> pad_ends = {{0, 0}, + {0, 2}}; + +const std::vector> pad_ends_3d = {{0, 0, 0}}; + ////* ========== Max Polling ========== */ /* +========== Explicit Pad Floor Rounding ========== */ +std::vector input_shapes_static = {{1, 3, 30, 30}}; + const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), + ::testing::Values(PoolingTypes::MAX), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::EXPLICIT), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_FloorRounding, PoolingLayerTest, ::testing::Combine( 
maxPool_ExplicitPad_FloorRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* +========== Same Upper Pad Floor Rounding ========== */ const auto maxPool_SameUpperPad_FloorRounding_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), + ::testing::Values(PoolingTypes::MAX), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_UPPER), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_UPPER), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_SameUpperPad_FloorRounding, PoolingLayerTest, ::testing::Combine( maxPool_SameUpperPad_FloorRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* +========== Same Lower Pad Floor Rounding ========== */ const auto maxPool_SameLowerPad_FloorRounding_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), + ::testing::Values(PoolingTypes::MAX), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_LOWER), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_LOWER), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_SameLowerPad_FloorRounding, PoolingLayerTest, ::testing::Combine( maxPool_SameUpperPad_FloorRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Explicit Pad Floor Rounding 5D input========== */ + +std::vector input_shapes_5d_static = {{32, 32, 2, 2, 2}}; + const auto maxPool_ExplicitPad_FloorRounding_5Dinput_Params 
= ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::EXPLICIT), + ::testing::Values(PoolingTypes::MAX), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_FloorRounding_5Dinput, PoolingLayerTest, ::testing::Combine( maxPool_ExplicitPad_FloorRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Same Upper Pad Floor Rounding 5D input========== */ const auto maxPool_SameUpperPad_FloorRounding_5Dinput_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_UPPER), + ::testing::Values(PoolingTypes::MAX), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_UPPER), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_SameUpperPad_FloorRounding_5Dinput, PoolingLayerTest, ::testing::Combine( maxPool_SameUpperPad_FloorRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Same Lower Pad Ceil Rounding 5D input========== */ const auto maxPool_SameLowerPad_CeilRounding_5Dinput_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::op::RoundingType::CEIL), - ::testing::Values(ngraph::op::PadType::SAME_LOWER), + ::testing::Values(PoolingTypes::MAX), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + 
::testing::Values(ov::op::RoundingType::CEIL), + ::testing::Values(ov::op::PadType::SAME_LOWER), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_SameLowerPad_CeilRounding_5Dinput, PoolingLayerTest, ::testing::Combine( maxPool_SameLowerPad_CeilRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Explicit Pad Ceil Rounding ========== */ const auto maxPool_ExplicitPad_CeilRounding_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX), + ::testing::Values(PoolingTypes::MAX), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::op::RoundingType::CEIL), - ::testing::Values(ngraph::op::PadType::EXPLICIT), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::op::RoundingType::CEIL), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_CeilRounding, PoolingLayerTest, ::testing::Combine( maxPool_ExplicitPad_CeilRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); @@ -214,55 +189,49 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_CeilRounding, PoolingLayerTes ////* ========== Avg Pooling ========== */ /* +========== Explicit Pad Ceil Rounding ========== */ const auto avgPoolExplicitPadCeilRoundingParams = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::AVG), + ::testing::Values(PoolingTypes::AVG), ::testing::ValuesIn(kernels), - // TODO: Non 1 strides fails in ngraph reference implementation with error "The end corner is out of bounds at axis 3" thrown in the test body. + // TODO: Non-1 strides fail in the reference implementation with error "The end corner is out of bounds at axis 3" thrown in the test body.
::testing::ValuesIn(strides), ::testing::ValuesIn(std::vector>({{0, 0}, {1, 1}, {0, 1}})), ::testing::ValuesIn(std::vector>({{0, 0}, {1, 1}, {0, 1}})), - ::testing::Values(ngraph::op::RoundingType::CEIL), - ::testing::Values(ngraph::op::PadType::EXPLICIT), + ::testing::Values(ov::op::RoundingType::CEIL), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::Values(true, false) ); INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_ExplicitPad_CeilRounding, PoolingLayerTest, ::testing::Combine( avgPoolExplicitPadCeilRoundingParams, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); -std::vector psParams({poolSpecificParams(ngraph::helpers::PoolingTypes::AVG, {2, 2}, {2, 2}, {0, 0}, {0, 0}, - ngraph::op::RoundingType::CEIL, ngraph::op::PadType::EXPLICIT, false), - poolSpecificParams(ngraph::helpers::PoolingTypes::AVG, {7, 7}, {1, 1}, {0, 0}, {1, 1}, - ngraph::op::RoundingType::CEIL, ngraph::op::PadType::EXPLICIT, false)}); +std::vector psParams({poolSpecificParams(PoolingTypes::AVG, {2, 2}, {2, 2}, {0, 0}, {0, 0}, + ov::op::RoundingType::CEIL, ov::op::PadType::EXPLICIT, false), + poolSpecificParams(PoolingTypes::AVG, {7, 7}, {1, 1}, {0, 0}, {1, 1}, + ov::op::RoundingType::CEIL, ov::op::PadType::EXPLICIT, false)}); + +std::vector input_shapes_explicit_pad_ceil_rounding_corner_static = {{1, 3, 30, 30}}; INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_ExplicitPad_CeilRounding_corner, PoolingLayerTest, ::testing::Combine( ::testing::ValuesIn(psParams), - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 1024, 6, 6})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_explicit_pad_ceil_rounding_corner_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* +========== Explicit Pad Floor Rounding ========== */ const auto avgPoolExplicitPadFloorRoundingParams = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::AVG), + ::testing::Values(PoolingTypes::AVG), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::ValuesIn(std::vector>({{0, 0}, {1, 1}})), ::testing::ValuesIn(std::vector>({{0, 0}, {1, 1}})), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::EXPLICIT), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::Values(true, false) ); @@ -270,114 +239,96 @@ const auto avgPoolExplicitPadFloorRoundingParams = ::testing::Combine( INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_ExplicitPad_FloorRounding, PoolingLayerTest, ::testing::Combine( avgPoolExplicitPadFloorRoundingParams, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - 
::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Explicit Pad Floor Rounding 5D input========== */ const auto avgPool_ExplicitPad_FloorRounding_5Dinput_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::AVG), - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::EXPLICIT), + ::testing::Values(PoolingTypes::AVG), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::EXPLICIT), ::testing::Values(true, false) ); +std::vector input_shapes_5d_2_static = {{32, 32, 2, 2, 4}}; + INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_ExplicitPad_FloorRounding_5Dinput, PoolingLayerTest, ::testing::Combine( avgPool_ExplicitPad_FloorRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 4})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_2_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Same Upper Pad Floor Rounding 5D input========== */ const auto avgPool_SameUpperPad_FloorRounding_5Dinput_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::AVG), - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_UPPER), + ::testing::Values(PoolingTypes::AVG), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_UPPER), ::testing::Values(true) ); INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_SameUpperPad_FloorRounding_5Dinput, PoolingLayerTest, ::testing::Combine( avgPool_SameUpperPad_FloorRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 4})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_2_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); /* ========== Same Lower Pad Ceil Rounding 5D input========== */ const auto avgPool_SameLowerPad_CeilRounding_5Dinput_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::AVG), - 
::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::op::RoundingType::CEIL), - ::testing::Values(ngraph::op::PadType::SAME_LOWER), + ::testing::Values(PoolingTypes::AVG), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::op::RoundingType::CEIL), + ::testing::Values(ov::op::PadType::SAME_LOWER), ::testing::Values(true) ); INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_SameLowerPad_CeilRounding_5Dinput, PoolingLayerTest, ::testing::Combine( avgPool_SameLowerPad_CeilRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); ////* ========== Max Pooling V8 ========== */ const std::vector> dilation = {{1, 1}, {2, 2}}; -const std::vector> dilation3D = {{1, 1, 1}, {2, 2, 2}}; +const std::vector> dilation_3d = {{1, 1, 1}, {2, 2, 2}}; /* ========== Explicit Pad Floor Rounding ========== */ const auto maxPoolv8_ExplicitPad_FloorRounding_Params = ::testing::Combine( ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::ValuesIn(dilation), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::EXPLICIT) + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::EXPLICIT) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV8_ExplicitPad_FloorRounding, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_ExplicitPad_FloorRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); @@ -386,23 +337,19 @@ const auto maxPoolv8_SameUpperPad_FloorRounding_Params = ::testing::Combine( ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::ValuesIn(dilation), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_UPPER) + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_UPPER) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolv8_SameUpperPad_FloorRounding, MaxPoolingV8LayerTest, 
::testing::Combine( maxPoolv8_SameUpperPad_FloorRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); @@ -411,98 +358,82 @@ const auto maxPoolv8_SameLowerPad_FloorRounding_Params = ::testing::Combine( ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::ValuesIn(dilation), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_LOWER) + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_LOWER) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolv8_SameLowerPad_FloorRounding, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_SameLowerPad_FloorRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); /* ========= Explicit Pad Floor Rounding 5D input========== */ const auto maxPoolv8_ExplicitPad_FloorRounding_5Dinput_Params = ::testing::Combine( - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::Values(dilation3D[0]), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::Values(dilation_3d[0]), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::EXPLICIT) + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::EXPLICIT) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolv8_ExplicitPad_FloorRounding_5Dinput, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_ExplicitPad_FloorRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); /* ========= Same Upper Pad Floor Rounding 5D input========== */ const auto 
maxPoolv8_SameUpperPad_FloorRounding_5Dinput_Params = ::testing::Combine( - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(dilation3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(dilation_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::FLOOR), - ::testing::Values(ngraph::op::PadType::SAME_UPPER) + ::testing::Values(ov::op::RoundingType::FLOOR), + ::testing::Values(ov::op::PadType::SAME_UPPER) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolv8_SameUpperPad_FloorRounding_5Dinput, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_SameUpperPad_FloorRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); /* ========= Same Lower Pad Ceil Rounding 5D input========== */ const auto maxPoolv8_SameLowerPad_CeilRounding_5Dinput_Params = ::testing::Combine( - ::testing::ValuesIn(kernel3D), - ::testing::ValuesIn(strides3D), - ::testing::ValuesIn(dilation3D), - ::testing::ValuesIn(padBegins3D), - ::testing::ValuesIn(padEnds3D), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(kernel_3d), + ::testing::ValuesIn(strides_3d), + ::testing::ValuesIn(dilation_3d), + ::testing::ValuesIn(pad_begins_3d), + ::testing::ValuesIn(pad_ends_3d), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::CEIL), - ::testing::Values(ngraph::op::PadType::SAME_LOWER) + ::testing::Values(ov::op::RoundingType::CEIL), + ::testing::Values(ov::op::PadType::SAME_LOWER) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolv8_SameLowerPad_CeilRounding_5Dinput, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_SameLowerPad_CeilRounding_5Dinput_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({32, 32, 2, 2, 2})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_5d_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); @@ -511,49 +442,41 @@ const auto maxPoolv8_ExplicitPad_CeilRounding_Params = ::testing::Combine( ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::ValuesIn(dilation), - ::testing::ValuesIn(padBegins), - ::testing::ValuesIn(padEnds), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::ValuesIn(pad_begins), + ::testing::ValuesIn(pad_ends), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::CEIL), - ::testing::Values(ngraph::op::PadType::EXPLICIT) + 
::testing::Values(ov::op::RoundingType::CEIL), + ::testing::Values(ov::op::PadType::EXPLICIT) ); INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolv8_ExplicitPad_CeilRounding, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_ExplicitPad_CeilRounding_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); ////* ========== Avg and Max Polling Cases ========== */ /* ========== Valid Pad Rounding Not Applicable ========== */ const auto allPools_ValidPad_Params = ::testing::Combine( - ::testing::Values(ngraph::helpers::PoolingTypes::MAX, ngraph::helpers::PoolingTypes::AVG), + ::testing::Values(PoolingTypes::MAX, PoolingTypes::AVG), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::Values(std::vector({0, 0})), ::testing::Values(std::vector({0, 0})), ::testing::Values( - ngraph::op::RoundingType::FLOOR), // placeholder value - Rounding Type not applicable for Valid pad type - ::testing::Values(ngraph::op::PadType::VALID), + ov::op::RoundingType::FLOOR), // placeholder value - Rounding Type not applicable for Valid pad type + ::testing::Values(ov::op::PadType::VALID), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); INSTANTIATE_TEST_SUITE_P(smoke_MAX_and_AVGPool_ValidPad, PoolingLayerTest, ::testing::Combine( allPools_ValidPad_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), ::testing::Values(ov::test::utils::DEVICE_CPU)), PoolingLayerTest::getTestCaseName); @@ -563,21 +486,17 @@ const auto maxPoolv8_ValidPad_Params = ::testing::Combine( ::testing::ValuesIn(dilation), ::testing::Values(std::vector({0, 0})), ::testing::Values(std::vector({0, 0})), - ::testing::Values(ngraph::element::Type_t::i32), + ::testing::Values(ov::element::i32), ::testing::Values(0), - ::testing::Values(ngraph::op::RoundingType::FLOOR), // placeholder value - Rounding Type not applicable for Valid pad type - ::testing::Values(ngraph::op::PadType::VALID) + ::testing::Values(ov::op::RoundingType::FLOOR), // placeholder value - Rounding Type not applicable for Valid pad type + ::testing::Values(ov::op::PadType::VALID) ); INSTANTIATE_TEST_SUITE_P(smoke_MAXPoolv8_ValidPad, MaxPoolingV8LayerTest, ::testing::Combine( maxPoolv8_ValidPad_Params, - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::ValuesIn(model_types), + ::testing::Values(ov::test::static_shapes_to_test_representation(input_shapes_static)), 
::testing::Values(ov::test::utils::DEVICE_CPU)), MaxPoolingV8LayerTest::getTestCaseName); diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 1ae7ef4f76618e..17e62ca926397b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -257,6 +257,7 @@ REGISTER_FACTORY(v11, Interpolate); REGISTER_FACTORY(v11, TopK); // ------------------------------ Supported v12 ops ----------------------------- // +REGISTER_FACTORY(v12, GroupNormalization); REGISTER_FACTORY(v12, Pad); REGISTER_FACTORY(v12, ScatterElementsUpdate); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/condition.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/condition.hpp index 5ad37c7dfa55bf..4301f7cc10d31a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/condition.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/condition.hpp @@ -54,8 +54,8 @@ struct condition : public primitive_base { const std::vector& inputs, const branch& branch_true, const branch& branch_false, - const padding& output_padding = padding()) - : primitive_base(id, inputs, {output_padding}), + const size_t num_outputs = 1) + : primitive_base(id, inputs, {padding()}, {optional_data_type()}, num_outputs), branch_true(branch_true), branch_false(branch_false) {} diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/group_normalization.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/group_normalization.hpp new file mode 100644 index 00000000000000..d8c7e385f4c9a7 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/group_normalization.hpp @@ -0,0 +1,73 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "primitive.hpp" + +namespace cldnn { + +/// @brief Performs the following transformation of the input tensor: +/// y = scale * (x - mean) / sqrt(variance + epsilon) + bias +/// The operation is applied per batch, per group of channels. +struct group_normalization : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(group_normalization) + + group_normalization() : primitive_base("", {}) {} + + /// @brief Constructs group_normalization primitive. + /// @param id This primitive id. + /// @param data The input tensor to be normalized. + /// @param scale Scale values tensor. + /// @param bias Bias values. + /// @param num_groups Number of groups the channel dimension will be divided into. + /// @param epsilon A value added to the variance which ensures that division by zero + /// does not occur for any normalized element. + group_normalization(const primitive_id& id, + const input_info& data, + const input_info& scale, + const input_info& bias, + std::int64_t num_groups, + double epsilon, + const padding& output_padding = padding()) + : primitive_base(id, {data, scale, bias}, {output_padding}), num_groups{num_groups}, epsilon{epsilon} {} + + /// @brief Number of groups the channel dimension will be divided into + /// @details + /// Specifies the number of groups G that the channel dimension will be divided into. + std::int64_t num_groups{}; + + /// @brief A value added to the variance which ensures that division by zero does not occur. + /// @details + /// A very small value added to the variance for numerical stability. + /// Ensures that division by zero does not occur for any normalized element.
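As a reading aid for the formula documented above, here is a minimal host-side reference of what the primitive computes, assuming a flattened NCHW float buffer with per-channel scale and bias; this is an illustrative sketch only, not the plugin's GPU kernel:

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Reference for y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
// computed independently for every (batch, group) pair.
std::vector<float> group_normalize_ref(const std::vector<float>& x,
                                       const std::vector<float>& scale,   // per-channel
                                       const std::vector<float>& bias,    // per-channel
                                       std::size_t n, std::size_t c, std::size_t hw,
                                       std::int64_t num_groups, double epsilon) {
    std::vector<float> y(x.size());
    const std::size_t group_size = c / static_cast<std::size_t>(num_groups);
    for (std::size_t b = 0; b < n; ++b) {
        for (std::size_t g = 0; g < static_cast<std::size_t>(num_groups); ++g) {
            // In NCHW a group's channels are contiguous, so the group spans one flat range.
            const std::size_t base = (b * c + g * group_size) * hw;
            const std::size_t count = group_size * hw;
            double mean = 0.0, var = 0.0;
            for (std::size_t i = 0; i < count; ++i)
                mean += x[base + i];
            mean /= count;
            for (std::size_t i = 0; i < count; ++i)
                var += (x[base + i] - mean) * (x[base + i] - mean);
            var /= count;
            const double inv_std = 1.0 / std::sqrt(var + epsilon);
            for (std::size_t i = 0; i < count; ++i) {
                const std::size_t ch = g * group_size + i / hw;   // channel index for scale/bias
                y[base + i] = static_cast<float>(scale[ch] * (x[base + i] - mean) * inv_std + bias[ch]);
            }
        }
    }
    return y;
}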
+ double epsilon{}; + + std::size_t hash() const override { + size_t seed = primitive::hash(); + seed = hash_combine(seed, num_groups); + return hash_combine(seed, epsilon); + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + const auto& rhs_casted = downcast(rhs); + + return num_groups == rhs_casted.num_groups && epsilon == rhs_casted.epsilon; + } + + void save(BinaryOutputBuffer& ob) const override { + primitive_base::save(ob); + ob << num_groups; + ob << epsilon; + } + + void load(BinaryInputBuffer& ib) override { + primitive_base::load(ib); + ib >> num_groups; + ib >> epsilon; + } +}; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/loop.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/loop.hpp index c27a88e1975f1c..282147cc9e9d3d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/loop.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/loop.hpp @@ -7,8 +7,8 @@ #include #include "primitive.hpp" #include "intel_gpu/graph/topology.hpp" +#include "intel_gpu/graph/program.hpp" -#define DEFAULT_MAX_NUM_ITERATION 256 namespace cldnn { /// @@ -53,18 +53,35 @@ struct loop : public primitive_base { CLDNN_DECLARE_PRIMITIVE(loop) loop() : primitive_base("", {}), - max_iteration(0) {} + max_num_iterations(0) {} struct io_primitive_map { /// @brief Constructs a mapping from external input/output primitive to input/output primitive in body topology - /// + /// or a mapping from output of body topology to input of body topology for the next iteration. + /// @param external_id Primitive id of input of loop or output of body network. + /// @param internal_id Primitive id of input of body network. + /// @param axis Axis to iterate through. Negative value means the axis will not iterate through and start, end, stride arguments will be ignored. + /// @param start Index where the iteration starts from. Applies only when axis >=0. + /// @param end Index where iteration ends. Negative value means counting indexes from the end. Applies only when axis >=0. + /// @param stride Step of iteration. Negative value means backward iteration. Applies only when axis >=0. + io_primitive_map(primitive_id external_id, primitive_id internal_id, + int64_t axis = -1, int64_t start = 0, int64_t end = -1, int64_t stride = 1) : + external_id(external_id, 0), + internal_id(internal_id, 0), + axis(axis), + start(start), + end(end), + stride(stride) {} + + /// @brief Constructs a mapping from external input/output primitive to input/output primitive in body topology + /// or a mapping from output of body topology to input of body topology for the next iteration. /// @param external_id Primitive id of input of loop or output of body network. /// @param internal_id Primitive id of input of body network. /// @param axis Axis to iterate through. Negative value means the axis will not iterate through and start, end, stride arguments will be ignored. /// @param start Index where the iteration starts from. Applies only when axis >=0. /// @param end Index where iteration ends. Negative value means counting indexes from the end. Applies only when axis >=0. /// @param stride Step of iteration. Negative value means backward iteration. Applies only when axis >=0. 
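A usage sketch for the mapping just documented; the primitive ids are hypothetical and the include path assumes the plugin's header layout:

#include "intel_gpu/primitives/loop.hpp"

// Slice the external input "data" along axis 2 into one chunk per iteration:
cldnn::loop::io_primitive_map sliced_input("data", "body_data",
                                           /*axis=*/2, /*start=*/0, /*end=*/-1, /*stride=*/1);
// Pass "init_state" through whole: axis = -1 disables slicing,
// so start/end/stride are ignored.
cldnn::loop::io_primitive_map whole_input("init_state", "body_state");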
- io_primitive_map(primitive_id external_id = "", primitive_id internal_id = "", + io_primitive_map(input_info external_id = input_info(), input_info internal_id = input_info(), int64_t axis = -1, int64_t start = 0, int64_t end = -1, int64_t stride = 1) : external_id(std::move(external_id)), internal_id(std::move(internal_id)), @@ -73,8 +90,8 @@ struct loop : public primitive_base { end(end), stride(stride) {} - primitive_id external_id; - primitive_id internal_id; + input_info external_id; + input_info internal_id; int64_t axis; int64_t start; int64_t end; @@ -125,68 +142,69 @@ struct loop : public primitive_base { /// /// @param id This primitive id. /// @param inputs Input data primitive ids. - /// @param body Topology to be recurrently executed. + /// @param body_program body program to be recurrently executed. /// @param trip_count_id Data primitive id in external topology specifying maximum number of iterations. /// Its data primitive should have 1 integer element. Negative value means infinite /// number of iteration. - /// @param initial_condition_id Data primitive id in external topology specifying initial execution + /// @param first_execution_condition_id Data primitive id in external topology specifying initial execution /// condition. Its data primitive should have 1 integer element. Zero means /// loop will not be executed, otherwise loop will be executed. /// @param num_iteration_id mutable_data primitive id to get the actual number of loop iterations. - /// @param current_iteration_id Optional data primitive id in the body network to specify current iteration. - /// If current_iteration_id is specified but body does not have data whose primitive - /// id is same as current_iteration_id, data primitive will be added in the body network. - /// @param condition_id Optional data primitive id in the body network to specify execution condition + /// @param body_current_iteration_id Optional data primitive id in the body network to specify current iteration. + /// If body_current_iteration_id is specified but body does not have data whose primitive + /// id is same as body_current_iteration_id, data primitive will be added in the body network. + /// @param body_execution_condition_id Optional data primitive id in the body network to specify execution condition /// for the next iteration. Its data primitive should have 1 integer element. Zero means - /// loop will not be executed, otherwise loop will be executed. If condition_id - /// is specified but body does not have data whose primitive id is same as condition_id, + /// loop will not be executed, otherwise loop will be executed. If body_execution_condition_id + /// is specified but body does not have data whose primitive id is same as body_execution_condition_id, /// data primitive will be added in the body network. /// @param primitive_map Rules to map input of loop or output of body topology to input of the body topology /// @param back_edges Output data primitive id. /// @param output_padding Optional padding for output from primitive. 
loop(const primitive_id& id, const std::vector& inputs, - const topology& body, + const program::ptr body_program, const primitive_id& trip_count_id, - const primitive_id& initial_condition_id, + const primitive_id& first_execution_condition_id, const primitive_id& num_iteration_id, const std::vector& input_primitive_maps, const std::vector& output_primitive_maps, const std::vector& back_edges, - int64_t max_iteration = -1, - const primitive_id& current_iteration_id = primitive_id(), - const primitive_id& condition_id = primitive_id(), - const padding& output_padding = padding()) - : primitive_base(id, inputs, {output_padding}), - body(body), + int64_t max_num_iterations = -1, + const primitive_id& body_current_iteration_id = primitive_id(), + const primitive_id& body_execution_condition_id = primitive_id(), + const size_t num_outputs = 1) + : primitive_base(id, inputs, {padding()}, {optional_data_type()}, num_outputs), + body_program(std::move(body_program)), trip_count_id(trip_count_id), - initial_execution_id(initial_condition_id), + first_execution_condition_id(first_execution_condition_id), num_iteration_id(num_iteration_id), - current_iteration_id(current_iteration_id), - condition_id(condition_id), + body_current_iteration_id(body_current_iteration_id), + body_execution_condition_id(body_execution_condition_id), input_primitive_maps(input_primitive_maps), output_primitive_maps(output_primitive_maps), back_edges(back_edges), - max_iteration(max_iteration) - {} + max_num_iterations(max_num_iterations) { + OPENVINO_ASSERT(inputs.front().pid == num_iteration_id, "first input of inputs should be num_iteration_id"); + } - /// @brief Topology to be recurrently executed. - topology body; + /// @brief Body program to be recurrently executed. + program::ptr body_program; /// @brief Data primitive id in external topology specifying maximum number of iterations. primitive_id trip_count_id; /// @brief Data primitive id in external topology specifying initial execution condition. - primitive_id initial_execution_id; + primitive_id first_execution_condition_id; /// @brief mutable_data primitive id to get the actual number of loop iterations. primitive_id num_iteration_id; /// @brief Data primitive id in the body network to store current iteration - primitive_id current_iteration_id; + primitive_id body_current_iteration_id; /// @brief Data primitive id in the body network to store execution condition - primitive_id condition_id; + primitive_id body_execution_condition_id; /// @brief Rules to map input or output data of loop layer onto input or output data of body topology. std::vector input_primitive_maps; @@ -195,7 +213,7 @@ struct loop : public primitive_base { /// @brief Rules to transfer data from body outputs at one iteration to body input at the next iteration. 
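For instance, a typical recurrent-state wire-up, assuming hypothetical body primitive ids and the from/to constructor this mapping struct provides: iteration i's output "hidden_out" feeds iteration i + 1's input "hidden_in".

#include <vector>
#include "intel_gpu/primitives/loop.hpp"

// Hypothetical ids; the from/to pair follows the rule documented above.
std::vector<cldnn::loop::backedge_mapping> back_edges = {
    cldnn::loop::backedge_mapping("hidden_out", "hidden_in")
};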
std::vector back_edges; - int64_t max_iteration; + int32_t max_num_iterations; size_t hash() const override { size_t seed = primitive::hash(); @@ -206,42 +224,43 @@ struct loop : public primitive_base { void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); ob << trip_count_id; - ob << initial_execution_id; + ob << first_execution_condition_id; ob << num_iteration_id; - ob << current_iteration_id; - ob << condition_id; + ob << body_current_iteration_id; + ob << body_execution_condition_id; ob << input_primitive_maps; ob << output_primitive_maps; ob << back_edges; - ob << max_iteration; + ob << max_num_iterations; } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); ib >> trip_count_id; - ib >> initial_execution_id; + ib >> first_execution_condition_id; ib >> num_iteration_id; - ib >> current_iteration_id; - ib >> condition_id; + ib >> body_current_iteration_id; + ib >> body_execution_condition_id; ib >> input_primitive_maps; ib >> output_primitive_maps; ib >> back_edges; - ib >> max_iteration; + ib >> max_num_iterations; } protected: std::vector> get_dependencies() const override { - std::vector> ret{ - std::ref(trip_count_id), std::ref(initial_execution_id), std::ref(num_iteration_id) - }; + std::vector> ret; + ret.push_back(std::ref(num_iteration_id)); + if (!trip_count_id.empty()) ret.push_back(std::ref(trip_count_id)); + if (!first_execution_condition_id.empty()) ret.push_back(std::ref(first_execution_condition_id)); + // add external_id in dependencies if not exist for (const auto& mapping : input_primitive_maps) { auto target = std::find_if(input.begin(), input.end(), - [&](const input_info& info) { - return info.pid == mapping.external_id; - }); + [&](const input_info& info) { + return info.pid == mapping.external_id.pid;}); if (target == input.end()) { - ret.push_back(std::ref(mapping.external_id)); + ret.push_back(std::ref(mapping.external_id.pid)); } } return ret; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/primitive.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/primitive.hpp index bc55ed80e4f362..72c841a7578ab4 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/primitive.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/primitive.hpp @@ -70,8 +70,19 @@ struct input_info { ib >> pid; ib >> idx; } + + std::string to_string() const { + std::stringstream ss; + ss << "input_info(pid:" << pid << ",idx:" << idx << ")"; + return ss.str(); + } }; +static inline std::ostream& operator<< (std::ostream& os, input_info& info) { + os << info.to_string(); + return os; +} + struct prim_map_storage { static prim_map_storage& instance() { static prim_map_storage instance; diff --git a/src/plugins/intel_gpu/src/graph/condition.cpp b/src/plugins/intel_gpu/src/graph/condition.cpp index 842495e0b24e0d..8da80347ea66fd 100644 --- a/src/plugins/intel_gpu/src/graph/condition.cpp +++ b/src/plugins/intel_gpu/src/graph/condition.cpp @@ -215,7 +215,7 @@ std::string condition_inst::to_string(condition_node const& node) { } /* -Condition primitive is resuing memory with the input. +Condition primitive is reusing memory with the input. 
*/ condition_inst::typed_primitive_inst(network& network, condition_node const& node) : parent(network, node), diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index 537fa7412b09f5..9c3eb7c813045d 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -14,6 +14,7 @@ #include "arg_max_min_inst.h" #include "fully_connected_inst.h" #include "condition_inst.h" +#include "loop_inst.h" #include "program_node.h" #include @@ -74,7 +75,7 @@ void compile_graph::run(program& p) { if (node->is_dynamic() && !is_planar) can_select_impl = false; - if (node->is_type() || node->is_type()) + if (node->is_type() || node->is_type() || node->is_type()) can_select_impl = true; if (can_select_impl) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/graph_initializations.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/graph_initializations.cpp index 8a031e45582314..cf9f44a9a59686 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/graph_initializations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/graph_initializations.cpp @@ -402,6 +402,7 @@ void graph_initializations::set_outputs(program& p) { auto custom_outputs = p.get_config().get_property(ov::intel_gpu::custom_outputs); if (!custom_outputs.empty()) { for (auto const& output : custom_outputs) { + OPENVINO_ASSERT(p.has_node(output), "custom output node not found in current cldnn::program: ", output); auto o_node = p.get_node_ptr(output); o_node->set_output(true); p.outputs.push_back(o_node.get()); diff --git a/src/plugins/intel_gpu/src/graph/group_normalization.cpp b/src/plugins/intel_gpu/src/graph/group_normalization.cpp new file mode 100644 index 00000000000000..69b06343362570 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/group_normalization.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "group_normalization_inst.h" +#include "primitive_type_base.h" +#include "json_object.h" + +namespace cldnn { +GPU_DEFINE_PRIMITIVE_TYPE_ID(group_normalization) + +layout group_normalization_inst::calc_output_layout(group_normalization_node const& node, kernel_impl_params const& impl_param) { + assert(static_cast(impl_param.desc->output_data_types[0]) == false && + "Output data type forcing is not supported for group_normalization_node!"); + auto output_layout = impl_param.get_input_layout(); + + if (impl_param.has_fused_primitives()) + output_layout.data_type = impl_param.get_fused_output_layout().data_type; + + return output_layout; +} + +std::string group_normalization_inst::to_string(group_normalization_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + + std::stringstream primitive_description; + + json_composite group_normalization_info; + group_normalization_info.add("num_groups", desc->num_groups); + group_normalization_info.add("epsilon", desc->epsilon); + + node_info->add("group_normalization_info", group_normalization_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +group_normalization_inst::typed_primitive_inst(network& network, group_normalization_node const& node) : parent(network, node) { +} + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp index
81f244051a25aa..b774b72dd506ec 100644 --- a/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp @@ -12,6 +12,77 @@ namespace cldnn { namespace common { + +// read scalar value from data primitive +static int64_t read_scalar_value(memory::ptr mem, stream& stream) { + int64_t trip_count = 0; + const layout& prim_layout = mem->get_layout(); + + switch (prim_layout.data_type) { + case data_types::u8: { + mem_lock lock_prim_output{mem, stream}; + trip_count = *lock_prim_output.data(); + break; + } + case data_types::i8: { + mem_lock lock_prim_output{mem, stream}; + trip_count = *lock_prim_output.data(); + break; + } + case data_types::i32: { + mem_lock lock_prim_output{mem, stream}; + trip_count = *lock_prim_output.data(); + break; + } + case data_types::i64: { + mem_lock lock_prim_output{mem, stream}; + trip_count = *lock_prim_output.data(); + break; + } + default: + OPENVINO_THROW("Invalid data type : ", ov::element::Type(prim_layout.data_type).get_type_name()); + } + return trip_count; +} + +template +static inline void validate_input_value(int64_t input) { + OPENVINO_ASSERT((input >= std::numeric_limits::min() && input <= std::numeric_limits::max()), + "Invalid data value : ", input); +} + +static void write_scalar_value(memory::ptr mem, stream& stream, int64_t input) { + const layout& prim_layout = mem->get_layout(); + + switch (prim_layout.data_type) { + case data_types::u8: { + validate_input_value(input); + mem_lock lock_prim_output{mem, stream}; + lock_prim_output[0] = static_cast(input); + break; + } + case data_types::i8: { + validate_input_value(input); + mem_lock lock_prim_output{mem, stream}; + lock_prim_output[0] = static_cast(input); + break; + } + case data_types::i32: { + validate_input_value(input); + mem_lock lock_prim_output{mem, stream}; + lock_prim_output[0] = static_cast(input); + break; + } + case data_types::i64: { + mem_lock lock_prim_output{mem, stream}; + lock_prim_output[0] = input; + break; + } + default: + OPENVINO_THROW("Invalid data type : ", ov::element::Type(prim_layout.data_type).get_type_name()); + } +} + struct loop_impl : typed_primitive_impl { using parent = typed_primitive_impl; using parent::parent; @@ -27,7 +98,6 @@ struct loop_impl : typed_primitive_impl { loop_impl() : parent() {} loop_impl(const loop_impl& other) : typed_primitive_impl(other), - _max_iteration(other._max_iteration), _back_edges(other._back_edges) {} explicit loop_impl(const loop_node& node) { @@ -37,63 +107,169 @@ struct loop_impl : typed_primitive_impl { void set_node_params(const program_node& arg) override { OPENVINO_ASSERT(arg.is_type()); const auto& node = arg.as(); - _max_iteration = node.get_max_iteration(); _back_edges = node.get_back_edges(); } + void set_memory_in_body_network(cldnn::network::ptr body_network, + const std::shared_ptr& inst, memory::ptr mem) const { + if (inst->is_input()) { + body_network->set_input_data(inst->id(), mem); + } else if (inst->is_output()) { + body_network->set_output_memory(inst->id(), mem); + } else { + inst->set_output_memory(mem, false); + } + } + + std::vector handle_buffers_for_next_iteration(const loop_inst::backedge_memory_mapping& mapping, + network::ptr body_network, int64_t iter, bool is_dynamic) const { + std::vector event_vec; + OPENVINO_ASSERT(iter >= 0, "iteration should not be negative : ", iter); + if (mapping.type == loop_inst::backedge_memory_mapping::CONCAT_OUTPUT) { + if (iter == 0) { + set_memory_in_body_network(body_network, mapping.to_primitive,
mapping.initial_mem); + } else if (iter > 0) { + if (is_dynamic) { + auto from_id = mapping.from_primitive->id(); + if (body_network->has_event(from_id)) { + auto ev = body_network->get_primitive_event(from_id); + if (ev) ev->wait(); + } + // In dynamic model, just copy data from inner body output to inner body input in back_edges. + memory::ptr mem1 = mapping.to_primitive->output_memory_ptr(); + memory::ptr mem2 = mapping.from_primitive->output_memory_ptr(); + auto ev = mem1->copy_from(body_network->get_stream(), *(mem2)); + if (ev) event_vec = {ev}; + } else { + auto mem = mapping.concat_mem_mapping->get_sliced_mems().at(iter - 1); + set_memory_in_body_network(body_network, mapping.to_primitive, mem); + } + } + } else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE_SHARED) { + if (iter == 0) { + if (mapping.from_mem != nullptr) { + auto ev = mapping.from_mem->copy_from(body_network->get_stream(), *(mapping.initial_mem)); + if (ev) event_vec = {ev}; + } + } else { + // In dynamic model, output memory is not defined before execution. + // After body network execution, replace input memory from initial_mem(external input memory) to output memory. + if (mapping.from_mem == nullptr) { + mapping.from_mem = mapping.from_primitive->output_memory_ptr(); + OPENVINO_ASSERT(mapping.from_mem != nullptr, "from_mem should not be null"); + set_memory_in_body_network(body_network, mapping.to_primitive, mapping.from_mem); + } + } + } else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE) { + memory::ptr mem1 = mapping.to_primitive->output_memory_ptr(); + if (iter == 0) { + auto ev = mem1->copy_from(body_network->get_stream(), *(mapping.initial_mem)); + if (ev) event_vec = {ev}; + } else { + if (is_dynamic) { + // In dynamic model, do not set memory buffer between input and output in inner body network. + // Just copy data from input buffer memory to output buffer memory. 
+ auto from_id = mapping.from_primitive->id(); + if (body_network->has_event(from_id)) { + auto ev = body_network->get_primitive_event(from_id); + if (ev) ev->wait(); + } + memory::ptr mem2 = mapping.from_primitive->output_memory_ptr(); + auto ev = mem1->copy_from(body_network->get_stream(), *(mem2)); + if (ev) event_vec = {ev}; + } else { + // In static model, swap memory buffer between output and input in inner body network + memory::ptr mem2 = mapping.from_primitive->output_memory_ptr(); + set_memory_in_body_network(body_network, mapping.to_primitive, std::move(mem2)); + set_memory_in_body_network(body_network, mapping.from_primitive, std::move(mem1)); + } + } + } + return event_vec; + } + event::ptr execute_impl(const std::vector& events, loop_inst& instance) override { - const auto& primitive = instance.get_typed_desc(); + const auto& impl_params = instance.get_impl_params(); + const auto& primitive = impl_params->typed_desc(); auto& outer_network = instance.get_network(); auto& stream = outer_network.get_stream(); + const auto max_num_iterations = primitive->max_num_iterations; auto body_network = instance.get_body_network(); + int64_t current_iteration_idx = 0; auto ev = stream.create_user_event(false); - if (!instance.preproc_memories_done) { - instance.preprocess_output_memory(); - instance.preprocess_input_memory(); - instance.preprocess_backedge_memory(); - - // set input data for current_iteration primitive if current_iteration is used - if (!primitive->current_iteration_id.empty()) { - auto current_iteration_prim = body_network->get_primitive(primitive->current_iteration_id); - auto input_layout_prim = std::dynamic_pointer_cast(current_iteration_prim); - if (input_layout_prim == nullptr) { - CLDNN_ERROR_MESSAGE(instance.id(), "current_iteration primitive is not input_layout"); - } else { - const auto& backedge_mapping = instance.get_current_iteration_backedge_mapping(); - input_layout_prim->set_data(backedge_mapping.initial_mem); - } - } - instance.preproc_memories_done = true; - } + OPENVINO_ASSERT(!primitive->num_iteration_id.empty(), "loop operation should have num_iteration_id"); + ////////////////////////////////////////// + // memory pointers for outer network + ////////////////////////////////////////// // read trip_count from outer network - bool update_num_iterations = false; - memory::ptr trip_count_mem = outer_network.get_primitive(primitive->trip_count_id)->output_memory_ptr(); - int64_t trip_count = loop_node::read_scalar_value(std::move(trip_count_mem), stream); - if (trip_count < 0) { - trip_count = _max_iteration; - update_num_iterations = true; + int64_t trip_count = -1; + if (!primitive->trip_count_id.empty()) { + memory::ptr trip_count_mem = outer_network.get_primitive(primitive->trip_count_id)->output_memory_ptr(); + trip_count = read_scalar_value(std::move(trip_count_mem), stream); + } else { + trip_count = max_num_iterations; } // read initial execution condition from outer network - memory::ptr initial_execution_mem = outer_network.get_primitive(primitive->initial_execution_id)->output_memory_ptr(); - int64_t execution_condition = loop_node::read_scalar_value(initial_execution_mem, stream); + int64_t execution_condition = 1; + if (!primitive->first_execution_condition_id.empty()) { + memory::ptr first_execution_condition_mem = outer_network.get_primitive(primitive->first_execution_condition_id)->output_memory_ptr(); + execution_condition = read_scalar_value(first_execution_condition_mem, stream); + } + + // When execution_condition is false or trip_count 
is zero, return execute_impl without any body_network execution. + if (!execution_condition || trip_count == 0) { + // Update num_iterations (actual number of iterations) + memory::ptr num_actual_iterations_mem = outer_network.get_primitive(primitive->num_iteration_id)->output_memory_ptr(); + write_scalar_value(num_actual_iterations_mem, stream, current_iteration_idx); + + instance.update_output_layout(); + ev->set(); + return ev; + } + ////////////////////////////////////////// + // memory pointers for body network + ////////////////////////////////////////// // shortcut of execution_condition memory in body network - memory::ptr execution_condition_mem = nullptr; - if (!primitive->condition_id.empty()) { - execution_condition_mem = body_network->get_primitive(primitive->condition_id)->output_memory_ptr(); + memory::ptr body_execution_condition_mem = nullptr; + if (!primitive->body_execution_condition_id.empty()) { + body_execution_condition_mem = body_network->get_primitive(primitive->body_execution_condition_id)->output_memory_ptr(); + } + + // shortcut of current_iteration memory in body network + if (!primitive->body_current_iteration_id.empty()) { + memory::ptr body_current_iteration_mem = body_network->get_primitive(primitive->body_current_iteration_id)->output_memory_ptr(); + write_scalar_value(body_current_iteration_mem, body_network->get_stream(), 0); + } + + const auto is_dynamic = instance.is_dynamic(); + if (is_dynamic) { + instance.update_shape(); + if (instance.shape_changed()) { + instance.preproc_memories_done = false; + instance.reset_memory(); + } + } + + if (!instance.preproc_memories_done) { + instance.preprocess_output_memory(trip_count); + instance.preprocess_input_memory(trip_count); + instance.preprocess_backedge_memory(); + instance.preproc_memories_done = true; } const auto& concatenated_input_mem_mappings = instance.concatenated_input_mem_mappings; const auto& concatenated_output_mem_mappings = instance.concatenated_output_mem_mappings; + const auto& backedge_memory_mappings = instance.backedge_memory_mappings; // If there are concatenated_output_mem_mappings or backedge_memory_mappings we need to wait for // previous tasks before accessing memory in get_sliced_mem() and setup_iteration() functions - if (!concatenated_input_mem_mappings.empty() || !instance.backedge_memory_mappings.empty()) { + if (!concatenated_input_mem_mappings.empty() || !backedge_memory_mappings.empty()) { for (auto& e : events) { e->wait(); } @@ -102,37 +278,36 @@ struct loop_impl : typed_primitive_impl { // Set sliced input data for (size_t i = 0; i < concatenated_input_mem_mappings.size(); ++i) { const auto& concatenated_input = concatenated_input_mem_mappings.at(i); - memory::ptr mem = concatenated_input.get_sliced_mem(0); - if (mem) { - body_network->set_input_data(concatenated_input.sliced_data_prim->id(), mem); - } else { - CLDNN_ERROR_MESSAGE(instance.id(), "sliced input memory of loop is not allocated properly"); - } + memory::ptr mem = concatenated_input->get_sliced_mem(0); + OPENVINO_ASSERT(mem != nullptr, instance.id(), "sliced input memory of loop is not allocated properly"); + body_network->set_input_data(concatenated_input->sliced_data_prim->id(), mem); } std::vector all_events; std::vector loop_carried_dep(events.begin(), events.end()); - int64_t current_iteration_idx = 0; - while (current_iteration_idx < trip_count && execution_condition) { + while (((trip_count <= 0) || (current_iteration_idx < trip_count)) && execution_condition) { // Copy & Set sliced input memory for 
(size_t i = 0; i < concatenated_input_mem_mappings.size(); ++i) {
 const auto& concatenated_input = concatenated_input_mem_mappings.at(i);
- memory::ptr mem = concatenated_input.get_sliced_mem(current_iteration_idx);
- if (mem) {
- concatenated_input.sliced_data_prim->set_output_memory(mem);
- } else {
- CLDNN_ERROR_MESSAGE(instance.id(), "sliced input memory of loop is not allocated properly");
- }
+ memory::ptr mem = concatenated_input->get_sliced_mem(current_iteration_idx);
+ OPENVINO_ASSERT(mem != nullptr, instance.id(), " sliced input memory of loop is not allocated properly");
+ concatenated_input->sliced_data_prim->set_output_memory(mem);
 }

- // Set backedges
- for (const auto& backedge_memory_mapping : instance.backedge_memory_mappings) {
- backedge_memory_mapping.setup_iteration(current_iteration_idx);
+ // Set backedges and output memory
+ for (auto& backedge_memory_mapping : backedge_memory_mappings) {
+ auto event_vec = handle_buffers_for_next_iteration(backedge_memory_mapping, body_network, current_iteration_idx, is_dynamic);
+ for (auto ev : event_vec) {
+ loop_carried_dep.push_back(ev);
+ }
 }

- // Set sliced output memory
- for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
- concat_output_mem_mapping.setup_sliced_output_memory(current_iteration_idx);
+ if (!is_dynamic) {
+ // Set sliced output memory for the static shape model only,
+ // because in a dynamic model the body network generates its output memory during execution
+ for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
+ concat_output_mem_mapping->setup_sliced_output_memory(current_iteration_idx);
+ }
 }

 // execute body network
@@ -141,9 +316,10 @@ struct loop_impl : typed_primitive_impl {
 loop_carried_dep.clear();
 for (const auto& backedge : _back_edges) {
 event::ptr body_event;
- if (body_network->has_event(backedge.from))
+ if (body_network->has_event(backedge.from)) {
 body_event = body_network->get_primitive_event(backedge.from);
- loop_carried_dep.emplace_back(body_event);
+ loop_carried_dep.emplace_back(body_event);
+ }
 }

 // Collect output events for waiting for all iterations finishing
@@ -155,42 +331,59 @@ struct loop_impl : typed_primitive_impl {
 }
 }

- //TODO: execution_condition is prepared as they are presented in the
- // ngraph opset document for loop operation.
- // However they are not being used yet and only TensorIterator which
- // has fixed sequence length is being validated.
- if (!primitive->condition_id.empty()) {
- execution_condition = loop_node::read_scalar_value(execution_condition_mem, stream);
+ // Store the output of sliced_data_prim into the sliced_mems vector.
+ // After execution of the body network, sliced_data_prim will have an output memory buffer:
+ // the current buffer is moved into sliced_mems and a new buffer is allocated for sliced_data_prim
+ if (is_dynamic) {
+ for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
+ auto sliced_data_prim = concat_output_mem_mapping->sliced_data_prim;
+ auto output_mem_ptr = sliced_data_prim->output_memory_ptr();
+
+ auto sliced_id = sliced_data_prim->id();
+ if (body_network->has_event(sliced_id)) {
+ auto ev = body_network->get_primitive_event(sliced_id);
+ if (ev) ev->wait();
+ }
+ memory::ptr new_sliced_mem = concat_output_mem_mapping->get_or_create_sliced_mem(current_iteration_idx,
+ output_mem_ptr->get_layout());
+ auto ev = new_sliced_mem->copy_from(body_network->get_stream(), *output_mem_ptr);
+ if (ev) {
+ loop_carried_dep.push_back(ev);
+ all_events.push_back(ev);
+ }
+ }
 }

- // update index & execution condition for the next iteration
- ++current_iteration_idx;
+ // execution condition is the result of body network execution
+ if (body_execution_condition_mem != nullptr) {
+ auto execution_id = primitive->body_execution_condition_id;
+ if (body_network->has_event(execution_id)) {
+ auto ev = body_network->get_primitive_event(execution_id);
+ if (ev) ev->wait();
+ }
+ execution_condition = read_scalar_value(body_execution_condition_mem, body_network->get_stream());
+ }
+ GPU_DEBUG_IF(!execution_condition) {
+ GPU_DEBUG_LOG << "body_exec_condition is false at iteration " << current_iteration_idx << std::endl;
+ }
+
+ current_iteration_idx++;
 }

 // Reset network and wait for all collected events
 body_network->reset_execution(false);
 stream.wait_for_events(all_events);

- // Concatenate sliced output to the outer network
- for (size_t i = 0; i < concatenated_output_mem_mappings.size(); ++i) {
- const auto& concat_output = concatenated_output_mem_mappings.at(i);
- concat_output.restore_concatenated_mem();
- }
+ // Update num_iterations with the actual number of executed iterations
+ memory::ptr num_actual_iterations_mem = outer_network.get_primitive(primitive->num_iteration_id)->output_memory_ptr();
+ write_scalar_value(num_actual_iterations_mem, stream, current_iteration_idx);
+ GPU_DEBUG_LOG << "current_iteration(" << primitive->num_iteration_id << ", "
+ << num_actual_iterations_mem << ") : " << current_iteration_idx << std::endl;

- if (update_num_iterations) {
- // update num_iterations (actual number of iterations)
- int64_t actual_iterations = 0;
- if (!primitive->current_iteration_id.empty()) {
- const auto& backedge_mapping = instance.get_current_iteration_backedge_mapping();
- auto current_iteration_mem = backedge_mapping.from_primitive->output_memory_ptr();
- actual_iterations = loop_node::read_scalar_value(current_iteration_mem, stream);
- } else {
- actual_iterations = current_iteration_idx;
- }
-
- memory::ptr num_actual_iterations_mem = outer_network.get_primitive(primitive->num_iteration_id)->output_memory_ptr();
- loop_node::write_scalar_value(num_actual_iterations_mem, stream, actual_iterations);
- }
+ if (is_dynamic)
+ instance.update_output_layout();
+ instance.postprocess_output_memory(is_dynamic);

 ev->set();
 return ev;
@@ -202,23 +395,25 @@ struct loop_impl : typed_primitive_impl {
 void save(BinaryOutputBuffer& ob) const override {
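+ // Only the back-edge descriptors remain in the serialized impl state;
+ // _max_iteration was dropped because max_num_iterations is now read from
+ // the primitive descriptor in execute_impl().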
parent::save(ob); - ob << _max_iteration; ob << _back_edges; } void load(BinaryInputBuffer& ib) override { parent::load(ib); - ib >> _max_iteration; ib >> _back_edges; } private: - int64_t _max_iteration = 0; std::vector _back_edges; }; namespace detail { attach_loop_common::attach_loop_common() { + implementation_map::add(impl_types::common, + shape_types::dynamic_shape, + loop_impl::create, + {}, + {}); implementation_map::add(impl_types::common, loop_impl::create, {}); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp new file mode 100644 index 00000000000000..5296c1dda7e7aa --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "primitive_base.hpp" +#include "group_normalization_inst.h" +#include "group_normalization/group_normalization_kernel_ref.h" +#include "group_normalization/group_normalization_kernel_selector.h" + +namespace cldnn { +namespace ocl { + +struct group_normalization_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + using kernel_selector_t = kernel_selector::group_normalization_kernel_selector; + using kernel_params_t = std::pair; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::group_normalization_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1))); + params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(2))); + auto optional_params = get_default_optional_params(impl_param.get_program()); + params.num_groups = primitive->num_groups; + params.epsilon = primitive->epsilon; + return {params, optional_params}; + } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param, true); + (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); + } +}; + +namespace detail { + +attach_group_normalization_impl::attach_group_normalization_impl() { + auto types = {data_types::f16, data_types::f32}; + auto formats = { + format::bfyx, + format::byxf, + format::yxfb, + format::bfzyx, + format::b_fs_yx_fsv2, + format::b_fs_zyx_fsv2, + format::b_fs_yx_fsv4, + format::b_fs_zyx_fsv4, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32, + format::b_fs_zyx_fsv16, + format::b_fs_zyx_fsv32, + }; + + implementation_map::add(impl_types::ocl, shape_types::static_shape, + typed_primitive_impl_ocl::create, + types, + formats); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn + +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::group_normalization_impl) +BIND_BINARY_BUFFER_WITH_TYPE(cldnn::group_normalization) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 3a287bdeda4f7d..6b35b9cdfb16ce 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -40,6 +40,7 @@ void register_implementations() { REGISTER_OCL(gemm); REGISTER_OCL(generate_proposals); REGISTER_OCL(grid_sample); + 
REGISTER_OCL(group_normalization); REGISTER_OCL(lrn); REGISTER_OCL(lstm_gemm); REGISTER_OCL(lstm_elt); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index a5fb5a5817e395..45f4018bf90dac 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -35,6 +35,7 @@ #include "intel_gpu/primitives/gemm.hpp" #include "intel_gpu/primitives/grid_sample.hpp" #include "intel_gpu/primitives/grn.hpp" +#include "intel_gpu/primitives/group_normalization.hpp" #include "intel_gpu/primitives/lrn.hpp" #include "intel_gpu/primitives/lstm.hpp" #include "intel_gpu/primitives/lstm_dynamic.hpp" @@ -120,6 +121,7 @@ REGISTER_OCL(gather_elements); REGISTER_OCL(gemm); REGISTER_OCL(generate_proposals); REGISTER_OCL(grid_sample); +REGISTER_OCL(group_normalization); REGISTER_OCL(lrn); REGISTER_OCL(lstm_gemm); REGISTER_OCL(lstm_elt); diff --git a/src/plugins/intel_gpu/src/graph/include/group_normalization_inst.h b/src/plugins/intel_gpu/src/graph/include/group_normalization_inst.h new file mode 100644 index 00000000000000..27fe382146999b --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/group_normalization_inst.h @@ -0,0 +1,39 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "intel_gpu/primitives/group_normalization.hpp" +#include "primitive_inst.h" + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + std::vector get_shape_infer_dependencies() const override { return {}; } +}; +using group_normalization_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + template + static std::vector calc_output_layouts(group_normalization_node const& /*node*/, const kernel_impl_params& impl_param) { + return forward_input0_shape(impl_param); + } + + static layout calc_output_layout(group_normalization_node const& node, kernel_impl_params const& impl_param); + static std::string to_string(group_normalization_node const& node); + + typed_primitive_inst(network& network, group_normalization_node const& desc); +}; + +using group_normalization_inst = typed_primitive_inst; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/loop_inst.h b/src/plugins/intel_gpu/src/graph/include/loop_inst.h index 5d7dd710892181..22f4489ae507b5 100644 --- a/src/plugins/intel_gpu/src/graph/include/loop_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/loop_inst.h @@ -21,163 +21,52 @@ template<> struct typed_program_node : public typed_program_node_base { private: using parent = typed_program_node_base; - mutable topology body; - std::vector input_primitive_maps; - std::vector output_primitive_maps; - mutable std::vector back_edges; - bool use_current_iteration; - bool use_execution_condition; - mutable program::ptr body_program; + std::vector& input_primitive_maps; + std::vector& output_primitive_maps; + std::vector& back_edges; public: - typed_program_node(std::shared_ptr prim, program& prog) : + typed_program_node(std::shared_ptr prim, program& prog) : parent(prim, prog), - body(this->get_primitive()->body), - input_primitive_maps(this->get_primitive()->input_primitive_maps), - output_primitive_maps(this->get_primitive()->output_primitive_maps), - 
back_edges(this->get_primitive()->back_edges), - use_current_iteration(!this->get_primitive()->current_iteration_id.empty()), - use_execution_condition(!this->get_primitive()->condition_id.empty()), - iteration_axis(0), - max_iteration(this->get_primitive()->max_iteration < 0 ? DEFAULT_MAX_NUM_ITERATION : this->get_primitive()->max_iteration) {} - - mutable size_t iteration_axis; - int64_t max_iteration; - - int64_t get_max_iteration() const { return max_iteration; } - program::ptr get_body_program() const { return body_program; } - bool is_current_iteration_used() const { return use_current_iteration; } - bool is_execution_condition_used() const { return use_execution_condition; } - - static size_t convert_to_raw_axis(size_t axis, size_t ndim) { - // convert between bfyx, bfzyx, bfzyxw and tensor.size.raw - if (axis >= ndim) { - throw std::runtime_error("axis should be less than ndim"); - } - - if (axis < 2) { - return axis; - } - return (ndim - 1) - (axis - 2); - } - - // read scala value from data primitive - static int64_t read_scalar_value(memory::ptr mem, stream& stream) { - int64_t trip_count = 0; - const layout& prim_layout = mem->get_layout(); - - switch (prim_layout.data_type) { - case data_types::u8: { - mem_lock lock_prim_output{mem, stream}; - trip_count = *lock_prim_output.data(); - break; - } - case data_types::i8: { - mem_lock lock_prim_output{mem, stream}; - trip_count = *lock_prim_output.data(); - break; - } - case data_types::i32: { - mem_lock lock_prim_output{mem, stream}; - trip_count = *lock_prim_output.data(); - break; - } - case data_types::i64: { - mem_lock lock_prim_output{mem, stream}; - trip_count = *lock_prim_output.data(); - break; - } - default: - throw std::runtime_error("Invalid data type : " + ov::element::Type(prim_layout.data_type).get_type_name()); - } - return trip_count; - } + input_primitive_maps(prim->input_primitive_maps), + output_primitive_maps(prim->output_primitive_maps), + back_edges(prim->back_edges) {} - template - static inline void validate_input_value(int64_t input) { - if (input < std::numeric_limits::min() || input > std::numeric_limits::max()) { - throw std::runtime_error("Invalid data value : " + std::to_string(input)); - } - } + program::ptr get_body_program() const { return get_primitive()->body_program; } - static void write_scalar_value(memory::ptr mem, stream& stream, int64_t input) { - const layout& prim_layout = mem->get_layout(); - - switch (prim_layout.data_type) { - case data_types::u8: { - validate_input_value(input); - mem_lock lock_prim_output{mem, stream}; - lock_prim_output[0] = static_cast(input); - break; - } - case data_types::i8: { - validate_input_value(input); - mem_lock lock_prim_output{mem, stream}; - lock_prim_output[0] = static_cast(input); - break; - } - case data_types::i32: { - validate_input_value(input); - mem_lock lock_prim_output{mem, stream}; - lock_prim_output[0] = static_cast(input); - break; - } - case data_types::i64: { - mem_lock lock_prim_output{mem, stream}; - lock_prim_output[0] = input; - break; - } - default: - throw std::runtime_error("Invalid data type : " + ov::element::Type(prim_layout.data_type).get_type_name()); - } - } - - layout calc_body_input_layout(const loop::io_primitive_map& inputDesc) const { - const auto& dependency_list = this->get_dependencies(); - auto input = std::find_if(dependency_list.begin(), dependency_list.end(), [&inputDesc](const std::pair& dep){ - return dep.first->id() == inputDesc.external_id; - }); - if (input == dependency_list.end()) { - throw 
std::runtime_error("Can't find input from dependency_list"); - } - layout calculated_layout = (*input).first->get_output_layout(); - auto shape = calculated_layout.get_tensor().sizes(calculated_layout.format); - - if (inputDesc.axis >= 0) { - iteration_axis = convert_to_raw_axis(static_cast(inputDesc.axis), shape.size()); - auto calculated_size = calculated_layout.get_tensor(); - calculated_size.raw[iteration_axis] = 1; // cropped inputs shape - calculated_layout.set_tensor(calculated_size); - } - - return calculated_layout; - } + const primitive_id& get_trip_count_id() const { return get_primitive()->trip_count_id; } + const primitive_id& get_initial_execution_id() const { return get_primitive()->first_execution_condition_id; } + const primitive_id& get_current_iteration_id() const { return get_primitive()->body_current_iteration_id; } + const primitive_id& get_execution_condition_id() const { return get_primitive()->body_execution_condition_id; } + const primitive_id& get_num_iterations_id() const { return get_primitive()->num_iteration_id; } + const int32_t get_max_num_iteration() const { return get_primitive()->max_num_iterations; } const std::vector& get_input_primitive_maps() const { return input_primitive_maps; } const std::vector& get_output_primitive_maps() const { return output_primitive_maps; } + const std::vector& get_back_edges() const { return back_edges;} void update_primitive_map(const primitive_id& prevID, const primitive_id& newID, bool external_id = true) { if (external_id) { for (auto& pm : input_primitive_maps) { - if (pm.external_id == prevID) { - pm.external_id = newID; + if (pm.external_id.pid == prevID) { + pm.external_id.pid = newID; } } for (auto& pm : output_primitive_maps) { - if (pm.external_id == prevID) { - pm.external_id = newID; + if (pm.external_id.pid == prevID) { + pm.external_id.pid = newID; } } } else { for (auto& pm : input_primitive_maps) { - if (pm.internal_id == prevID) { - pm.internal_id = newID; + if (pm.internal_id.pid == prevID) { + pm.internal_id.pid = newID; } } for (auto& pm : output_primitive_maps) { - if (pm.internal_id == prevID) { - pm.internal_id = newID; + if (pm.internal_id.pid == prevID) { + pm.internal_id.pid = newID; } } for (auto& back_edge : back_edges) { @@ -191,157 +80,266 @@ struct typed_program_node : public typed_program_node_base { } } - const std::vector& get_back_edges() const { return back_edges;} - - static bool is_integer(const data_types& data_type) { - switch (data_type) { - case data_types::u8: - case data_types::i8: - case data_types::i32: - case data_types::i64: - return true; - default: - return false; - } + // current_iteration is necessary to calculate output layout in dynamic shape + std::vector get_shape_infer_dependencies() const override { return {0}; } + + using parent::get_kernel_impl_params; + std::unique_ptr get_kernel_impl_params(const std::vector& in_layouts, const std::vector& out_layouts) const override { + auto params = parent::get_kernel_impl_params(in_layouts, out_layouts); + params->inner_progs = { get_primitive()->body_program }; + // Set memory_deps using custom get_memory_deps to add current_iteration(mutable_data) into memory_deps + params->memory_deps = get_memory_deps(); + return params; } - void process_current_iteration() const { - const primitive_id& current_iteration_id = get_current_iteration_id(); - if (current_iteration_id.empty()) { - return; - } +private: + std::map get_memory_deps() const; +}; - const topology_map& body_topology_map = body.get_primitives(); - const layout 
body_input_layout(data_types::i64, format::bfyx, {1, 1, 1, 1}); +using loop_node = typed_program_node; - // add current_iteration primitive if current_iteration primitive is not exist in body - if (body_topology_map.find(current_iteration_id) == body_topology_map.end()) { - body.add_primitive(std::make_shared(current_iteration_id, body_input_layout)); - } else { - const auto& body_input_prim = body.at(current_iteration_id); - const auto input_layout_prim = std::dynamic_pointer_cast(body_input_prim); - OPENVINO_ASSERT(input_layout_prim, "[GPU] current_iteration primitive should be cldnn::input_layout in node", this->id()); - input_layout_prim->change_layout(body_input_layout); +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + using parent::parent; + +public: + struct concatenated_memory_mapping { + using ptr = std::shared_ptr; + using cptr = std::shared_ptr; + concatenated_memory_mapping(int64_t axis, + memory::ptr concatenated_mem, + std::vector sliced_mems, // To change shared ptr vector + stream& stream, + engine& engine, + int64_t iteration_elements = 0, + int64_t stride = 0, + int64_t initial_offset = 0) : + axis(axis), + concatenated_mem(concatenated_mem), + sliced_mems(sliced_mems), + stream(stream), + engine(engine), + iteration_elements(iteration_elements), + stride(stride), + initial_offset(initial_offset) { + calculate_concatenated_mem(); + } + + concatenated_memory_mapping(const concatenated_memory_mapping& o) : + axis(o.axis), + concat_data_prim(o.concat_data_prim), + sliced_data_prim(o.sliced_data_prim), + + concatenated_mem(o.concatenated_mem), + sliced_mems(o.sliced_mems), + stream(o.stream), + engine(o.engine), + iteration_elements(o.iteration_elements), + stride(o.stride), + initial_offset(o.initial_offset), + + bytes_per_element(o.bytes_per_element), + batch_size(o.batch_size), + bytes_batch_stride(o.bytes_batch_stride), + bytes_iteration(o.bytes_iteration), + bytes_iteration_stride(o.bytes_iteration_stride), + bytes_iteration_initial_offset(o.bytes_iteration_initial_offset) {} + + + static int64_t get_batch_size(layout mem_layout, int64_t axis) { + if (axis < 0) { + throw std::runtime_error("axis should be positive integer or zero"); + } + + if (mem_layout.is_dynamic()) { + return -1; + } + + int64_t batch_size = 1; + for (int64_t i = 0; i < axis; ++i) { + batch_size *= mem_layout.get_tensor().raw[i]; + } + for (int64_t i = axis-1; i >= 2; --i) { + batch_size *= mem_layout.get_tensor().raw[i]; + } + return batch_size; } - // add incremental data: 1 - // it is used to update current_iteration in body network - const primitive_id increment_value_id = current_iteration_id + "_inc"; - auto mem = get_program().get_engine().allocate_memory(body_input_layout); - auto& stream = get_program().get_stream(); - write_scalar_value(mem, stream, 1); - body.add_primitive(std::make_shared(increment_value_id, mem)); - - // add eltwise sum updating current_iteration with incremental data - const primitive_id updated_currnet_iteration_id = current_iteration_id + "_update"; - body.add_primitive(std::make_shared(updated_currnet_iteration_id, - current_iteration_id, increment_value_id, eltwise_mode::sum)); - - // set backedge - back_edges.emplace_back(updated_currnet_iteration_id, current_iteration_id); - } + void calculate_concatenated_mem() const { + if (!sliced_mems.empty() && concatenated_mem != nullptr) { + auto& sliced_layout = sliced_mems.front()->get_layout(); + const int64_t num_elements_batch = 
get_batch_size(sliced_layout, axis); + iteration_elements = sliced_layout.count() / num_elements_batch; + bytes_per_element = data_type_traits::size_of(concatenated_mem->get_layout().data_type); + batch_size = get_batch_size(concatenated_mem->get_layout(), axis); + bytes_batch_stride = (static_cast(concatenated_mem->get_layout().count()) / batch_size) * bytes_per_element; + bytes_iteration = iteration_elements * bytes_per_element; + bytes_iteration_stride = stride * bytes_iteration; + bytes_iteration_initial_offset = initial_offset * bytes_iteration; + } + } - void process_single_int_output(const primitive_id& id) const { - // add mutable if not exist - const topology_map& body_topology_map = body.get_primitives(); - layout body_output_layout(data_types::i64, format::bfyx, {1, 1, 1, 1}); - if (!id.empty()) { - auto body_output = body_topology_map.find(id); - if (body_output == body_topology_map.end()) { - auto mem = get_program().get_engine().allocate_memory(body_output_layout); - auto md = std::make_shared(id, mem); - body.add_primitive(md); + void update_concatenated_mem(memory::ptr mem) { + if (concatenated_mem != nullptr && concatenated_mem->get_layout() == mem->get_layout()) { + concatenated_mem = mem; } else { - auto body_output_prim = body.at(body_output->first); - auto mem = get_program().get_engine().allocate_memory(body_output_layout); - body_output_prim.reset(new mutable_data(body_output->first, std::move(mem))); + concatenated_mem = mem; + calculate_concatenated_mem(); } } - } - void build_body_program() const { - for (const auto& pm : input_primitive_maps) { - layout calculated_layout = calc_body_input_layout(pm); - const primitive_id& internal_input_id = pm.internal_id; + void restore_concatenated_mem() const { + OPENVINO_ASSERT(concatenated_mem != nullptr, "concatenated_mem should not be nullptr"); + mem_lock concat_mem_lock{ concatenated_mem, stream }; + int64_t iteration_offset = bytes_iteration_initial_offset; + for (const auto& sliced_mem : sliced_mems) { + // To support multi-batch, just repeat memcpy for each batch + for (int64_t batch = 0; batch < batch_size; ++batch) { + const int64_t src_offset = batch * bytes_iteration; + const int64_t dst_offset = batch * bytes_batch_stride + iteration_offset; + mem_lock sliced_mem_lock{ sliced_mem, stream }; + uint8_t* src = sliced_mem_lock.data() + src_offset; + uint8_t* dst = concat_mem_lock.data() + dst_offset; + std::copy(src, src + bytes_iteration, dst); + } + iteration_offset += bytes_iteration_stride; + } + } - // add inputs for body network if not exist - if (body.get_primitives().count(internal_input_id) == 0) { - body.add_primitive(std::make_shared(internal_input_id, calculated_layout)); - } else { - body.change_input_layout(internal_input_id, calculated_layout); + // Get sliced mem for the iteration idx and copy data from external input to sliced mem + // In the case of dynamic model, concatenated_mem is always non nullptr. 
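+ // The copy below is a plain per-batch memcpy: for each batch, the source
+ // offset advances by bytes_batch_stride within the concatenated buffer,
+ // while the destination offset advances by bytes_iteration within the
+ // per-iteration buffer.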
+ memory::ptr get_sliced_mem(int64_t iteration) const { + OPENVINO_ASSERT(!sliced_mems.empty(), "For input data, sliced_mems should not be empty"); + mem_lock from_lock{ concatenated_mem, stream }; + int64_t batch_offset = 0; + auto sliced_mem = get_or_create_sliced_mem(iteration, sliced_mems.front()->get_layout()); + const int64_t iteration_offset = bytes_iteration_initial_offset + + bytes_iteration_stride * iteration; + // To support multi-batch, just repeat memcpy for each batch + for (int64_t batch = 0; batch < batch_size; ++batch) { + const int64_t src_offset = batch_offset + iteration_offset; + const int64_t dst_offset = batch * bytes_iteration; + mem_lock to_lock{ sliced_mem, stream }; + const auto src = from_lock.begin() + src_offset; + const auto dst = to_lock.begin() + dst_offset; + std::copy(src, src + bytes_iteration, dst); + batch_offset += bytes_batch_stride; } + return sliced_mem; } - // setup internal output - OPENVINO_ASSERT(!output_primitive_maps.empty(), "[GPU] Output primitive map should have at least 1 mapping in primitive ", this->id()); - std::set output_names; - output_names.insert(output_primitive_maps.front().internal_id); - - // add current_iteration_id in body network, condition_id if exist - process_current_iteration(); - process_single_int_output(get_condition_id()); - - // setup outputs for backedges - for (auto& back_edge : back_edges) { - // check whether the back_edge.to has its corresponding io_primitive_map - const auto& input_map = std::find_if(input_primitive_maps.begin(), input_primitive_maps.end(), - [&](const loop::io_primitive_map& pm) { - return pm.internal_id == back_edge.to; - }); - - // backedge which is current_iteration does not have - // input primitive map because its initial value is always - // zero and the value will be set in execute_impl() - if (back_edge.to != get_current_iteration_id() && input_map == input_primitive_maps.end()) { - std::string msg = "[GPU] No primitive mapping for backedge (internal_id: " + back_edge.to + ") for primitive " + this->id(); - OPENVINO_ASSERT(false, msg.c_str()); + memory::ptr get_or_create_sliced_mem(int64_t idx, const layout& mem_layout) const { + bool recalc_data = !sliced_mems.empty(); + while (sliced_mems.size() <= static_cast(idx)) { + memory::ptr sliced_mem = engine.allocate_memory(mem_layout, 0); + sliced_mems.push_back(sliced_mem); } + if (recalc_data) { + calculate_concatenated_mem(); + } + return sliced_mems.at(idx); + } - output_names.insert(back_edge.from); + void setup_sliced_output_memory(uint64_t iteration) const { + if (sliced_data_prim) { + OPENVINO_ASSERT(iteration < sliced_mems.size(), "invalid index"); + const auto& sliced_output_mem = sliced_mems.at(iteration); + sliced_data_prim->set_output_memory(sliced_output_mem); + } } - // if execution_condition_id is specified, we need to add the id in build_option::outputs - if (!get_condition_id().empty()) { - output_names.insert(get_condition_id()); + std::vector& get_sliced_mems() const { return sliced_mems; } + + void reset_data_for_shape_changed() { + bytes_per_element = 0; + batch_size = 0; + bytes_batch_stride = 0; + bytes_iteration = 0; + bytes_iteration_stride = 0; + bytes_iteration_initial_offset = 0; + if (concatenated_mem) concatenated_mem = nullptr; + iteration_elements = 0; + sliced_mems.clear(); } - std::vector output_names_vec(output_names.begin(), output_names.end()); - auto config = get_program().get_config(); - config.set_property(ov::intel_gpu::custom_outputs(output_names_vec)); - body_program = 
program::build_program(get_program().get_engine(), body, config, get_program().get_task_executor(), false, false, true); - } + std::string to_string() const { + std::stringstream ss; + ss << "concatenated_memory_mapping [" << std::endl; + ss << "* axis : " << axis << std::endl; + ss << "* bytes_per_element : " << bytes_per_element << std::endl; + ss << "* batch_size : " << batch_size << std::endl; + if (concatenated_mem != nullptr && concatenated_mem->get_layout().is_static()) { + ss << "* bytes_batch_stride : " << bytes_batch_stride << " = (static_cast(" + << concatenated_mem->get_layout().count() << ") / batch_size:" << batch_size << ") * bytes_per_element:" << bytes_per_element << std::endl; + } else { + ss << "* bytes_batch_stride : " << bytes_batch_stride << std::endl; + } + ss << "* bytes_iteration : " << bytes_iteration << " = (iteration_elements:" + << iteration_elements << " * bytes_per_element:" << bytes_per_element << ")" << std::endl; + ss << "* bytes_iteration_stride : " << bytes_iteration_stride << std::endl; + ss << "* bytes_iteration_initial_offset : " << bytes_iteration_initial_offset << std::endl; + ss << "* concat_data_prim : " << ((concat_data_prim != nullptr)? concat_data_prim->id() : "nullptr") << std::endl; + ss << "* sliced_data_prim : " << ((sliced_data_prim != nullptr)? sliced_data_prim->id() : "nullptr") << std::endl; + if (concatenated_mem) { + ss << "* concatenated_mem : " << concatenated_mem->get_layout().to_short_string() << std::endl; + } else { + ss << "* concatenated_mem : nullptr" << std::endl; + } + ss << "* iteration_elements : " << iteration_elements << std::endl; + ss << "* stride : " << stride << std::endl; + ss << "* initial_offset : " << initial_offset << std::endl; + ss << "* sliced_mems :{ "; + for (auto mem : sliced_mems) { + ss << mem->get_layout().to_short_string() << ","; + } + ss << "}]" << std::endl; + return ss.str(); + } - const primitive_id& get_trip_count_id() const { return get_primitive()->trip_count_id; } - const primitive_id& get_initial_execution_id() const { return get_primitive()->initial_execution_id; } - const primitive_id& get_current_iteration_id() const { return get_primitive()->current_iteration_id; } - const primitive_id& get_condition_id() const { return get_primitive()->condition_id; } - const primitive_id& get_num_iteration_id() const { return get_primitive()->num_iteration_id; } - const topology& get_body_topology() const { return get_primitive()->body; } -}; + const int64_t axis; + std::shared_ptr concat_data_prim; + std::shared_ptr sliced_data_prim; -using loop_node = typed_program_node; +private: + mutable memory::ptr concatenated_mem; + mutable std::vector sliced_mems; + cldnn::stream& stream; + cldnn::engine& engine; + mutable int64_t iteration_elements = 0; + const int64_t stride = 0; + const int64_t initial_offset = 0; -template <> -class typed_primitive_inst : public typed_primitive_inst_base { - using parent = typed_primitive_inst_base; - using parent::parent; + // element size + mutable int64_t bytes_per_element; + // number of higher level of dimension of slicing axis + mutable int64_t batch_size; + // stride of batch in concatenated memory + mutable int64_t bytes_batch_stride; + // byte size of each iteration per batch in a sliced memory + mutable int64_t bytes_iteration; + // byte size of each iteration (bytes_iteration * batch_size) in a sliced memory + mutable int64_t bytes_iteration_stride; + // byte offset of 1st iteration in a batch in a sliced memory + mutable int64_t bytes_iteration_initial_offset; 
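+
+ // A worked example of the byte-offset fields above (hypothetical values):
+ // for f32 data (bytes_per_element = 4) with iteration_elements = 100,
+ // stride = 1 and initial_offset = 0, calculate_concatenated_mem() yields
+ // bytes_iteration = 100 * 4 = 400, bytes_iteration_stride = 1 * 400 = 400
+ // and bytes_iteration_initial_offset = 0 * 400 = 0, i.e. consecutive
+ // iterations are packed back to back from the start of each batch.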
+ }; -public: struct backedge_memory_mapping { enum backedge_type { // output memory(from_primitive) of body network needs to be concatenated CONCAT_OUTPUT, - // output memory(from_primitive) of body network does not need to be concateneated + // output memory(from_primitive) of body network does not need to be concatenated // input memory is shared by output memory SINGLE_SHARED, - // output memory(from_primitive) of body network does not need to be concateneated - // input memory is not shared by output memroy + // output memory(from_primitive) of body network does not need to be concatenated + // input memory is not shared by output memory // each iteration input memory and output memory are swapped SINGLE, }; std::shared_ptr from_primitive; std::shared_ptr to_primitive; - std::vector from_mems; + std::shared_ptr concat_mem_mapping; + mutable memory::ptr from_mem; memory::ptr initial_mem; cldnn::stream& stream; backedge_type type; @@ -349,10 +347,11 @@ class typed_primitive_inst : public typed_primitive_inst_base { backedge_memory_mapping( std::shared_ptr _from_primitive, std::shared_ptr _to_primitive, - std::vector _from_mems, memory::ptr _initial_mem, cldnn::stream& _stream, backedge_type _type = CONCAT_OUTPUT): + std::shared_ptr _concat_mem_mapping, memory::ptr _initial_mem, + cldnn::stream& _stream, backedge_type _type = CONCAT_OUTPUT): from_primitive(_from_primitive), to_primitive(std::move(_to_primitive)), - from_mems(_from_mems), + concat_mem_mapping(std::move(_concat_mem_mapping)), initial_mem(std::move(_initial_mem)), stream(_stream), type(_type), @@ -365,7 +364,7 @@ class typed_primitive_inst : public typed_primitive_inst_base { memory::ptr _from_mem, memory::ptr _initial_mem, cldnn::stream& _stream, backedge_type _type = SINGLE_SHARED): from_primitive(_from_primitive), to_primitive(std::move(_to_primitive)), - from_mems{std::move(_from_mem)}, + from_mem{std::move(_from_mem)}, initial_mem(std::move(_initial_mem)), stream(_stream), type(_type), @@ -385,161 +384,67 @@ class typed_primitive_inst : public typed_primitive_inst_base { validate_backedge_memory(); } - void setup_iteration(int64_t iter) const { - if (type == CONCAT_OUTPUT) { - if (iter == 0) { - to_primitive->set_output_memory(initial_mem); - } else if (iter > 0) { - to_primitive->set_output_memory(from_mems.at(iter - 1)); - } else { - throw std::runtime_error("Invalid iteraton count" + std::to_string(iter)); - } - } else if (type == SINGLE_SHARED && iter == 0) { - from_mems.front()->copy_from(stream, *initial_mem); - } else if (type == SINGLE) { - memory::ptr mem1 = to_primitive->output_memory_ptr(); - if (iter == 0) { - mem1->copy_from(stream, *initial_mem); - } else { - memory::ptr mem2 = from_primitive->output_memory_ptr(); - to_primitive->set_output_memory(std::move(mem2)); - from_primitive->set_output_memory(mem1); - } - } - } - private: void validate_backedge_memory() { - for (const auto& from_mem : from_mems) { + if (from_mem) { const size_t from_mem_bytes = from_mem->get_layout().bytes_count(); - if (from_mem_bytes != total_bytes) { - throw std::runtime_error("Invalid backedge memory layout: " - "size not matched with that of initial_mem"); - } + OPENVINO_ASSERT((from_mem_bytes == total_bytes), "Invalid backedge memory layout: size(", + from_mem_bytes, ",", from_mem->get_layout().to_short_string(), + ") not matched with that of initial_mem(", total_bytes, + ",", initial_mem->get_layout().to_short_string(), ")"); } - } - }; - - struct concatenated_memory_mapping { - concatenated_memory_mapping(int64_t axis, - 
memory::ptr concatenated_mem, - std::vector sliced_mems, - stream& stream, - int64_t iteration_elements = 0, - int64_t stride = 0, - int64_t initial_offset = 0) : - axis(axis), - concatenated_mem(concatenated_mem), - sliced_mems(sliced_mems), - stream(stream), - bytes_per_element(data_type_traits::size_of(concatenated_mem->get_layout().data_type)), - batch_size(get_batch_size(concatenated_mem->get_layout(), axis)), - bytes_batch_stride((static_cast(concatenated_mem->get_layout().count()) / batch_size) * bytes_per_element), - bytes_iteration(iteration_elements * bytes_per_element), - bytes_iteration_stride(stride * bytes_iteration), - bytes_iteration_initial_offset(initial_offset * bytes_iteration) {} - - static int64_t get_batch_size(layout mem_layout, int64_t axis) { - if (axis < 0) { - throw std::runtime_error("axis should be positive integer or zero"); - } - - int64_t batch_size = 1; - for (int64_t i = 0; i < axis; ++i) { - batch_size *= mem_layout.get_tensor().raw[i]; - } - for (int64_t i = axis-1; i >= 2; --i) { - batch_size *= mem_layout.get_tensor().raw[i]; - } - return batch_size; - } - - void restore_concatenated_mem() const { - mem_lock concat_mem_lock{ concatenated_mem, stream }; - int64_t iteration_offset = bytes_iteration_initial_offset; - for (const auto& sliced_mem : sliced_mems) { - for (int64_t batch = 0; batch < batch_size; ++batch) { - const int64_t src_offset = batch * bytes_iteration; - const int64_t dst_offset = batch * bytes_batch_stride + iteration_offset; - mem_lock sliced_mem_lock{ sliced_mem, stream }; - uint8_t* src = sliced_mem_lock.data() + src_offset; - uint8_t* dst = concat_mem_lock.data() + dst_offset; - std::copy(src, src + bytes_iteration, dst); + if (concat_mem_mapping) { + for (const auto& from_mem : concat_mem_mapping->get_sliced_mems()) { + const size_t from_mem_bytes = from_mem->get_layout().bytes_count(); + OPENVINO_ASSERT((from_mem_bytes == total_bytes), "Invalid backedge memory layout: size(", + from_mem_bytes, ",", from_mem->get_layout().to_short_string(), + ") not matched with that of initial_mem(", total_bytes, + ",", initial_mem->get_layout().to_short_string(), ")"); } - iteration_offset += bytes_iteration_stride; } } - - void setup_sliced_output_memory(uint64_t iteration) const { - const auto& sliced_output_mem = sliced_mems.at(iteration); - sliced_data_prim->set_output_memory(sliced_output_mem); - } - - memory::ptr get_sliced_mem(int64_t iteration) const { - mem_lock from_lock{ concatenated_mem, stream }; - int64_t batch_offset = 0; - const int64_t iteration_offset = bytes_iteration_initial_offset + - bytes_iteration_stride * iteration; - for (int64_t batch = 0; batch < batch_size; ++batch) { - const int64_t src_offset = batch_offset + iteration_offset; - const int64_t dst_offset = batch * bytes_iteration; - mem_lock to_lock{ sliced_mems.at(iteration), stream }; - const auto src = from_lock.begin() + src_offset; - const auto dst = to_lock.begin() + dst_offset; - std::copy(src, src + bytes_iteration, dst); - batch_offset += bytes_batch_stride; - } - return sliced_mems.at(iteration); - } - - const int64_t axis; - std::shared_ptr concat_data_prim; - std::shared_ptr sliced_data_prim; - memory::ptr concatenated_mem; - std::vector sliced_mems; - cldnn::stream& stream; - // element size - const int64_t bytes_per_element; - // number of higher level of dimension of slicing axis - const int64_t batch_size; - // stride of batch in concatanated memory - const int64_t bytes_batch_stride; - // byte size of each iteration per batch in a sliced memory - 
const int64_t bytes_iteration; - // byte size of each iteration (bytes_iteration * batch_size) in a sliced memory - const int64_t bytes_iteration_stride; - // byte offset of 1st iteration in a batch in a sliced memory - const int64_t bytes_iteration_initial_offset; }; - static layout calc_output_layout(const loop_node& node, kernel_impl_params const& impl_param); + template + static std::vector calc_output_layouts(loop_node const& /*node*/, kernel_impl_params const& impl_param); + static layout calc_output_layout(const loop_node& /*node*/, kernel_impl_params const& impl_param); bool preproc_memories_done = false; std::vector backedge_memory_mappings; - std::vector concatenated_input_mem_mappings; - std::vector concatenated_output_mem_mappings; + std::vector concatenated_input_mem_mappings; + std::vector concatenated_output_mem_mappings; static std::string to_string(const loop_node& node); - size_t current_iteratoin_backedge_mapping_idx = 0; public: typed_primitive_inst(network& network, const loop_node& node); network::ptr get_body_network() const { return body_network; } - void preprocess_input_memory(); - void preprocess_output_memory(); + void preprocess_input_memory(const int64_t trip_count); + void preprocess_output_memory(const int64_t trip_count); void preprocess_backedge_memory(); void update_mapped_memory(); + void update_input_mapped_memory(); + void update_output_mapped_memory(); + void update_backedge_mapped_memory(); + void postprocess_output_memory(bool is_dynamic); + concatenated_memory_mapping::ptr create_concat_memory_map(const input_info& id, + const cldnn::loop::io_primitive_map& io_prim_map, + memory::ptr mem_ptr, + const int64_t trip_count); event::ptr set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0) override; - const backedge_memory_mapping& get_current_iteration_backedge_mapping() const { - OPENVINO_ASSERT(node->is_current_iteration_used(), "[GPU] No backedge mapping for current_iteration for primitive ", node->id()); - return backedge_memory_mappings.at(current_iteratoin_backedge_mapping_idx); - } + void reset_memory(); + void save(BinaryOutputBuffer& ob) const override; void load(BinaryInputBuffer& ib) override; + void validate_backedges(loop_node const & node) const; + + void update_shape() override { primitive_inst::update_shape(); } + void update_output_layout(); private: network::ptr body_network; - memory::ptr get_external_memory(const primitive_id& external_id) const; - std::vector get_sliced_mem(const primitive_id& internal_id) const; + memory::ptr get_external_memory(const primitive_id& external_id, size_t mem_idx = 0) const; + layout get_external_output_layout(const primitive_id& external_id, size_t mem_idx = 0) const; + std::shared_ptr get_sliced_mem(const primitive_id& internal_id) const; std::vector _input_primitive_maps; std::vector _output_primitive_maps; std::vector _back_edges; @@ -547,9 +452,13 @@ class typed_primitive_inst : public typed_primitive_inst_base { primitive_id _initial_execution_id; primitive_id _current_iteration_id; primitive_id _condition_id; - primitive_id _num_iteration_id; - int64_t _max_iteration = 0; + primitive_id _num_iterations_id; }; using loop_inst = typed_primitive_inst; + +static inline std::ostream& operator<< (std::ostream& os, loop_inst::concatenated_memory_mapping& map) { + os << map.to_string(); + return os; +} } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/loop.cpp b/src/plugins/intel_gpu/src/graph/loop.cpp index 4e33f14e507084..a51c2d0d85973e 100644 --- 
a/src/plugins/intel_gpu/src/graph/loop.cpp +++ b/src/plugins/intel_gpu/src/graph/loop.cpp @@ -3,6 +3,8 @@ // #include "loop_inst.h" +#include "data_inst.h" +#include "mutable_data_inst.h" #include "json_object.h" #include "primitive_type_base.h" #include "intel_gpu/primitives/data.hpp" @@ -15,6 +17,41 @@ namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(loop) +std::map loop_node::get_memory_deps() const { + auto memory_deps = get_const_memory_deps(); + for (auto& i : get_shape_infer_dependencies()) { + auto& dep = get_dependency(i); + auto dep_id = dep.id(); + if (memory_deps.count(i) > 0 || i >= get_dependencies().size()) { + continue; + } + + memory::ptr mem_ptr = nullptr; + if (dep.is_type()) { + mem_ptr = dep.as().get_attached_memory_ptr(); + } else if (dep.is_type()) { + mem_ptr = dep.as().get_attached_memory_ptr(); + } + + if (mem_ptr) { + memory_deps.insert({i, mem_ptr}); + } + } + return memory_deps; +} + +static size_t convert_to_raw_axis(size_t axis, size_t ndim) { + // convert between bfyx, bfzyx, bfzyxw and tensor.size.raw + if (axis >= ndim) { + throw std::runtime_error("axis should be less than ndim"); + } + + if (axis < 2) { + return axis; + } + return (ndim - 1) - (axis - 2); +} + static bool check_if_axis_is_set_properly(loop_node const & node) { const auto& input_primitive_maps = node.get_input_primitive_maps(); @@ -30,11 +67,11 @@ static bool check_if_axis_is_set_properly(loop_node const & node) { int32_t iteration_size = -1; for (const auto& pm : input_with_axis_iteration) { auto found = std::find_if(dependencies.begin(), dependencies.end(), - [&pm](const std::pair& dep){ return dep.first->id() == pm.get().external_id; }); + [&pm](const std::pair& dep){ return dep.first->id() == pm.get().external_id.pid; }); assert(found != dependencies.end()); const layout input_layout = (*found).first->get_output_layout(); const auto shape = input_layout.get_tensor().sizes(input_layout.format); - const size_t iteration_axis = node.convert_to_raw_axis(pm.get().axis, static_cast(shape.size())); + const size_t iteration_axis = convert_to_raw_axis(pm.get().axis, static_cast(shape.size())); if (iteration_size < 0) { iteration_size = shape[iteration_axis]; } else { @@ -48,7 +85,7 @@ static bool check_if_axis_is_set_properly(loop_node const & node) { for (const auto& input_ref : input_with_axis_iteration) { const loop::io_primitive_map& input = input_ref.get(); auto dep = std::find_if(dependencies.begin(), dependencies.end(), - [&input](const std::pair& dep) { return input.external_id == dep.first->id(); }); + [&input](const std::pair& dep) { return input.external_id.pid == dep.first->id(); }); // if corresponding external id is not found if (dep == dependencies.end()) { @@ -58,75 +95,104 @@ static bool check_if_axis_is_set_properly(loop_node const & node) { return true; } -static void validate_backedges(loop_node const & node) { - const auto& back_edges = node.get_back_edges(); - const auto& input_primitive_maps = node.get_input_primitive_maps(); +layout loop_inst::calc_output_layout(loop_node const& /*node*/, kernel_impl_params const& impl_param) { + auto prim = impl_param.typed_desc(); - // check input with iteration axis has backedge - for (const auto& back_edge : back_edges) { - for (const auto& mapping : input_primitive_maps) { - if (mapping.internal_id == back_edge.to && mapping.axis >= 0) { - CLDNN_ERROR_MESSAGE(node.id(), - "input with iteration axis should not have backedges"); - } - } - } -} + // finds internal output + const auto& output_primitive_maps = prim->output_primitive_maps; 
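+ // Only the first output mapping is considered here: its internal_id selects
+ // the body-program output whose layout seeds the loop's output layout, and
+ // the iteration axis (if one is set) is widened below to max_num_iterations
+ // as an upper bound, since the real iteration count is not known yet.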
+ const auto& output_mapping = output_primitive_maps.front(); -layout loop_inst::calc_output_layout(loop_node const & node, kernel_impl_params const& impl_param) { - // body program should be built here to calculate body input layout - // from outputs of loop's dependency and calculate loop output layout - // from the outputs of body program - if (!node.get_body_program()) { - const_cast(node).build_body_program(); - } + const auto& body_program = impl_param.inner_progs.front(); + const auto& body_outputs = body_program->get_outputs(); - // type checks - const primitive_id& num_iteration_id = node.get_num_iteration_id(); - if (!node.get_program().get_node(num_iteration_id).is_type()) { - CLDNN_ERROR_MESSAGE(node.id(), "num_iteration is not mutable_data"); + const primitive_id& output_internal_id = output_mapping.internal_id.pid; + auto target = std::find_if(body_outputs.begin(), body_outputs.end(), [&](const cldnn::program_node * output) { + return output->id() == output_internal_id; + }); + OPENVINO_ASSERT(target != body_outputs.end(), impl_param.desc->id, "output not found"); + + // set body output layout + layout loop_output_layout = (*target)->get_output_layout(); + const int64_t axis_to_iterate_through = output_mapping.axis; + if (axis_to_iterate_through != -1) { + const size_t ndim = loop_output_layout.get_rank(); + auto shape = loop_output_layout.get_dims(); + shape[axis_to_iterate_through] = static_cast(prim->max_num_iterations); + loop_output_layout.set_tensor(tensor(format::get_default_format(ndim), shape)); } - if (!check_if_axis_is_set_properly(node)) { - CLDNN_ERROR_MESSAGE(node.id(), "axis is not set properly"); - } + return loop_output_layout; +} +template +static std::vector get_output_layouts(kernel_impl_params const& impl_param, std::vector body_outputs, const int64_t num_iterations = -1) { + auto prim = impl_param.typed_desc(); + std::vector output_layouts; + + const auto& output_primitive_maps = prim->output_primitive_maps; + for (auto& output_mapping : output_primitive_maps) { + const primitive_id& output_internal_id = output_mapping.internal_id.pid; + auto target = std::find_if(body_outputs.begin(), body_outputs.end(), [&](const T output) { + return output->id() == output_internal_id; + }); + OPENVINO_ASSERT(target != body_outputs.end(), impl_param.desc->id, "output not found"); - // finds internal output - const auto& output_primitive_maps = node.get_output_primitive_maps(); - const auto& output_mapping = output_primitive_maps.front(); - const auto& body_outputs = node.get_body_program()->get_outputs(); - const primitive_id& output_internal_id = output_mapping.internal_id; - auto target = std::find_if(body_outputs.begin(), body_outputs.end(), [&](const cldnn::program_node * output) { - return output->id() == output_internal_id; - }); - layout loop_output_layout; - if (target == body_outputs.end()) { - CLDNN_ERROR_MESSAGE(impl_param.desc->id, "output not found"); - } else { // set body output layout - loop_output_layout = (*target)->get_output_layout(); - const int64_t axis_to_iterate_throgh = output_mapping.axis; - if (axis_to_iterate_throgh != -1) { - const size_t ndim = loop_output_layout.get_rank(); - auto shape = loop_output_layout.get_dims(); - shape[axis_to_iterate_throgh] = static_cast(node.get_max_iteration()); - loop_output_layout.set_tensor(tensor(format::get_default_format(ndim), shape)); + layout loop_output_layout = (*target)->get_output_layout(); + const int64_t axis_to_iterate_through = output_mapping.axis; + if (axis_to_iterate_through != -1) { + auto 
shape = loop_output_layout.get_partial_shape(); + shape[axis_to_iterate_through] = static_cast(num_iterations); + loop_output_layout.set_partial_shape(shape); } + output_layouts.push_back(loop_output_layout); } - return loop_output_layout; + return output_layouts; +} + +template +std::vector loop_inst::calc_output_layouts(loop_node const& /*node*/, kernel_impl_params const& impl_param) { + std::vector output_layouts; + auto prim = impl_param.typed_desc(); + if (impl_param.inner_nets.empty()) { + OPENVINO_ASSERT(impl_param.inner_progs.size() == 1, "Loop(", prim->id, ") should have only one inner network"); + const auto& body_outputs = impl_param.inner_progs.front()->get_outputs(); + output_layouts = get_output_layouts(impl_param, body_outputs); + } else { + auto& memory_deps = impl_param.memory_deps; + const size_t current_iteration_idx = 0; + OPENVINO_ASSERT(memory_deps.count(current_iteration_idx) > 0, "The count of memory deps(current_iteration) should not be zero"); + cldnn::mem_lock current_iterations_lock(memory_deps.at(current_iteration_idx), impl_param.get_stream()); + int64_t current_iteration = static_cast(*current_iterations_lock.data()); + GPU_DEBUG_LOG << "* current_iteration(" << memory_deps.at(current_iteration_idx) << ") : " << current_iteration << std::endl; + + OPENVINO_ASSERT(impl_param.inner_nets.size() == 1, "Loop(", prim->id, ") should have only one inner program"); + const auto& body_outputs = impl_param.inner_nets.front()->get_outputs(); + output_layouts = get_output_layouts>(impl_param, body_outputs, current_iteration); + } + return output_layouts; } +template std::vector loop_inst::calc_output_layouts(loop_node const& node, const kernel_impl_params& impl_param); + + std::string loop_inst::to_string(const loop_node & node) { auto desc = node.get_primitive(); auto node_info = node.desc_to_json(); + std::vector body_inputs; + { + for (auto& input : desc->body_program->get_inputs()) { + body_inputs.push_back(input->id()); + } + } + json_composite loop_info; - loop_info.add("body input id", desc->body.get_primitives_ids()); + loop_info.add("body input id", body_inputs); loop_info.add("trip_count_id", desc->trip_count_id); - loop_info.add("initial_execution_id", desc->initial_execution_id); - loop_info.add("current_iteration_id", desc->current_iteration_id); - loop_info.add("condition_id", desc->condition_id); + loop_info.add("first_execution_condition_id", desc->first_execution_condition_id); + loop_info.add("body_current_iteration_id", desc->body_current_iteration_id); + loop_info.add("body_execution_condition_id", desc->body_execution_condition_id); std::stringstream primitive_description; node_info->add("loop info", loop_info); @@ -142,23 +208,23 @@ static std::vector find_io_primitive_maps( std::vector ret; if (is_external) { for (const auto& it : input_primitive_maps) { - if (it.external_id == prim_id) { + if (it.external_id.pid == prim_id) { ret.push_back(&it); } } for (const auto& it : output_primitive_maps) { - if (it.external_id == prim_id) { + if (it.external_id.pid == prim_id) { ret.push_back(&it); } } } else { for (const auto& it : input_primitive_maps) { - if (it.internal_id == prim_id) { + if (it.internal_id.pid == prim_id) { ret.push_back(&it); } } for (const auto& it : output_primitive_maps) { - if (it.internal_id == prim_id) { + if (it.internal_id.pid == prim_id) { ret.push_back(&it); } } @@ -175,24 +241,19 @@ static void validate_mappings(loop_node const & node) { for (const auto& id : outer_inputs) { if (id == node.get_trip_count_id() || id == 
node.get_initial_execution_id() ||
- id == node.get_num_iteration_id()) {
+ id == node.get_num_iterations_id()) {
 continue;
 }

 const auto results = find_io_primitive_maps(node.get_input_primitive_maps(),
 node.get_output_primitive_maps(),
 id,
 true);
- if (results.size() == 0) {
- std::string msg = "outer input '" + id + "' does not have primitive map";
- CLDNN_ERROR_MESSAGE(node.id(), msg.c_str());
- }
+ OPENVINO_ASSERT(results.size() > 0, node.id(), " : outer input '", id, "' does not have primitive map");
 }

 // check all io_primitive_maps have their corresponding external id
 for (const auto& pm : input_primitive_maps) {
- auto found = std::find(outer_inputs.begin(), outer_inputs.end(), pm.external_id);
- if (found == outer_inputs.end()) {
- std::string msg = "external id '" + pm.external_id + "' in primitive map cannot be found loop inputs";
- CLDNN_ERROR_MESSAGE(node.id(), msg.c_str());
- }
+ auto found = std::find(outer_inputs.begin(), outer_inputs.end(), pm.external_id.pid);
+ OPENVINO_ASSERT(found != outer_inputs.end(), node.id(),
+ " : external id '", pm.external_id.pid, "' in primitive map cannot be found in loop inputs");
 }

 const auto& nodes = node.get_body_program()->get_processing_order();
@@ -200,50 +261,25 @@
 // check all io_primitive_maps have their corresponding interal id
 for (const auto& pm : input_primitive_maps) {
 auto found = std::find_if(nodes.begin(), nodes.end(), [&pm](const program_node* body_input) {
- return body_input->id() == pm.internal_id;
+ return body_input->id() == pm.internal_id.pid;
 });
- if (found == nodes.end()) {
- std::string msg = "internal id '" + pm.internal_id + "' in primitive map cannot be found loop body";
- CLDNN_ERROR_MESSAGE(node.id(), msg.c_str());
- }
+ OPENVINO_ASSERT(found != nodes.end(), node.id(),
+ " : internal id '", pm.internal_id.pid, "' in primitive map cannot be found in loop body");
 }

 for (const auto& pm : output_primitive_maps) {
 auto found = std::find_if(nodes.begin(), nodes.end(), [&pm](const program_node* body_output) {
- return body_output->id() == pm.internal_id;
+ return body_output->id() == pm.internal_id.pid;
 });
- if (found == nodes.end()) {
- std::string msg = "internal id '" + pm.internal_id + "' in primitive map cannot be found body body";
- CLDNN_ERROR_MESSAGE(node.id(), msg.c_str());
- }
+ OPENVINO_ASSERT(found != nodes.end(), node.id(),
+ " : internal id '", pm.internal_id.pid, "' in primitive map cannot be found in loop body");
 }
 }

-void loop_inst::update_mapped_memory() {
- if (!preproc_memories_done) {
- return;
- }
- // update output memory
- for (size_t i = 0; i < _output_primitive_maps.size(); ++i) {
- const auto& output_mapping = _output_primitive_maps.at(i);
- const primitive_id& external_id = output_mapping.external_id;
- const primitive_id& internal_id = output_mapping.internal_id;
- memory::ptr to_mem = get_external_memory(external_id);
- if (output_mapping.axis < 0) {
- body_network->get_primitive(internal_id)->set_output_memory(to_mem);
- } else {
- for (auto& mem_mapping : concatenated_output_mem_mappings) {
- if (mem_mapping.sliced_data_prim->id() == internal_id) {
- mem_mapping.concatenated_mem = to_mem;
- break;
- }
- }
- }
- }
- // update input memory
+void loop_inst::update_input_mapped_memory() {
 for (size_t memory_num = 0; memory_num < inputs_memory_count(); memory_num++) {
 const primitive_id& input_external_id = dependencies().at(memory_num).first->id();
 auto input_map_ptrs = find_io_primitive_maps(_input_primitive_maps,
-
input_external_id, true); + _output_primitive_maps, input_external_id, true); if (input_map_ptrs.empty()) { if (input_external_id == _trip_count_id || input_external_id == _initial_execution_id) { @@ -257,36 +293,69 @@ void loop_inst::update_mapped_memory() { bool is_concatenated_input = (input_map->axis >= 0); if (is_concatenated_input) { for (auto& mem_mapping : concatenated_input_mem_mappings) { - if (mem_mapping.sliced_data_prim->id() == input_map->internal_id) { - mem_mapping.concatenated_mem = memory; + if (mem_mapping->sliced_data_prim->id() == input_map->internal_id.pid) { + mem_mapping->update_concatenated_mem(memory); break; } } } else { - body_network->set_input_data(input_map->internal_id, memory); + body_network->set_input_data(input_map->internal_id.pid, memory); } } } - //update backedges memory +} + +void loop_inst::update_output_mapped_memory() { + if (is_dynamic()) { + if (!outputs_allocated()) { + _outputs = allocate_outputs(_impl_params.get(), true, true); + } + } + + for (size_t i = 0; i < _output_primitive_maps.size(); ++i) { + const auto& output_mapping = _output_primitive_maps.at(i); + const primitive_id& external_id = output_mapping.external_id.pid; + const size_t external_mem_idx = output_mapping.external_id.idx; + const primitive_id& internal_id = output_mapping.internal_id.pid; + const size_t internal_mem_idx = output_mapping.internal_id.idx; + + memory::ptr to_mem = get_external_memory(external_id, external_mem_idx); + if (to_mem) { + if (output_mapping.axis < 0) { + body_network->get_primitive(internal_id)->set_output_memory(to_mem, true, internal_mem_idx); + } else { + for (auto& mem_mapping : concatenated_output_mem_mappings) { + if (mem_mapping->sliced_data_prim->id() == internal_id) { + mem_mapping->update_concatenated_mem(to_mem); + break; + } + } + } + } + } +} + +void loop_inst::update_backedge_mapped_memory() { // checking if memory is a destination of a backedge for (const auto& back_edge : _back_edges) { //find corresponding input of the backedge const auto input_map_ptrs = find_io_primitive_maps(_input_primitive_maps, - _output_primitive_maps, back_edge.to, false); + _output_primitive_maps, back_edge.to, false); assert(input_map_ptrs.size() == 1); const auto& input_map = input_map_ptrs.front(); - auto backedged_sliced_output_mems = get_sliced_mem(back_edge.from); + auto backedged_sliced_output = get_sliced_mem(back_edge.from); const auto backedge_to_prim = body_network->get_primitive(back_edge.to); const auto backedge_from_prim = body_network->get_primitive(back_edge.from); - memory::ptr initial_mem = get_external_memory(input_map->external_id); + + memory::ptr initial_mem = get_external_memory(input_map->external_id.pid, input_map->external_id.idx); for (auto& backedge_mapping : backedge_memory_mappings) { if (backedge_mapping.from_primitive->id() == backedge_from_prim->id() && backedge_mapping.to_primitive->id() == backedge_to_prim->id()) { - if (backedged_sliced_output_mems.empty()) { + if (backedged_sliced_output == nullptr) { // backedge output which does not need concatenation const auto output_mapping = find_io_primitive_maps(_input_primitive_maps, - _output_primitive_maps, back_edge.from, false); + _output_primitive_maps, back_edge.from, false); memory::ptr backedge_mem; if (output_mapping.empty()) { // from and to primitives in backedge are connected directly @@ -300,14 +369,15 @@ void loop_inst::update_mapped_memory() { backedge_mem = body_network->get_engine().allocate_memory(output_layout, 0); } } else { - backedge_mem = 
get_external_memory(output_mapping.front()->external_id); + auto external_id = output_mapping.front()->external_id; + backedge_mem = get_external_memory(external_id.pid, external_id.idx); } body_network->set_input_data(back_edge.to, backedge_mem); body_network->set_output_memory(back_edge.from, backedge_mem); - backedge_mapping.from_mems = { backedge_mem }; + backedge_mapping.from_mem = backedge_mem; backedge_mapping.initial_mem = initial_mem; } else { - backedge_mapping.from_mems = backedged_sliced_output_mems; + backedge_mapping.concat_mem_mapping = backedged_sliced_output; backedge_mapping.initial_mem = initial_mem; } break; @@ -316,92 +386,138 @@ void loop_inst::update_mapped_memory() { } } + +void loop_inst::update_mapped_memory() { + if (!preproc_memories_done) { + return; + } + + update_output_mapped_memory(); + update_input_mapped_memory(); + update_backedge_mapped_memory(); +} + event::ptr loop_inst::set_output_memory(memory::ptr mem, bool check, size_t idx) { auto ev = primitive_inst::set_output_memory(mem, check, idx); update_mapped_memory(); return ev; } -void loop_inst::preprocess_output_memory() { - auto& engine = _network.get_engine(); - concatenated_output_mem_mappings.reserve(_output_primitive_maps.size()); - for (size_t i = 0; i < _output_primitive_maps.size(); ++i) { - const auto& output_mapping = _output_primitive_maps.at(i); - const primitive_id& external_id = output_mapping.external_id; - const primitive_id& internal_id = output_mapping.internal_id; - if (output_mapping.axis < 0) { - memory::ptr memory = get_external_memory(external_id); - body_network->get_primitive(internal_id)->set_output_memory(memory); +loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(const input_info& internal_id, + const cldnn::loop::io_primitive_map& io_prim_map, + memory::ptr mem_ptr, + const int64_t trip_count) { + auto& engine = body_network->get_engine(); + auto& stream = body_network->get_stream(); + auto prim = body_network->get_primitive(internal_id.pid); + const int64_t start = io_prim_map.start < 0? trip_count - 1: io_prim_map.start; + + std::vector sliced_mems; + int64_t num_elements_iteration = 0; + + // If memory is nullptr, the memory is not allocated yet because the current network is a dynamic shape model. + // In a dynamic model, we can't calculate num_elements_iteration, start, and sliced_layout; + // these parameters will be recalculated during backedge preprocessing after the first execution.
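(Editor's aside: a minimal, standalone sketch of the slicing arithmetic that create_concat_memory_map() performs on the static-shape path below. It is illustrative only: get_batch_size() here is a simplified stand-in for concatenated_memory_mapping::get_batch_size(), and a plain dimension vector stands in for cldnn::layout.)

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in (assumption) for concatenated_memory_mapping::get_batch_size():
// product of the dimensions in front of the iteration axis.
static int64_t get_batch_size(const std::vector<int64_t>& dims, int axis) {
    int64_t batch = 1;
    for (int i = 0; i < axis; ++i)
        batch *= dims[i];
    return batch;
}

int main() {
    // A sliced body tensor of shape [2, 5, 4, 4], iterated along axis 1.
    const std::vector<int64_t> sliced_dims{2, 5, 4, 4};
    const int axis = 1;
    int64_t count = 1;
    for (auto d : sliced_dims) count *= d;                                   // 160

    // Mirrors the patch: elements copied per iteration = total / batch part.
    const int64_t num_elements_batch = get_batch_size(sliced_dims, axis);    // 2
    const int64_t num_elements_iteration = count / num_elements_batch;       // 80
    // As in the patch, a negative start parameter means "start from the last slice".
    const int64_t trip_count = 5, start_param = -1;
    const int64_t start = start_param < 0 ? trip_count - 1 : start_param;    // 4
    std::printf("per-iteration elements: %lld, start slice: %lld\n",
                (long long)num_elements_iteration, (long long)start);
    return 0;
}
```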
+ if (mem_ptr != nullptr) { + layout sliced_layout = prim->output_memory(internal_id.idx).get_layout(); + + // When trip_count is -1, allocate only the first sliced_mem here; additional sliced memories are allocated later if required + if (trip_count < 0) { + memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0); + sliced_mems.push_back(sliced_mem); } else { - memory::ptr to_mem = get_external_memory(external_id); - auto output_prim = body_network->get_primitive(internal_id); - layout sliced_layout = output_prim->output_memory().get_layout(); - - const int64_t max_iteration = _max_iteration; - std::vector sliced_mems; - sliced_mems.reserve(max_iteration); - for (int32_t j = 0; j < max_iteration; ++j) { + sliced_mems.reserve(trip_count); + for (int j=0; j < trip_count; ++j) { memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0); sliced_mems.push_back(sliced_mem); } + } - const int64_t num_elements_batch = concatenated_memory_mapping::get_batch_size( - sliced_layout, output_mapping.axis); - const int64_t num_elements_iteration = sliced_layout.count() / num_elements_batch; - const int64_t start = output_mapping.start < 0? _max_iteration - 1: output_mapping.start; - concatenated_memory_mapping memory_mapping_info( - output_mapping.axis, std::move(to_mem), sliced_mems, _network.get_stream(), - num_elements_iteration, output_mapping.stride, start); - memory_mapping_info.sliced_data_prim = body_network->get_primitive(internal_id); - memory_mapping_info.concat_data_prim = get_network().get_primitive(external_id); - concatenated_output_mem_mappings.push_back(memory_mapping_info); + const int64_t num_elements_batch = concatenated_memory_mapping::get_batch_size( + sliced_layout, io_prim_map.axis); + num_elements_iteration = sliced_layout.count() / num_elements_batch; + } + + auto concat_memory_mapping = std::make_shared( + io_prim_map.axis, mem_ptr, sliced_mems, stream, + engine, num_elements_iteration, io_prim_map.stride, start); + concat_memory_mapping->sliced_data_prim = body_network->get_primitive(internal_id.pid); + return concat_memory_mapping; +} + +void loop_inst::preprocess_output_memory(const int64_t trip_count) { + if (concatenated_output_mem_mappings.empty()) + concatenated_output_mem_mappings.reserve(_output_primitive_maps.size()); + for (size_t i = 0; i < _output_primitive_maps.size(); ++i) { + const auto& output_mapping = _output_primitive_maps.at(i); + const auto& external_id = output_mapping.external_id; + const auto& internal_id = output_mapping.internal_id; + GPU_DEBUG_LOG << i << ") output mapping - external " << external_id.to_string() << std::endl; + GPU_DEBUG_LOG << i << ") output mapping - internal " << internal_id.to_string() << std::endl; + + memory::ptr memory = get_external_memory(external_id.pid, external_id.idx); + if (output_mapping.axis < 0) { + // In a dynamic model, don't take the output memory of the loop node here because the body network's output layouts are not calculated yet + if (memory != nullptr) { + body_network->get_primitive(internal_id.pid)->set_output_memory(memory, true, internal_id.idx); + } + } else { + auto iter = std::find_if(concatenated_output_mem_mappings.begin(), concatenated_output_mem_mappings.end(), + [&](loop_inst::concatenated_memory_mapping::ptr concat_mem_map) -> bool { + return concat_mem_map->sliced_data_prim->id() == internal_id.pid; + }); + if (iter == concatenated_output_mem_mappings.end()) { + auto memory_mapping_info = create_concat_memory_map(internal_id, output_mapping, memory, trip_count); + memory_mapping_info->concat_data_prim =
get_network().get_primitive(external_id.pid); + concatenated_output_mem_mappings.push_back(memory_mapping_info); + GPU_DEBUG_LOG << i << ") generate concat output memory mapping: " << memory_mapping_info->to_string() << std::endl; + } + GPU_DEBUG_IF(iter != concatenated_output_mem_mappings.end()) { + GPU_DEBUG_LOG << i << ") memory_mapping_info already exists : " << (*iter)->to_string() << std::endl; + } } } } -void loop_inst::preprocess_input_memory() { - auto& engine = _network.get_engine(); - auto& iteration_mem = concatenated_input_mem_mappings; +void loop_inst::preprocess_input_memory(const int64_t trip_count) { for (size_t memory_num = 0; memory_num < inputs_memory_count(); memory_num++) { const primitive_id& input_external_id = dependencies().at(memory_num).first->id(); auto input_map_ptrs = find_io_primitive_maps(_input_primitive_maps, - _output_primitive_maps, input_external_id, true); + _output_primitive_maps, input_external_id, true); if (input_map_ptrs.size() == 0) { - if (input_external_id == _trip_count_id || - input_external_id == _initial_execution_id) { - continue; - } - CLDNN_ERROR_MESSAGE(id(), "loop primitive_map is incomplete"); + OPENVINO_ASSERT((input_external_id == _trip_count_id + || input_external_id == _num_iterations_id + || input_external_id == _initial_execution_id), + id(), "loop primitive_map is incomplete: " + "input_external_id(", input_external_id, ") != _trip_count_id(", _trip_count_id, ")", + " && input_external_id(", input_external_id, ") != _num_iterations_id(", _num_iterations_id, ")", + " && input_external_id(", input_external_id, ") != _initial_execution_id(", _initial_execution_id, ")"); + continue; } auto memory = input_memory_ptr(memory_num); for (size_t i = 0; i < input_map_ptrs.size(); ++i) { const auto input_map = input_map_ptrs.at(i); - bool is_concatenated_input = (input_map->axis >= 0); - if (is_concatenated_input) { - layout sliced_layout - = body_network->get_primitive(input_map->internal_id)->output_memory().get_layout(); - const int64_t max_iteration = _max_iteration; - std::vector sliced_mems; - sliced_mems.reserve(max_iteration); - for (int j=0; j < max_iteration; ++j) { - memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0); - sliced_mems.push_back(sliced_mem); - } - const int64_t num_elements_batch = concatenated_memory_mapping::get_batch_size( - sliced_layout, input_map->axis); - const int64_t num_elements_iteration = sliced_layout.count() / num_elements_batch; - const int64_t start = input_map->start < 0?
_max_iteration - 1: input_map->start; - concatenated_memory_mapping concatenated_input_mem_mapping_info( - input_map->axis, memory, sliced_mems, _network.get_stream(), - num_elements_iteration, input_map->stride, start); - concatenated_input_mem_mapping_info.sliced_data_prim = body_network->get_primitive(input_map->internal_id); - iteration_mem.push_back(concatenated_input_mem_mapping_info); + const auto& external_id = input_map->external_id; + const auto& internal_id = input_map->internal_id; + GPU_DEBUG_LOG << i << ") input mapping - external " << external_id.to_string() << std::endl; + GPU_DEBUG_LOG << i << ") input mapping - internal " << internal_id.to_string() << std::endl; + + if (input_map->axis >= 0) { + OPENVINO_ASSERT(trip_count > 0, "In preprocessing concat input mapping, trip_count should be positive"); + OPENVINO_ASSERT(memory != nullptr, "In preprocessing concat input mapping, concat memory should be allocated"); + auto memory_mapping_info = create_concat_memory_map(internal_id, *input_map, memory, trip_count); + concatenated_input_mem_mappings.push_back(memory_mapping_info); + GPU_DEBUG_LOG << i << ") generate concat input memory mapping: " << memory_mapping_info->to_string() << std::endl; } else { - if (memory->get_layout().data_type != body_network->get_primitive(input_map->internal_id)->output_memory().get_layout().data_type) { - CLDNN_ERROR_MESSAGE(id(), "incompatible datatypes"); + auto input_inst = body_network->get_primitive(internal_id.pid); + if (memory->get_layout() != input_inst->get_output_layout()) { + input_inst->set_output_layout(memory->get_layout()); + GPU_DEBUG_LOG << input_inst->id() << " is changed memory because layout is changed from " + << input_inst->get_output_layout().to_short_string() + << " to " << memory->get_layout().to_short_string() << std::endl; } - body_network->set_input_data(input_map->internal_id, memory); + body_network->set_input_data(internal_id.pid, memory); } } } @@ -409,88 +525,141 @@ void loop_inst::preprocess_input_memory() { void loop_inst::preprocess_backedge_memory() { // checking if memory is a destination of a backedge - for (const auto& back_edge : _back_edges) { + for (size_t idx = 0; idx < _back_edges.size(); idx++) { + const auto& back_edge = _back_edges[idx]; //find corresponding input of the backedge const auto input_map_ptrs = find_io_primitive_maps(_input_primitive_maps, - _output_primitive_maps, back_edge.to, false); + _output_primitive_maps, back_edge.to, false); const auto backedge_to_prim = body_network->get_primitive(back_edge.to); const auto backedge_from_prim = body_network->get_primitive(back_edge.from); memory::ptr initial_mem; - if (back_edge.to == _current_iteration_id) { - const layout current_iteration_layout = backedge_to_prim->output_memory().get_layout(); - initial_mem = get_network().get_engine().allocate_memory(current_iteration_layout); - auto& stream = get_network().get_stream(); - loop_node::write_scalar_value(initial_mem, stream, 0); - current_iteratoin_backedge_mapping_idx = backedge_memory_mappings.size(); + OPENVINO_ASSERT(!input_map_ptrs.empty(), id(), " has no input_mapping for backedged input"); + auto& external_id = input_map_ptrs.front()->external_id; + initial_mem = get_external_memory(external_id.pid, external_id.idx); + + GPU_DEBUG_LOG << idx << ") back_edge mapping - back_edge.from " << back_edge.from << std::endl; + GPU_DEBUG_LOG << idx << ") back_edge mapping - back_edge.to " << back_edge.to << std::endl; + + auto backedged_sliced_output = get_sliced_mem(back_edge.from); + const 
auto output_mapping = find_io_primitive_maps(_input_primitive_maps, + _output_primitive_maps, back_edge.from, false); + if (backedged_sliced_output != nullptr) { + // CONCAT_OUTPUT mode, backedge output which needs concatenation + backedge_memory_mappings.emplace_back( + backedge_from_prim, backedge_to_prim, backedged_sliced_output, initial_mem, body_network->get_stream()); + GPU_DEBUG_LOG << idx << ") add back_edge mapping with CONCAT_OUTPUT type, backedged_sliced_output(" + << backedged_sliced_output << "), initial_mem(" << initial_mem << ")" << std::endl; + } else if (output_mapping.empty() && backedge_to_prim == backedge_from_prim->dependencies().front().first) { + // SINGLE mode, from and to primitives in backedge are connected directly + backedge_memory_mappings.emplace_back( + backedge_from_prim, backedge_to_prim, initial_mem, body_network->get_stream()); + GPU_DEBUG_LOG << idx << ") add back_edge mapping with SINGLE type, initial_mem(" << initial_mem << ")" << std::endl; } else { - if (input_map_ptrs.empty()) { - CLDNN_ERROR_MESSAGE(id(), "no input_mapping for backedged input"); - } - initial_mem = get_external_memory(input_map_ptrs.front()->external_id); - } - - auto backedged_sliced_output_mems = get_sliced_mem(back_edge.from); - if (backedged_sliced_output_mems.empty()) { - // backedge output which does not need concatenation - const auto output_mapping = find_io_primitive_maps(_input_primitive_maps, - _output_primitive_maps, back_edge.from, false); + // SINGLE_SHARED mode memory::ptr backedge_mem; - if (output_mapping.empty()) { - // from and to primitives in backedge are connected directly - if (backedge_to_prim == backedge_from_prim->dependencies().front().first) { - backedge_memory_mappings.emplace_back( - backedge_from_prim, backedge_to_prim, initial_mem, body_network->get_stream()); - continue; + auto output_prim = body_network->get_primitive(back_edge.from); + + if (is_dynamic()) { + if (output_prim->outputs_allocated()) { + auto internal_output_prim_mem = output_prim->output_memory_ptr(); + if (internal_output_prim_mem->get_layout() == initial_mem->get_layout()) { + backedge_mem = internal_output_prim_mem; + body_network->set_input_data(back_edge.to, backedge_mem); + GPU_DEBUG_LOG << idx << ") Get backedge_mem(" << backedge_mem + << ") from back_edge.from(" << back_edge.from << ")" << std::endl; + } else { + // When input layout is changed or backedge_mem is null + // because output layout of body network is not calculated yet, + // Set backedge_mem to nullptr and update it after first execution. 
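(Editor's aside: the rewritten preprocess_backedge_memory() in this hunk distinguishes three backedge mapping modes, CONCAT_OUTPUT, SINGLE, and SINGLE_SHARED. A condensed sketch of that decision logic, using hypothetical stand-in types rather than the real cldnn classes; the actual hunk resumes right below.)

```cpp
#include <cstdio>
#include <memory>

struct concat_mapping {};  // stand-in (assumption) for concatenated_memory_mapping

enum class backedge_mode {
    CONCAT_OUTPUT,   // back_edge.from produces a sliced output that must be concatenated
    SINGLE,          // from/to are directly connected and there is no output mapping
    SINGLE_SHARED,   // from/to share a single buffer, possibly the loop's external output
};

// Mirrors the branch order in the hunk: sliced output wins, then the
// direct-connection case, otherwise the shared-buffer case.
static backedge_mode select_mode(const std::shared_ptr<concat_mapping>& sliced_output,
                                 bool has_output_mapping,
                                 bool from_feeds_to_directly) {
    if (sliced_output != nullptr)
        return backedge_mode::CONCAT_OUTPUT;
    if (!has_output_mapping && from_feeds_to_directly)
        return backedge_mode::SINGLE;
    return backedge_mode::SINGLE_SHARED;
}

int main() {
    auto mode = select_mode(nullptr, /*has_output_mapping=*/true, /*from_feeds_to_directly=*/false);
    std::printf("mode = %d (SINGLE_SHARED)\n", static_cast<int>(mode));
    return 0;
}
```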
+ body_network->set_input_data(back_edge.to, initial_mem); + GPU_DEBUG_LOG << idx << ") Just set input data using initial_mem because back_edge.from(" + << back_edge.from << ") layout is changed or backedge_mem is nullptr" << std::endl; + } } else { - auto output_prim = body_network->get_primitive(back_edge.from); - layout output_layout = output_prim->output_memory().get_layout(); - backedge_mem = body_network->get_engine().allocate_memory(output_layout, 0); + body_network->set_input_data(back_edge.to, initial_mem); + GPU_DEBUG_LOG << idx << ") Just set input data using initial_mem because back_edge.from(" + << back_edge.from << ") has dynamic layout now" << std::endl; } } else { - backedge_mem = get_external_memory(output_mapping.front()->external_id); + if (output_mapping.empty()) { + backedge_mem = output_prim->output_memory_ptr(); + body_network->set_input_data(back_edge.to, backedge_mem); + GPU_DEBUG_LOG << idx << ") Get backedge_mem(" << backedge_mem + << ") from back_edge.from(" << back_edge.from << ")" << std::endl; + } else { + // Set input and output memory for body_network using external output memory of loop op + auto& out_mapping_ext_id = output_mapping.front()->external_id; + backedge_mem = get_external_memory(out_mapping_ext_id.pid, out_mapping_ext_id.idx); + GPU_DEBUG_LOG << idx << ") Get backedge_mem(" << backedge_mem << ") from output_mapping_external_id.pid(" + << out_mapping_ext_id.pid << ")" << std::endl; + + body_network->set_input_data(back_edge.to, backedge_mem); + body_network->set_output_memory(back_edge.from, backedge_mem); + } } - body_network->set_input_data(back_edge.to, backedge_mem); - body_network->set_output_memory(back_edge.from, backedge_mem); + backedge_memory_mappings.emplace_back( backedge_from_prim, backedge_to_prim, backedge_mem, initial_mem, body_network->get_stream()); - } else { - // backedge output which needs concatenation - backedge_memory_mappings.emplace_back( - backedge_from_prim, backedge_to_prim, backedged_sliced_output_mems, initial_mem, body_network->get_stream()); + GPU_DEBUG_LOG << idx << ") add back_edge mapping with SINGLE_SHARED type, backedge_mem(" + << backedge_mem << "), initial_mem(" << initial_mem << ")" << std::endl; } } } -std::vector loop_inst::get_sliced_mem(const primitive_id& internal_id) const { +std::shared_ptr loop_inst::get_sliced_mem(const primitive_id& internal_id) const { for (const auto& mem_mapping : concatenated_input_mem_mappings) { - if (mem_mapping.sliced_data_prim->id() == internal_id) { - return mem_mapping.sliced_mems; + if (mem_mapping->sliced_data_prim->id() == internal_id) { + return mem_mapping; } } for (const auto& mem_mapping : concatenated_output_mem_mappings) { - if (mem_mapping.sliced_data_prim->id() == internal_id) { - return mem_mapping.sliced_mems; + if (mem_mapping->sliced_data_prim->id() == internal_id) { + return mem_mapping; + } + } + return nullptr; // not found +} + +void loop_inst::validate_backedges(loop_node const & node) const { + const auto& back_edges = node.get_back_edges(); + const auto& input_primitive_maps = node.get_input_primitive_maps(); + + // check input with iteration axis has backedge + for (const auto& back_edge : back_edges) { + for (const auto& mapping : input_primitive_maps) { + OPENVINO_ASSERT((mapping.internal_id.pid != back_edge.to || mapping.axis < 0), + node.id(), ": input with iteration axis should not have backedges"); } } - return {}; // not found } -memory::ptr loop_inst::get_external_memory(const primitive_id& external_id) const { +memory::ptr 
loop_inst::get_external_memory(const primitive_id& external_id, size_t mem_idx) const { const auto outputPrim = _network.get_primitive(external_id); - return outputPrim->output_memory_ptr(); + if (outputPrim->outputs_allocated()) { + return outputPrim->output_memory_ptr(mem_idx); + } + return nullptr; +} + +layout loop_inst::get_external_output_layout(const primitive_id& external_id, size_t mem_idx) const { + const auto outputPrim = _network.get_primitive(external_id); + return outputPrim->get_output_layout(mem_idx); } loop_inst::typed_primitive_inst(network & network, loop_node const & node) : parent(network, node), - preproc_memories_done(false), - body_network(network::allocate_network(network.get_stream_ptr(), - node.get_body_program(), - false, - network.is_primary_stream())) { - if (!check_if_axis_is_set_properly(node)) - CLDNN_ERROR_MESSAGE(node.id(), "axis is not set properly"); - + preproc_memories_done(false), + body_network(network::allocate_network(network.get_stream_ptr(), + node.get_body_program(), + false, + network.is_primary_stream())) { + const primitive_id& num_iterations_id = node.get_num_iterations_id(); + OPENVINO_ASSERT(node.get_program().get_node(num_iterations_id).is_type(), + node.id(), ": num_iterations is not mutable_data"); + OPENVINO_ASSERT(check_if_axis_is_set_properly(node), node.id(), ": axis is not set properly"); + + set_inner_networks({body_network}); validate_backedges(node); validate_mappings(node); @@ -500,9 +669,8 @@ loop_inst::typed_primitive_inst(network & network, loop_node const & node) _trip_count_id = node.get_trip_count_id(); _initial_execution_id = node.get_initial_execution_id(); _current_iteration_id = node.get_current_iteration_id(); - _condition_id = node.get_condition_id(); - _num_iteration_id = node.get_num_iteration_id(); - _max_iteration = node.get_max_iteration(); + _condition_id = node.get_execution_condition_id(); + _num_iterations_id = node.get_num_iterations_id(); } void loop_inst::save(BinaryOutputBuffer& ob) const { @@ -514,8 +682,7 @@ void loop_inst::save(BinaryOutputBuffer& ob) const { ob << _initial_execution_id; ob << _current_iteration_id; ob << _condition_id; - ob << _num_iteration_id; - ob << _max_iteration; + ob << _num_iterations_id; body_network->save(ob); } @@ -529,9 +696,97 @@ void loop_inst::load(BinaryInputBuffer& ib) { ib >> _initial_execution_id; ib >> _current_iteration_id; ib >> _condition_id; - ib >> _num_iteration_id; - ib >> _max_iteration; + ib >> _num_iterations_id; body_network = std::make_shared(ib, get_network().get_stream_ptr(), get_network().get_engine(), get_network().is_primary_stream(), 0); } +void loop_inst::postprocess_output_memory(bool is_dynamic) { + if (is_dynamic) { + for (size_t i = 0; i < _output_primitive_maps.size(); ++i) { + const auto& output_mapping = _output_primitive_maps.at(i); + const auto& external_id = output_mapping.external_id; + const auto& internal_id = output_mapping.internal_id; + if (output_mapping.axis < 0) { + auto internalOutputPrim = get_body_network()->get_primitive(internal_id.pid); + auto internal_mem = internalOutputPrim->output_memory_ptr(internal_id.idx); + if (internal_mem == nullptr) { + continue; + } + auto externalOutputPrim = _network.get_primitive(external_id.pid); + if (!externalOutputPrim->outputs_allocated()) { + externalOutputPrim->set_output_memory(internal_mem, external_id.idx); + } else { + auto external_mem = externalOutputPrim->output_memory_ptr(external_id.idx); + if (external_mem->get_layout() != internal_mem->get_layout()) { + 
externalOutputPrim->set_output_memory(internal_mem, external_id.idx); + } else if (external_mem != internal_mem) { + external_mem->copy_from(get_network().get_stream(), *internal_mem); + } + } + } else { + auto externalOutputPrim = _network.get_primitive(external_id.pid); + if (!externalOutputPrim->outputs_allocated() || shape_changed()) { + auto concat_layout = _impl_params->get_output_layout(external_id.idx); + auto concat_mem = _network.get_engine().allocate_memory(concat_layout, 0); + externalOutputPrim->set_output_memory(concat_mem, external_id.idx); + auto iter = std::find_if(concatenated_output_mem_mappings.begin(), + concatenated_output_mem_mappings.end(), + [&](std::shared_ptr &concat_output){ + return concat_output->concat_data_prim->id() == external_id.pid; + }); + if (iter != concatenated_output_mem_mappings.end()) { + (*iter)->update_concatenated_mem(concat_mem); + } + } + } + } + } + + for (size_t i = 0; i < concatenated_output_mem_mappings.size(); ++i) { + const auto& concat_output = concatenated_output_mem_mappings.at(i); + concat_output->restore_concatenated_mem(); + } +} + +void loop_inst::reset_memory() { + backedge_memory_mappings.clear(); + concatenated_input_mem_mappings.clear(); + for (auto concat_mem_map : concatenated_output_mem_mappings) { + concat_mem_map->reset_data_for_shape_changed(); + } +} + + +void loop_inst::update_output_layout() { + if (_node == nullptr) + return; + + auto memory_deps = _node->get_const_memory_deps(); + for (auto& i : _node->get_shape_infer_dependencies()) { + auto dep_id = _node->get_dependency(i).id(); + if (memory_deps.count(i) > 0 || i >= _node->get_dependencies().size()) { + continue; + } + + auto dep_mem = _network.get_output_memory(dep_id); + memory_deps.insert({i, dep_mem}); + } + _impl_params->memory_deps = memory_deps; + + auto new_layouts = _node->type()->calc_output_layouts(*_node, *_impl_params); + if (new_layouts.empty()) { + auto new_layout = _node->type()->calc_output_layout(*_node, *_impl_params); + new_layout.data_padding = padding::max(_node->get_primitive()->output_paddings[0], new_layout.data_padding); + _impl_params->output_layouts[0] = new_layout; + } else { + if (_impl_params->output_layouts.size() < new_layouts.size()) { + _impl_params->output_layouts.resize(new_layouts.size()); + } + for (size_t i = 0; i < new_layouts.size(); ++i) { + auto new_layout = new_layouts[i]; + new_layout.data_padding = padding::max(_node->get_primitive()->output_paddings[i], new_layout.data_padding); + _impl_params->output_layouts[i] = new_layout; + } + } +} } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index f72a38af85589f..dc9b2029ff408c 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -332,6 +332,8 @@ layout program_node::get_non_padded_output_layout(bool invalidate_users_if_chang bool program_node::set_output_layout(layout& new_layout, bool invalidate_users_if_changed, size_t idx) { merge_output_padding(new_layout.data_padding, idx); + OPENVINO_ASSERT(idx < output_layouts.size(), id(), " has invalid index : index is ", std::to_string(idx), + " but output_layouts length is ", std::to_string(output_layouts.size())); new_layout.data_padding = output_layouts[idx].data_padding; bool changed = (new_layout != output_layouts[idx]); if (changed && invalidate_users_if_changed) // output_layout has changed! 
invalidate users diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_ref.cl new file mode 100644 index 00000000000000..2715f90780071d --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_ref.cl @@ -0,0 +1,144 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/common.cl" + +#define NUM_CHANNELS_IN_GROUP (INPUT0_FEATURE_NUM / NUM_GROUPS) +#define CHANNEL_SIZE (INPUT0_BATCH_PITCH / INPUT0_FEATURE_NUM) +#define GROUP_SIZE (NUM_CHANNELS_IN_GROUP * CHANNEL_SIZE) + +#if MEAN_KERNEL_ENABLED || STANDARD_DEVIATION_KERNEL_ENABLED +inline void FUNC(kahan_summation)(INPUT0_TYPE elem, __private float* compensation, __private float* sum) { + if (isfinite(elem) && isfinite(*sum)) { + float temp = *sum + (elem - *compensation); + *compensation = (temp - *sum) - (elem - *compensation); + *sum = temp; + } else { + *sum += elem; + } +} +#endif + +#if MEAN_KERNEL_ENABLED + +KERNEL (calc_mean_ref)( __global INPUT0_TYPE* input + , __global float* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const int batch = get_global_id(0); + if (batch >= INPUT0_BATCH_NUM) + return; + const int group = get_global_id(1); + const int feature_begin = group * NUM_CHANNELS_IN_GROUP; + const int feature_end = group * NUM_CHANNELS_IN_GROUP + NUM_CHANNELS_IN_GROUP; + float error = 0.f, mean_value = 0.f; + for (int feature = feature_begin; feature < feature_end; feature++) + { + if (feature >= INPUT0_FEATURE_NUM) + continue; +#if OUTPUT_DIMS > 4 + for (int z = 0; z < INPUT0_SIZE_Z; z++) +#endif + for (int y = 0; y < INPUT0_SIZE_Y; y++) + for (int x = 0; x < INPUT0_SIZE_X; x++) + { +#if OUTPUT_DIMS == 5 + size_t input_idx = INPUT0_GET_INDEX(batch, feature, z, y, x); +#elif OUTPUT_DIMS == 4 + size_t input_idx = INPUT0_GET_INDEX(batch, feature, y, x); +#endif + FUNC_CALL(kahan_summation)(input[input_idx], &error, &mean_value); + } + } + mean_value /= GROUP_SIZE; + output[batch * NUM_GROUPS + group] = mean_value; +} + +#elif STANDARD_DEVIATION_KERNEL_ENABLED + +KERNEL (calc_standard_deviation_ref)( __global INPUT0_TYPE* input + , __global float* mean + , __global float* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const int batch = get_global_id(0); + if (batch >= INPUT0_BATCH_NUM) + return; + const int group = get_global_id(1); + const int output_idx = batch * NUM_GROUPS + group; + const int feature_begin = group * NUM_CHANNELS_IN_GROUP; + const int feature_end = group * NUM_CHANNELS_IN_GROUP + NUM_CHANNELS_IN_GROUP; + float variance = 0.f, error = 0.f; + + for (int feature = feature_begin; feature < feature_end; feature++) + { + if (feature >= INPUT0_FEATURE_NUM) + continue; +#if OUTPUT_DIMS > 4 + for (int z = 0; z < INPUT0_SIZE_Z; z++) +#endif + for (int y = 0; y < INPUT0_SIZE_Y; y++) + for (int x = 0; x < INPUT0_SIZE_X; x++) + { +#if OUTPUT_DIMS == 5 + size_t input_idx = INPUT0_GET_INDEX(batch, feature, z, y, x); +#elif OUTPUT_DIMS == 4 + size_t input_idx = INPUT0_GET_INDEX(batch, feature, y, x); +#endif + FUNC_CALL(kahan_summation)(pow(input[input_idx] - mean[output_idx], 2), &error, &variance); + } + } + variance /= GROUP_SIZE; + float standard_deviation = sqrt(variance + EPSILON); + output[output_idx] = standard_deviation; +} +#elif NORMALIZE_KERNEL_ENABLED +KERNEL (normalize_ref)( __global INPUT0_TYPE* input + , __global INPUT0_TYPE*
scale_values + , __global INPUT0_TYPE* bias_values + , __global float* mean_values + , __global float* standard_deviation_values + , __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const int batch = get_global_id(0); +#if OUTPUT_DIMS == 4 + const int feature = get_global_id(1); +#elif OUTPUT_DIMS == 5 + const int feature = get_global_id(1) / OUTPUT_SIZE_Z; + const int z = get_global_id(1) % OUTPUT_SIZE_Z; +#endif + const int y = get_global_id(2) / OUTPUT_SIZE_X; + const int x = get_global_id(2) % OUTPUT_SIZE_X; + const int group = feature / NUM_CHANNELS_IN_GROUP; + float mean = mean_values[batch * NUM_GROUPS + group]; + float standard_deviation = standard_deviation_values[batch * NUM_GROUPS + group]; +#if OUTPUT_DIMS == 4 + size_t output_idx = OUTPUT_GET_INDEX(batch, feature, y, x); +#elif OUTPUT_DIMS == 5 + size_t output_idx = OUTPUT_GET_INDEX(batch, feature, z, y, x); +#endif + OUTPUT_TYPE res = ((input[output_idx] - mean) / standard_deviation) * scale_values[feature] + bias_values[feature]; +#if HAS_FUSED_OPS + FUSED_OPS; + output[output_idx] = FUSED_OPS_RESULT; +#else + output[output_idx] = ACTIVATION(res, ACTIVATION_PARAMS); +#endif +} + +#endif + +#undef NUM_CHANNELS_IN_GROUP +#undef CHANNEL_SIZE +#undef GROUP_SIZE diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index 148b6c10e39183..7706da6003fe74 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -32,6 +32,7 @@ enum class KernelType { RESHAPE, COUNT_NONZERO, GATHER_NONZERO, + GROUP_NORMALIZATION, PERMUTE, CONCATENATION, RESAMPLE, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp new file mode 100644 index 00000000000000..a6dd21c43fe4e3 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "group_normalization_kernel_ref.h" +#include + +namespace kernel_selector { + +ParamsKey GroupNormalizationKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; +} + +static std::size_t InternalBufferSize(const group_normalization_params ¶ms) { + const auto& output = params.outputs[0]; + return output.Batch().v * params.num_groups * sizeof(float); +} + +static GroupNormalizationKernelRef::KernelId operator++(GroupNormalizationKernelRef::KernelId& id) { + id = static_cast(static_cast(id) + 1); + return id; +} + +GroupNormalizationKernelRef::DispatchData GroupNormalizationKernelRef::SetDefault( + KernelId id, const group_normalization_params ¶ms) const { + DispatchData dispatch_data; + auto& output = params.outputs[0]; + switch (id) { + case eCalcMeanKernel: + case eCalcStandardDeviationKernel: { + auto maxWorkGroupSize = params.engineInfo.maxWorkGroupSize; + dispatch_data.gws = std::vector{ + output.Batch().v, + static_cast(params.num_groups), + 1 + }; + 
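(Editor's aside: the lws assignment that follows caps the local size so that lws[0] * lws[1] stays within the device's maximum work-group size. A standalone sketch of that clamping rule with plain integers in place of the kernel_selector structures; it assumes maxWorkGroupSize is divisible by num_groups, which holds for the power-of-two limits GPUs typically report.)

```cpp
#include <cstddef>
#include <cstdio>

// Mirror of the clamping in SetDefault(): shrink the batch dimension of the
// local size when batch * num_groups would exceed the device limit.
static void pick_lws(size_t batch, size_t num_groups, size_t max_wg, size_t lws[2]) {
    lws[0] = (batch * num_groups > max_wg) ? max_wg / num_groups : batch;
    lws[1] = num_groups;
}

int main() {
    size_t lws[2];
    pick_lws(/*batch=*/8,  /*num_groups=*/16, /*max_wg=*/256, lws);
    std::printf("lws = {%zu, %zu}\n", lws[0], lws[1]);  // {8, 16}: fits as-is
    pick_lws(/*batch=*/64, /*num_groups=*/16, /*max_wg=*/256, lws);
    std::printf("lws = {%zu, %zu}\n", lws[0], lws[1]);  // {16, 16}: clamped to 256/16
    return 0;
}
```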
dispatch_data.lws = std::vector{ + output.Batch().v * params.num_groups > maxWorkGroupSize ? maxWorkGroupSize / params.num_groups : output.Batch().v, + static_cast(params.num_groups), + 1}; + break; + } + case eNormalize: { + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = output.GetLayout(); + std::vector> dims_by_gws = { + { Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::Z }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }}; + dispatch_data.gws = std::vector{ + output.Batch().v, + output.Feature().v * output.Z().v, + output.X().v * output.Y().v}; + dispatch_data.lws = GetOptimalLocalWorkGroupSizes(dispatch_data.gws, params.engineInfo, + in_layout, out_layout, dims_by_gws); + break; + } + default: + assert(false); + break; + } + return dispatch_data; +} + +JitConstants GroupNormalizationKernelRef::GetJitConstants(KernelId kernelId, + const group_normalization_params ¶ms) const { + auto jit = MakeBaseParamsJitConstants(params); + jit.AddConstant(MakeJitConstant("EPSILON", static_cast(params.epsilon))); + jit.AddConstant(MakeJitConstant("NUM_GROUPS", params.num_groups)); + switch (kernelId) { + case eCalcMeanKernel: + jit.AddConstant(MakeJitConstant("MEAN_KERNEL_ENABLED", true)); + break; + case eCalcStandardDeviationKernel: + jit.AddConstant(MakeJitConstant("STANDARD_DEVIATION_KERNEL_ENABLED", true)); + break; + case eNormalize: { + jit.AddConstant(MakeJitConstant("NORMALIZE_KERNEL_ENABLED", true)); + jit.AddConstant(MakeJitConstant("INPUT_INDICES_ORDER", "batch, feature, z, y, x")); + if (!params.fused_ops.empty()) { + FusedOpsConfiguration conf{ + "", + params.outputs[0].Dimentions() == 5 ? std::vector{"batch", "feature", "z", "y", "x"} : + std::vector{"batch", "feature", "y", "x"}, + "res", + params.outputs[0].GetDType() + }; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + break; + } + default: + assert(false); + break; + } + return jit; +} + +void GroupNormalizationKernelRef::SetKernelArguments(const group_normalization_params& params, + KernelId kernelId, + cldnn::arguments_desc& arguments, + std::vector& internalBufferSizes) { + switch (kernelId) { + case eCalcMeanKernel: { + arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + internalBufferSizes.push_back(InternalBufferSize(params)); + break; + } + case eCalcStandardDeviationKernel: { + arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + internalBufferSizes.push_back(InternalBufferSize(params)); + break; + } + case eNormalize: { + arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); + arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); + arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0}); + break; + } + default: + assert(false); + break; + } +} + +KernelsData GroupNormalizationKernelRef::GetKernelsData(const Params ¶ms, const optional_params &options) const { + const group_normalization_params& parameters = static_cast(params); + KernelData kd = KernelData::Default(params, eKernelsNum); + kd.internalBufferDataType = Datatype::F32; + for (KernelId id = eCalcMeanKernel; id < 
eKernelsNum; ++id) { + auto& kernel = kd.kernels[id]; + const auto entryPoint = GetEntryPoint(kernelName, parameters.layerID, params, options, id); + auto jitConstants = GetJitConstants(id, parameters); + const auto jit = CreateJit(kernelName, jitConstants, entryPoint); + const auto dispatchData = SetDefault(id, parameters); + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entryPoint, + "", + false, + false, + 0, + 0, + 0); + SetKernelArguments(parameters, id, kernel.params.arguments, kd.internalBufferSizes); + } + return {kd}; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h new file mode 100644 index 00000000000000..0737c4d45089bf --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h @@ -0,0 +1,60 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// GroupNormalizationParams +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct group_normalization_params : public base_params { + group_normalization_params() : base_params(KernelType::GROUP_NORMALIZATION) {} + + std::int64_t num_groups{}; + double epsilon{}; + + ParamsKey GetParamsKey() const override { + return base_params::GetParamsKey(); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// group_normalization_optional_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct group_normalization_optional_params : optional_params { + group_normalization_optional_params() : optional_params(KernelType::GROUP_NORMALIZATION) {} +}; + +class GroupNormalizationKernelRef : public KernelBaseOpenCL { +public: + using DispatchData = CommonDispatchData; + enum KernelId { + eCalcMeanKernel, + eCalcStandardDeviationKernel, + eNormalize, + eKernelsNum + }; + + GroupNormalizationKernelRef() : KernelBaseOpenCL{"group_normalization_gpu_ref"} {} + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + std::vector GetSupportedFusedOps() const override { + return { + FusedOpType::ACTIVATION, + FusedOpType::QUANTIZE, + FusedOpType::ELTWISE + }; + } + +protected: + DispatchData SetDefault(KernelId id, const group_normalization_params& params) const; + JitConstants GetJitConstants(KernelId kernelId, const group_normalization_params& params) const; + static void SetKernelArguments(const group_normalization_params& params, + KernelId kernelId, + cldnn::arguments_desc& arguments, + std::vector& internalBufferSizes); +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp new file mode 100644 index 00000000000000..40a5044c5216b5 --- /dev/null +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "group_normalization_kernel_selector.h" +#include "group_normalization_kernel_ref.h" + +namespace kernel_selector { + +group_normalization_kernel_selector::group_normalization_kernel_selector() { + Attach(); +} + +KernelsData group_normalization_kernel_selector::GetBestKernels(const Params ¶ms, + const optional_params &options) const { + return GetNaiveBestKernel(params, options, KernelType::GROUP_NORMALIZATION); +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.h new file mode 100644 index 00000000000000..8e8579e61de41c --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.h @@ -0,0 +1,19 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "kernel_selector.h" + +namespace kernel_selector { +class group_normalization_kernel_selector : public kernel_selector_base { +public: + static group_normalization_kernel_selector& Instance() { + static group_normalization_kernel_selector instance_; + return instance_; + } + + group_normalization_kernel_selector(); + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp index d9b4e77314e600..c25726f673a2f8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/condition.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/condition.cpp @@ -22,6 +22,12 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_ << ", num inputs: " << op->get_input_size() << std::endl; auto config = p.get_config(); + { + auto custom_outputs = config.get_property(ov::intel_gpu::custom_outputs); + if (!custom_outputs.empty()) { + config.set_property(ov::intel_gpu::custom_outputs(std::vector({}))); + } + } config.set_property(ov::intel_gpu::max_dynamic_batch(1)); config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic())); @@ -61,10 +67,13 @@ static void CreateIfOp(ProgramBuilder& p, const std::shared_ptr& auto branch_true = gen_branch(p, op, idx_true); auto branch_false = gen_branch(p, op, idx_false); + const size_t num_outputs = op->get_output_size(); + const cldnn::condition conditionPrimitive(layerName, inputs, branch_true, - branch_false); + branch_false, + num_outputs); p.add_primitive(*op, conditionPrimitive); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp index b74d05e4f9aca6..b12536b10ccb9a 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp @@ -19,6 +19,8 @@ #include "openvino/op/roi_align.hpp" #include "openvino/op/variadic_split.hpp" #include "openvino/op/util/op_types.hpp" +#include "openvino/op/loop.hpp" +#include "openvino/op/tensor_iterator.hpp" #include "intel_gpu/primitives/data.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" @@ -206,6 +208,13 @@ static void CreateConstantOp(ProgramBuilder& p, const std::shared_ptr(outOp) || 
ov::is_type(outOp)) { consts[op].needsBatchInterpretation = constDims.size() == 1; + } else if ((ov::is_type(outOp) || ov::is_type(outOp))) { + // When the inner network has a 1d parameter which is connected to the outer loop's constant 1d data, + // the outer constant 1d data and the inner 1d parameter have the same bytes_count but different layouts + // (the outer constant is [1, N, 1, 1] but the inner parameter is [N, 1, 1, 1]). + // To pass check_memory_to_set in input_layout::set_data for this case, set constDims to [N, 1, 1, 1] + // when constDims has one dimension and the user op is Loop or TensorIterator. + consts[op].needsBatchInterpretation = constDims.size() == 1; } } diff --git a/src/plugins/intel_gpu/src/plugin/ops/group_normalization.cpp b/src/plugins/intel_gpu/src/plugin/ops/group_normalization.cpp new file mode 100644 index 00000000000000..8c17d111331a67 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/ops/group_normalization.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/group_normalization.hpp" +#include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/primitives/group_normalization.hpp" + +namespace ov { +namespace intel_gpu { + +static void CreateGroupNormalizationOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {3}); + auto inputs = p.GetInputInfo(op); + auto layerName = layer_type_name_ID(op); + cldnn::group_normalization groupNormalizationPrimitive { + layerName, + inputs[0], + inputs[1], + inputs[2], + op->get_num_groups(), + op->get_epsilon() + }; + p.add_primitive(*op, groupNormalizationPrimitive); +} + +REGISTER_FACTORY_IMPL(v12, GroupNormalization); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp index f44dddb26ba0e6..628b0d7c37d9aa 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/loop.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/loop.cpp @@ -5,6 +5,7 @@ #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/plugin.hpp" +#include "openvino/op/tensor_iterator.hpp" #include "openvino/op/loop.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/util/sub_graph_base.hpp" @@ -20,13 +21,14 @@ #include using Loop = ov::op::v5::Loop; +using TensorIterator = ov::op::v0::TensorIterator; namespace ov { namespace intel_gpu { template -static DATA_TYPE CreateScalarData(ProgramBuilder &p, const cldnn::primitive_id& id, int64_t num) { - auto mem = p.get_engine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } }); +static DATA_TYPE CreateScalarData(ProgramBuilder &p, const cldnn::primitive_id& id, ov::Shape& shape, cldnn::data_types dtype, int64_t num) { + auto mem = p.get_engine().allocate_memory({ shape, dtype, cldnn::format::bfyx }); cldnn::mem_lock ptr{mem, p.get_engine().get_service_stream()}; *ptr.begin() = num; return {id, mem}; @@ -40,52 +42,37 @@ static cldnn::mutable_data CreateAdditionalOutputData(ProgramBuilder &p, const s const auto tensor = tensor_from_dims(op->get_output_shape(output_idx)); cldnn::layout output_layout = cldnn::layout(precision, format, tensor); auto mem = p.get_engine().allocate_memory(output_layout); - auto md = cldnn::mutable_data(id, {cldnn::input_info(input)}, mem); // cldnn::data cannot set dependency + auto md = cldnn::mutable_data(id, {cldnn::input_info(input)}, std::move(mem)); // cldnn::data cannot set dependency return md; } -static void CreateLoopOp(ProgramBuilder& p, const
std::shared_ptr& op) { +static void SetLoopInputOutputMap(ProgramBuilder& p, + const std::shared_ptr& op, + cldnn::primitive::input_info_arr& inputs, + std::vector& input_primitive_maps, + std::vector& output_primitive_maps, + std::vector& back_edges_maps) { const std::string layerName = layer_type_name_ID(op); - auto inputs = p.GetInputInfo(op); const auto& loop_input_descs = op->get_input_descriptions(); const auto& loop_output_descs = op->get_output_descriptions(); const auto& body_inputs = op->get_function()->get_parameters(); const auto& body_outputs = op->get_function()->get_results(); - // Set special body ports: current_iteration input , execution condition output - auto special_body_ports = op->get_special_body_ports(); - - std::string body_current_iteration_id; - if (special_body_ports.current_iteration_input_idx >= 0) { - auto current_iteration_input = body_inputs.at(special_body_ports.current_iteration_input_idx); - body_current_iteration_id = layer_type_name_ID(current_iteration_input); - std::string input_name = ov::op::util::create_ie_output_name(current_iteration_input); - } - - cldnn::primitive_id body_execution_condition_id; - if (special_body_ports.body_condition_output_idx >= 0) { - auto body_condition_output = body_outputs.at(special_body_ports.body_condition_output_idx)->get_input_node_shared_ptr(0); - body_execution_condition_id = layer_type_name_ID(body_condition_output); - } - - // get body topology from ov::Model - ProgramBuilder body_program(op->get_function(), p.get_engine(), p.get_config(), true); - auto body_topology = *body_program.get_topology(); - - // setup input_primitive_maps/ output_primitive_maps and back_edges - std::vector input_primitive_maps; - std::vector output_primitive_maps; - std::vector back_edges; + bool use_new_shape_infer = p.use_new_shape_infer(); // set input mapping & back edges for (const auto& loop_input_desc : loop_input_descs) { - const cldnn::primitive_id& external_id = inputs.at(loop_input_desc->m_input_index).pid; + auto external_id = inputs.at(loop_input_desc->m_input_index); auto& body_input = body_inputs.at(loop_input_desc->m_body_parameter_index); cldnn::primitive_id internal_id = layer_type_name_ID(body_input); + GPU_DEBUG_LOG << "loop_input_descs[" << layerName << "] = {m_input_index:" << loop_input_desc->m_input_index << "(external_id: " + << external_id << "), m_body_parameter_index:" << loop_input_desc->m_body_parameter_index + << "(internal_id: " << internal_id << ")}" << std::endl; + // set input mapping if (const auto& sliceInfo = - std::dynamic_pointer_cast(loop_input_desc)) { + std::dynamic_pointer_cast(loop_input_desc)) { // sliced input input_primitive_maps.emplace_back(external_id, internal_id, sliceInfo->m_axis, sliceInfo->m_start, sliceInfo->m_end, sliceInfo->m_stride); @@ -96,7 +83,7 @@ static void CreateLoopOp(ProgramBuilder& p, const std::shared_ptr& op) { // set back edges if (const auto& mergedInput = - std::dynamic_pointer_cast(loop_input_desc)) { + std::dynamic_pointer_cast(loop_input_desc)) { // backedge const auto& to = body_inputs.at(mergedInput->m_body_parameter_index); const auto& from = body_outputs.at(mergedInput->m_body_value_index); @@ -104,81 +91,234 @@ static void CreateLoopOp(ProgramBuilder& p, const std::shared_ptr& op) { cldnn::primitive_id to_id = layer_type_name_ID(to); cldnn::primitive_id from_id = layer_type_name_ID(from); - // reset output data type because the data types of the outputs of the - // body topology are always FP32 regardless of element type - { - const auto from_prim = 
body_topology.at(from_id); - const auto to_cldnn_type = cldnn::element_type_to_data_type(to->get_element_type()); - from_prim->output_data_types = {to_cldnn_type}; - } - back_edges.emplace_back(from_id, to_id); + back_edges_maps.emplace_back(from_id, to_id); } } - // set trip count, initial execution condition, num iteration primitives - // they should be mutable_data to prevent from being optimized out - const cldnn::primitive_id trip_count_id = layer_type_name_ID(op->get_input_node_shared_ptr(0)); - const cldnn::primitive_id execution_condition_id = layer_type_name_ID(op->get_input_node_shared_ptr(1)); - const int64_t num_iterations = op->get_num_iterations(); - if (num_iterations < 0) { - OPENVINO_THROW("loop's num_iteration cannot be negative"); + // set output mapping + if (use_new_shape_infer) { + for (const auto& loop_output_desc : loop_output_descs) { + cldnn::input_info external_input_info(layerName, loop_output_desc->m_output_index); + p.primitive_ids[layerName] = layerName; + + const auto& body_output = body_outputs.at(loop_output_desc->m_body_value_index); + cldnn::primitive_id internal_id = layer_type_name_ID(body_output); + + // update primitive_map + if (const auto& concatOutput = + std::dynamic_pointer_cast(loop_output_desc)) { + // output which requires concatenation + output_primitive_maps.emplace_back(external_input_info, internal_id, concatOutput->m_axis, + concatOutput->m_start, concatOutput->m_end, concatOutput->m_stride); + GPU_DEBUG_LOG << "loop_output_descs[" << layerName << "][ConcatOutputDescription] external:" + << external_input_info << ", internal:" + << internal_id << "(axis, start, end, stride)={" + << concatOutput->m_axis << "," << concatOutput->m_start << "," + << concatOutput->m_end << "," << concatOutput->m_stride << "}" << std::endl; + } + if (std::dynamic_pointer_cast(loop_output_desc)) { + // output which requires no concatenation + output_primitive_maps.emplace_back(external_input_info, internal_id); + GPU_DEBUG_LOG << "loop_output_descs[" << layerName << "][BodyOutputDescription] external:" + << external_input_info << ", internal:" << internal_id << std::endl; + } + } + } else { + for (const auto& loop_output_desc : loop_output_descs) { + const uint64_t output_idx = loop_output_desc->m_output_index; + + // Add additional mutable_data for multiple outputs + // primitive ID should be . 
if output_idx > 0 + // otherwise primitive ID should be equals to TI primitive ID + const std::string layerNameWithIndex = layerName + ".out" + std::to_string(output_idx); + std::string external_id; + if (output_idx > 0) { + cldnn::mutable_data output_data = CreateAdditionalOutputData(p, op, layerNameWithIndex, layerName, output_idx); + p.add_primitive(*op, std::move(output_data)); + external_id = layerNameWithIndex; + } else { + p.primitive_ids[layerNameWithIndex] = layerName; + p.primitive_ids[layerName] = layerName; + external_id = layerName; + } + const auto& body_output = body_outputs.at(loop_output_desc->m_body_value_index); + cldnn::primitive_id internal_id = layer_type_name_ID(body_output); + + // update primitive_map + if (const auto& concatOutput = + std::dynamic_pointer_cast(loop_output_desc)) { + // output which requires concatenation + output_primitive_maps.emplace_back(external_id, internal_id, concatOutput->m_axis, + concatOutput->m_start, concatOutput->m_end, concatOutput->m_stride); + GPU_DEBUG_LOG << "loop_output_descs[" << layerName << "][ConcatOutputDescription] external:" + << external_id << ", internal:" + << internal_id << "(axis, start, end, stride)={" + << concatOutput->m_axis << "," << concatOutput->m_start << "," + << concatOutput->m_end << "," << concatOutput->m_stride << "}" << std::endl; + } + if (std::dynamic_pointer_cast(loop_output_desc)) { + // output which requires no concatenation + output_primitive_maps.emplace_back(external_id, internal_id); + GPU_DEBUG_LOG << "loop_output_descs[" << layerName << "][BodyOutputDescription] external:" + << external_id << ", internal:" << internal_id << std::endl; + } + } } - const cldnn::primitive_id num_iteration_id = layerName + "_numIteration"; - { - cldnn::mutable_data num_iteration = CreateScalarData(p, num_iteration_id, 0); - p.add_primitive(*op, std::move(num_iteration)); +} + +static std::vector GetOutputNames(const cldnn::primitive_id id, + const cldnn::primitive_id body_execution_condition_id, + const std::vector& output_primitive_maps, + const std::vector& back_edges) { + std::vector output_names; + OPENVINO_ASSERT(!output_primitive_maps.empty(), "[GPU] Output primitive map should have at least 1 mapping in primitive ", id); + for (auto out_map : output_primitive_maps) { + output_names.push_back(out_map.internal_id.pid); } - // set output mapping - for (const auto& loop_output_desc : loop_output_descs) { - const uint64_t output_idx = loop_output_desc->m_output_index; - - // Add additional mutable_data for multiple outputs - // primitive ID should be . 
-        // otherwise primitive ID should be equals to TI primitive ID
-        const std::string layerNameWithIndex = layerName + ".out" + std::to_string(output_idx);
-        std::string external_id;
-        if (output_idx > 0) {
-            cldnn::mutable_data output_data = CreateAdditionalOutputData(p, op, layerNameWithIndex, layerName, output_idx);
-            p.add_primitive(*op, std::move(output_data));
-            external_id = layerNameWithIndex;
-        } else {
-            external_id = layerName;
+    // setup outputs for backedges
+    for (auto& back_edge : back_edges) {
+        auto iter = std::find(output_names.begin(), output_names.end(), back_edge.from);
+        // Do not add duplicated output name
+        if (iter == output_names.end()) {
+            output_names.push_back(back_edge.from);
         }
-        const auto& body_output = body_outputs.at(loop_output_desc->m_body_value_index);
-        cldnn::primitive_id internal_id = layer_type_name_ID(body_output);
-
-        // update primitive_map
-        if (const auto& concatOutput =
-                std::dynamic_pointer_cast<ov::op::util::SubGraphOp::ConcatOutputDescription>(loop_output_desc)) {
-            // output which requires concatenation
-            output_primitive_maps.emplace_back(external_id, internal_id, concatOutput->m_axis,
-                                               concatOutput->m_start, concatOutput->m_end, concatOutput->m_stride);
+    }
+
+    // if execution_condition_id is specified, we need to add the id in build_option::outputs
+    if (!body_execution_condition_id.empty()) {
+        output_names.push_back(body_execution_condition_id);
+    }
+
+    return output_names;
+}
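As an aside, the collection order produced by GetOutputNames is easiest to see on plain strings: mapped body outputs first, then back-edge sources that are not already listed, then the body execution condition. A minimal, self-contained sketch (the ids are hypothetical, not from this patch):

```cpp
// Illustrative only: mirrors GetOutputNames' collection order on plain strings.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> output_names = {"body_eltwise"};           // internal ids from output_primitive_maps
    const std::vector<std::string> backedge_sources = {"body_eltwise"}; // back_edge.from ids
    const std::string body_execution_condition_id = "body_less";

    for (const auto& from : backedge_sources) {
        // a back-edge source that is already exposed as an output is not duplicated
        if (std::find(output_names.begin(), output_names.end(), from) == output_names.end())
            output_names.push_back(from);
    }
    if (!body_execution_condition_id.empty())
        output_names.push_back(body_execution_condition_id);

    for (const auto& name : output_names)
        std::cout << name << '\n';  // prints: body_eltwise, body_less
}
```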
+
+static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::util::SubGraphOp>& op, bool is_loop_op) {
+    const std::string layerName = layer_type_name_ID(op);
+    auto inputs = p.GetInputInfo(op);
+    bool is_dynamic = p.use_new_shape_infer() || op->is_dynamic();
+
+    int64_t num_iterations = op->get_num_iterations();
+    OPENVINO_ASSERT((is_dynamic || num_iterations > 0), "loop's num_iteration should be positive on static shape model");
+
+    auto num_outputs = is_dynamic ? op->get_output_size() : 1;
+    auto ov_model = op->get_function();
+
+    // Set special body ports: current_iteration input, execution condition output
+    cldnn::primitive_id body_current_iteration_id;
+    cldnn::primitive_id body_execution_condition_id;
+    cldnn::primitive_id trip_count_id;
+    cldnn::primitive_id first_execution_condition_id;
+    cldnn::primitive_id updated_current_iteration_id;
+
+    std::shared_ptr<ov::op::v0::Parameter> current_iteration_input_op;
+    if (is_loop_op) {
+        auto loop_op = std::dynamic_pointer_cast<ov::op::v5::Loop>(op);
+        auto special_body_ports = loop_op->get_special_body_ports();
+        if (special_body_ports.current_iteration_input_idx >= 0) {
+            const auto& body_inputs = loop_op->get_function()->get_parameters();
+            current_iteration_input_op = body_inputs.at(special_body_ports.current_iteration_input_idx);
+            body_current_iteration_id = layer_type_name_ID(current_iteration_input_op);
        }
-        if (std::dynamic_pointer_cast<ov::op::util::SubGraphOp::BodyOutputDescription>(loop_output_desc)) {
-            // output which requires no concatenation
-            output_primitive_maps.emplace_back(external_id, internal_id);
+
+        if (special_body_ports.body_condition_output_idx >= 0) {
+            const auto& body_outputs = loop_op->get_function()->get_results();
+            auto body_condition_output = body_outputs.at(special_body_ports.body_condition_output_idx)->get_input_node_shared_ptr(0);
+            body_execution_condition_id = layer_type_name_ID(body_condition_output);
        }
+
+        trip_count_id = layer_type_name_ID(loop_op->get_input_node_shared_ptr(0));
+        first_execution_condition_id = layer_type_name_ID(loop_op->get_input_node_shared_ptr(1));
+    }
+
+    // setup input_primitive_maps / output_primitive_maps and back_edges
+    std::vector<cldnn::loop::io_primitive_map> input_primitive_maps;
+    std::vector<cldnn::loop::io_primitive_map> output_primitive_maps;
+    std::vector<cldnn::loop::backedge_mapping> back_edges;
+
+    SetLoopInputOutputMap(p, op, inputs, input_primitive_maps, output_primitive_maps, back_edges);
+
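The maps populated by SetLoopInputOutputMap are plain descriptors connecting outer-network and body primitive ids. A hedged sketch of typical contents, using hypothetical ids but the same constructors the unit tests further down rely on:

```cpp
// Sketch with hypothetical ids; the trailing integer is the iteration axis,
// matching the Slice/Concat descriptions of ov::op::util::SubGraphOp.
#include "intel_gpu/primitives/loop.hpp"

std::vector<cldnn::loop::io_primitive_map> input_maps {
    cldnn::loop::io_primitive_map("outer_input", "body_param", 2),   // sliced per iteration along axis 2
    cldnn::loop::io_primitive_map("outer_state_init", "body_state")  // passed through unchanged
};
std::vector<cldnn::loop::io_primitive_map> output_maps {
    cldnn::loop::io_primitive_map("loop", "body_result", 2)          // per-iteration results concatenated on axis 2
};
std::vector<cldnn::loop::backedge_mapping> back_edges {
    cldnn::loop::backedge_mapping("body_result", "body_state")       // this iteration's result feeds the next one
};
```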
+    auto shape = is_dynamic ? ngraph::Shape{1} : ngraph::Shape{1, 1, 1, 1};
+    auto prec = ngraph::element::i64;
+    if (current_iteration_input_op) {
+        current_iteration_input_op->set_output_type(0, prec, shape);
+        current_iteration_input_op->set_partial_shape(shape);
+        current_iteration_input_op->set_element_type(prec);
+
+        auto increment_value_id = current_iteration_input_op->get_friendly_name() + "_inc";
+        auto increment_value_op = std::make_shared<ov::op::v0::Constant>(prec, shape, 1);
+        increment_value_op->set_friendly_name(increment_value_id);
+
+        auto update_current_iter_op_id = current_iteration_input_op->get_friendly_name() + "_update";
+        auto update_current_iter_op = std::make_shared<ov::op::v1::Add>(current_iteration_input_op, increment_value_op);
+        update_current_iter_op->set_friendly_name(update_current_iter_op_id);
+        updated_current_iteration_id = layer_type_name_ID(update_current_iter_op);
+
+        auto result = std::make_shared<ov::op::v0::Result>(update_current_iter_op);
+        ov_model->add_results({result});
+    }
+
+    // set trip count, num iteration primitives
+    // they should be mutable_data to prevent from being optimized out
+    const cldnn::primitive_id num_iteration_id = layerName + "_numIteration";
+    cldnn::mutable_data num_iteration_data = CreateScalarData<cldnn::mutable_data>(p, num_iteration_id, shape, prec, 0);
+
+    p.add_primitive(*op, std::move(num_iteration_data));
+    inputs.insert(inputs.begin(), cldnn::input_info(num_iteration_id, 0));
+
+    if (!body_current_iteration_id.empty()) {
+        // update input_primitive_maps and back_edges for current_iteration nodes
+        input_primitive_maps.emplace_back(cldnn::input_info(num_iteration_id), cldnn::input_info(body_current_iteration_id));
+        back_edges.emplace_back(updated_current_iteration_id, body_current_iteration_id);
+    }
+    auto output_names_vec = GetOutputNames(layerName, body_execution_condition_id, output_primitive_maps, back_edges);
+
+    auto config = p.get_config();
+    config.set_property(ov::intel_gpu::custom_outputs(output_names_vec));
+    config.set_property(ov::intel_gpu::max_dynamic_batch(1));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
+
+    // get body program from ov::Model
+    ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), true);
+    auto body_program = prog.get_compiled_program();
+
+    GPU_DEBUG_LOG << "* trip_count_id                : " << trip_count_id << std::endl;
+    GPU_DEBUG_LOG << "* num_iteration_id             : " << num_iteration_id << std::endl;
+    GPU_DEBUG_LOG << "* body_current_iteration_id    : " << body_current_iteration_id << std::endl;
+    GPU_DEBUG_LOG << "* first_execution_condition_id : " << first_execution_condition_id << std::endl;
+    GPU_DEBUG_LOG << "* body_execution_condition_id  : " << body_execution_condition_id << std::endl;
+
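The current-iteration plumbing above appends a small i_next = i + 1 subgraph to the body model and exposes it as an extra Result, so the back edge can feed the incremented counter into the next iteration. A standalone sketch of the equivalent construction (illustrative, with hypothetical names, not the plugin's exact code):

```cpp
#include "openvino/core/model.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/result.hpp"

// Builds a body fragment computing i_next = i + 1 and exposing it as a Result,
// mirroring the increment/update nodes created above.
std::shared_ptr<ov::Model> make_counter_body() {
    auto i = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
    auto one = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, 1);
    auto i_next = std::make_shared<ov::op::v1::Add>(i, one);
    auto result = std::make_shared<ov::op::v0::Result>(i_next);
    return std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{i});
}
```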
     const cldnn::loop loopPrimitive(
-        layerName,              /* layer name of this primitive (output id) */
-        inputs,                 /* inputs of this layer */
-        body_topology,          /* body network */
-        trip_count_id,          /* trip_count data in outer network, always same as num_iterations in TI */
-        execution_condition_id, /* initial_execution_condition data in outer network, always true in TI */
-        num_iteration_id,       /* actual number of iteration data in body network */
-        input_primitive_maps,   /* input mappings connecting outer network and inner network */
-        output_primitive_maps,  /* output mappings connecting outer network and inner network */
-        back_edges,             /* back edge mapping */
-        num_iterations,         /* max iteration, i.e. length of iteration axis */
+        layerName,                    /* layer name of this primitive (output id) */
+        inputs,                       /* inputs of this layer */
+        body_program,                 /* body network */
+        trip_count_id,                /* trip_count data in outer network, always same as num_iterations in TI */
+        first_execution_condition_id, /* initial_execution_condition data in outer network, always true in TI */
+        num_iteration_id,             /* actual number of iteration data in body network */
+        input_primitive_maps,         /* input mappings connecting outer network and inner network */
+        output_primitive_maps,        /* output mappings connecting outer network and inner network */
+        back_edges,                   /* back edge mapping */
+        num_iterations,               /* max iteration, i.e. length of iteration axis */
         body_current_iteration_id,
-        body_execution_condition_id);
+        body_execution_condition_id,
+        num_outputs);

     p.add_primitive(*op, loopPrimitive);
 }

+static void CreateLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v5::Loop>& op) {
+    CreateCommonLoopOp(p, op, true);
+}
+
+static void CreateTensorIteratorOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::TensorIterator>& op) {
+    CreateCommonLoopOp(p, op, false);
+}
+
 REGISTER_FACTORY_IMPL(v5, Loop);
+REGISTER_FACTORY_IMPL(v0, TensorIterator);

 }  // namespace intel_gpu
 }  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/ops/tensor_iterator.cpp b/src/plugins/intel_gpu/src/plugin/ops/tensor_iterator.cpp
deleted file mode 100644
index 21c7d3a8167a91..00000000000000
--- a/src/plugins/intel_gpu/src/plugin/ops/tensor_iterator.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (C) 2018-2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "intel_gpu/plugin/program_builder.hpp"
-#include "intel_gpu/plugin/common_utils.hpp"
-#include "intel_gpu/plugin/plugin.hpp"
-
-#include
-
-#include "openvino/op/tensor_iterator.hpp"
-#include "openvino/op/constant.hpp"
-#include "openvino/op/util/sub_graph_base.hpp"
-
-#include "intel_gpu/primitives/loop.hpp"
-#include "intel_gpu/primitives/mutable_data.hpp"
-#include "intel_gpu/primitives/data.hpp"
-#include "intel_gpu/primitives/reorder.hpp"
-#include "intel_gpu/graph/topology.hpp"
-
-#include
-#include
-
-using TensorIterator = ov::op::v0::TensorIterator;
-
-namespace ov {
-namespace intel_gpu {
-
-template <typename DATA_TYPE>
-static DATA_TYPE CreateScalarData(ProgramBuilder &p, const cldnn::primitive_id& id, int64_t num) {
-    auto mem = p.get_engine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
-    cldnn::mem_lock<int64_t> ptr{mem, p.get_engine().get_service_stream()};
-    *ptr.begin() = num;
-    return {id, mem};
-}
-
-static cldnn::mutable_data CreateAdditionalOutputData(ProgramBuilder &p, const std::shared_ptr<ov::Node>& op,
-                                                      const cldnn::primitive_id& id, const cldnn::primitive_id& input,
-                                                      const int32_t output_idx) {
-    const auto precision = cldnn::element_type_to_data_type(op->get_output_element_type(output_idx));
-    const auto format = cldnn::format::get_default_format(op->get_output_shape(output_idx).size());
-    const auto tensor = tensor_from_dims(op->get_output_shape(output_idx));
-    cldnn::layout output_layout = cldnn::layout(precision, format, tensor);
-    auto mem = p.get_engine().allocate_memory(output_layout);
-    auto md = cldnn::mutable_data(id, {cldnn::input_info(input)}, std::move(mem)); // cldnn::data cannot set dependency
-    return md;
-}
-
-static void CreateTensorIteratorOp(ProgramBuilder &p, const std::shared_ptr<TensorIterator>
&op) { - auto inputs = p.GetInputInfo(op); - - ProgramBuilder body_program(op->get_body(), p.get_engine(), p.get_config(), true); - auto body_topology = *body_program.get_topology(); - - // setup input_primitive_maps/ output_primitive_maps and back_edges - const auto& loop_input_descs = op->get_input_descriptions(); - const auto& loop_output_descs = op->get_output_descriptions(); - const auto& body_inputs = op->get_body()->get_parameters(); - const auto& body_outputs = op->get_body()->get_results(); - - std::vector input_primitive_maps; - std::vector output_primitive_maps; - std::vector back_edges; - std::map reordered_output_ids; - - // set input mapping & back edges - for (const auto& loop_input_desc : loop_input_descs) { - const cldnn::primitive_id& external_id = inputs.at(loop_input_desc->m_input_index).pid; - auto& body_input = body_inputs.at(loop_input_desc->m_body_parameter_index); - cldnn::primitive_id internal_id = layer_type_name_ID(body_input); - - // set input mapping - if (const auto& sliceInfo = - std::dynamic_pointer_cast(loop_input_desc)) { - // sliced input - input_primitive_maps.emplace_back(external_id, internal_id, sliceInfo->m_axis, - sliceInfo->m_start, sliceInfo->m_end, sliceInfo->m_stride); - } else { - // input without slicing - input_primitive_maps.emplace_back(external_id, internal_id); - } - - // set back edges - if (const auto& mergedInput = - std::dynamic_pointer_cast(loop_input_desc)) { - // backedge - const auto& to = body_inputs.at(mergedInput->m_body_parameter_index); - const auto& from = body_outputs.at(mergedInput->m_body_value_index); - - cldnn::primitive_id to_id = layer_type_name_ID(to); - cldnn::primitive_id from_id = layer_type_name_ID(from); - - // reset output data type because the data types of the outputs of the - // body topology are always FP32 regardless of element type - { - const auto from_prim = body_topology.at(from_id); - const auto to_cldnn_type = cldnn::element_type_to_data_type(to->get_element_type()); - from_prim->output_data_types = {to_cldnn_type}; - } - back_edges.emplace_back(from_id, to_id); - } - } - - // set trip count, initial execution condition, num iteration primitives - // they should be mutable_data to prevent from being optimized out - std::string layerName = layer_type_name_ID(op); - const cldnn::primitive_id trip_count_id = layerName + "_tripCount"; - const int64_t num_iterations = op->get_num_iterations(); - if (num_iterations < 0) { - throw std::runtime_error("tensor iterator's num_iteration cannot be negative"); - } - { - cldnn::data trip_count = CreateScalarData(p, trip_count_id, num_iterations); - p.add_primitive(*op, trip_count); - } - const cldnn::primitive_id execution_condition_id = layerName + "_initialExecutionCondition"; - { - cldnn::mutable_data execution_condition = CreateScalarData(p, execution_condition_id, 1); - p.add_primitive(*op, std::move(execution_condition)); - } - const cldnn::primitive_id num_iteration_id = layerName + "_numIteration"; - { - cldnn::mutable_data num_iteration = CreateScalarData(p, num_iteration_id, 0); - p.add_primitive(*op, num_iteration); - } - - // set output mapping - for (const auto& loop_output_desc : loop_output_descs) { - const uint64_t output_idx = loop_output_desc->m_output_index; - - // Add additional mutable_data for multiple outputs - // primitive ID should be . 
if output_idx > 0 - // otherwise primitive ID should be equals to TI primitive ID - const std::string layerNameWithIndex = layerName + ".out" + std::to_string(output_idx); - std::string external_id; - if (output_idx > 0) { - cldnn::mutable_data output_data = CreateAdditionalOutputData(p, op, layerNameWithIndex, layerName, output_idx); - p.add_primitive(*op, std::move(output_data)); - external_id = layerNameWithIndex; - } else { - p.primitive_ids[layerNameWithIndex] = layerName; - p.primitive_ids[layerName] = layerName; - external_id = layerName; - } - const auto& body_output = body_outputs.at(loop_output_desc->m_body_value_index); - cldnn::primitive_id internal_id = layer_type_name_ID(body_output); - - // update primitive_map - if (const auto& concatOutput = - std::dynamic_pointer_cast(loop_output_desc)) { - // output which requires concatenation - output_primitive_maps.emplace_back(external_id, internal_id, concatOutput->m_axis, - concatOutput->m_start, concatOutput->m_end, concatOutput->m_stride); - } - if (std::dynamic_pointer_cast(loop_output_desc)) { - // output which requires no concatenation - output_primitive_maps.emplace_back(external_id, internal_id); - } - } - - const cldnn::loop loopPrimitive( - layerName, /* layer name of this primitive (output id) */ - inputs, /* inputs of this layer */ - body_topology, /* body network */ - trip_count_id, /* trip_count data in outer network, always same as num_iterations in TI */ - execution_condition_id, /* initial_execution_condition data in outer network, always true in TI */ - num_iteration_id, /* actual number of iteration data in body network */ - input_primitive_maps, /* input mappings connecting outer network and inner network */ - output_primitive_maps, /* output mappings connecting outer network and inner network */ - back_edges, /* back edge mapping */ - num_iterations, /* max iteration, i.e. 
length of iteration axis */ - "", - ""); - - p.add_primitive(*op, loopPrimitive); -} - -REGISTER_FACTORY_IMPL(v0, TensorIterator); - -} // namespace intel_gpu -} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 404818ce92ce8c..a97b7e87a9e4b3 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -158,7 +158,7 @@ std::shared_ptr ProgramBuilder::build(const std::vector func) { pass_config->disable(); pass_config->disable(); pass_config->disable(); + pass_config->disable(); pass_config->enable(); diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp new file mode 100644 index 00000000000000..72bb27eba05c54 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/group_normalization.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "single_layer_tests/group_normalization.hpp" + +using namespace ov::test::subgraph; + +namespace { + +const std::vector netPrecisions = { + ov::element::f16, + ov::element::f32, +}; + +const std::vector inputShapes = { + {3, 8, 32, 64}, + {3, 8, 28, 32, 12}, +}; + +const std::vector numGroups = { + 2, 4, +}; + +const std::vector epsilon = { + 0.0025 +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_GroupNormalization, + GroupNormalizationTest, + testing::Combine(testing::ValuesIn(netPrecisions), + ::testing::Values(ov::element::undefined), + ::testing::Values(ov::element::undefined), + testing::ValuesIn(ov::test::static_shapes_to_test_representation(inputShapes)), + testing::ValuesIn(numGroups), + testing::ValuesIn(epsilon), + testing::Values(ov::test::utils::DEVICE_GPU), + testing::Values(ov::AnyMap())), + GroupNormalizationTest::getTestCaseName); + +} // anonymous namespace diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/loop.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/loop.cpp new file mode 100644 index 00000000000000..8c7de510531348 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/loop.cpp @@ -0,0 +1,316 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include "ov_models/utils/ov_helpers.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "ov_models/builders.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "common_test_utils/test_constants.hpp" +#include "shared_test_classes/base/utils/ranges.hpp" +#include +#include "shared_test_classes/base/utils/compare_results.hpp" +#include "openvino/pass/constant_folding.hpp" +#include + +using namespace InferenceEngine; +using namespace ov::test; + +namespace GPULayerTestsDefinitions { + +using DynamicShapeLoopParams = typename std::tuple< + bool, + std::tuple< + bool, + int64_t, + int64_t, + int64_t + >, + int64_t, + InputShape, + InferenceEngine::Precision, + std::string, + ov::AnyMap + >; + +/** + * Test case with Dynamic SHAPE version of loop operation. + * Total iteration count is dynamic. 
+ */ +class DynamicShapeLoopTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + bool static_iter_num; + bool static_continue_cond; + int64_t max_iter_num; + int64_t dynamic_exit; + int64_t axis; + int64_t start_value; + InputShape data_shapes; + InferenceEngine::Precision data_prc; + std::string targetDevice; + auto args_pack = std::tie(static_iter_num, max_iter_num, dynamic_exit, axis); + ov::Any configuration; + std::tie( + static_continue_cond, + args_pack, + start_value, + data_shapes, + data_prc, + targetDevice, + configuration) = obj.param; + + std::ostringstream result; + result << "static_iter_num=" << std::to_string(static_iter_num) << "_"; + result << "static_continue_cond=" << std::to_string(static_continue_cond) << "_"; + result << "max_iter_num=" << std::to_string(max_iter_num) << "_"; + result << "dynamic_exit=" << std::to_string(dynamic_exit) << "_"; + result << "axis=" << std::to_string(axis) << "_"; + result << "start_value=" << std::to_string(start_value) << "_"; + result << "max_iter_num=" << std::to_string(max_iter_num) << "_"; + result << "IS=("; + result << ov::test::utils::partialShape2str({data_shapes.first}) << "_"; + for (size_t i = 0lu; i < data_shapes.second.size(); i++) { + result << "{"; + result << ov::test::utils::vec2str(data_shapes.second[i]) << "_"; + result << "}_"; + } + result << ")_"; + result << "netPRC=" << data_prc << "_"; + result << "targetDevice=" << targetDevice << "_"; + + auto res_str = result.str(); + std::replace(res_str.begin(), res_str.end(), '-', '_'); + return res_str; + } + +private: + bool static_iter_num; // trip count provided by constant node + bool static_continue_cond; // initial_cond provided by constant node + int64_t max_iter_num; // -1 means infinity loop (expected dynamic exit condition in body) + int64_t dynamic_exit; // -1 means always true + int64_t axis; // -1 means no auto concatenation + int64_t start_value; + InputShape data_shapes; + InferenceEngine::Precision data_prc; + +protected: + void SetUp() override { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + auto args_pack = std::tie(static_iter_num, max_iter_num, dynamic_exit, axis); + std::tie( + static_continue_cond, + args_pack, + start_value, + data_shapes, + data_prc, + targetDevice, + configuration) = GetParam(); + + const auto prc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(data_prc); + const auto inputShape = data_shapes.first; + const auto scalarShape = ngraph::Shape{}; + init_input_shapes({data_shapes}); + + ngraph::ParameterVector params{}; + auto cond_input_create = [¶ms] (ngraph::element::Type prc, const ov::PartialShape &shape, int value = 0, bool is_static = false) + -> std::shared_ptr { + if (is_static) + return std::make_shared(prc, shape.to_shape(), value); + + auto input = std::make_shared(prc, shape); + params.push_back(input); + return input; + }; + + auto start = cond_input_create(prc, inputShape); + start->set_friendly_name("start"); + auto count = cond_input_create(ngraph::element::i64, scalarShape, max_iter_num, static_iter_num); + count->set_friendly_name("count"); + auto skip = cond_input_create(ngraph::element::boolean, scalarShape, true, static_continue_cond); + skip->set_friendly_name("skip"); + + // + // count skip start count skip start + // / / + // ___*___*____ __________*___*____ | idx | data | out | + // | idx in | | ex_val idx in | | 0 | 7 | 7 | + // | | / | | | / | / | | 1 | 7 | 8 | + // | add | | less add | | 2 | 8 | 10 
| + // | | true | | | | | | 3 | 10 | 13 | + // | | | | | | | | ~~~~~ * * * ~~~~~ + // | out cnd | | cnd out | + // |___*____*___| |____*_____*________| + // Full loop Dynamic exit loop + // n_iter = count n_iter = ex_val + // + auto b_indx = std::make_shared(ngraph::element::i64, ngraph::Shape{}); + b_indx->set_friendly_name("body_index"); + auto b_data = std::make_shared(prc, inputShape); + b_data->set_friendly_name("body_data"); + auto b_indx_cast = std::make_shared(b_indx, prc); + b_indx_cast->set_friendly_name("body_index_cast"); + auto b_add = std::make_shared(b_data, b_indx_cast); + b_add->set_friendly_name("body_addition"); + + std::shared_ptr b_cond; + if (dynamic_exit == -1) { + b_cond = std::make_shared(ngraph::element::boolean, ngraph::Shape{}, true); + b_cond->set_friendly_name("body_condition"); + } else { + auto b_exit_value = std::make_shared(ngraph::element::i64, scalarShape, dynamic_exit); + b_exit_value->set_friendly_name("body_exit_value"); + b_cond = std::make_shared(b_indx, b_exit_value); + b_cond->set_friendly_name("body_condition_with_exit_value"); + } + + auto body = std::make_shared( + ngraph::OutputVector {b_cond, b_add}, // TODO: check with reverse + ngraph::ParameterVector {b_indx, b_data}); // TODO: check with reverse + body->set_friendly_name("body_network"); + + auto loop = std::make_shared(count, skip); + loop->set_friendly_name("loop"); + loop->set_function(body); + loop->set_special_body_ports({0, 0}); + loop->set_merged_input(b_data, start, b_add); + if (axis == -1) + loop->get_iter_value(b_add, -1); + else + loop->get_concatenated_slices(b_add, 0, 1, 1, -1, axis); + + function = std::make_shared( + ngraph::OutputVector {loop}, + params); + function->set_friendly_name("outer_body_network"); + } +}; + + +TEST_P(DynamicShapeLoopTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); +} + +std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32 +}; + +ov::AnyMap netConfigurations = { + {GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING, PluginConfigParams::NO} +}; + +static const std::vector> dynamic_loop_types_axis_0 { + // GCC4.8 limitation: have to specify type of each element in list + // static_trip_count | max | dynamic_exit | axis + std::tuple{ true , 10, -1, 0 }, // n_iter 10, no dynamic exit +}; + +std::vector inputs_0 = { + InputShape(ov::PartialShape({1, -1, 2}), {{1, 4, 2}, {1, 5, 2}, {1, 10, 2}}), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_DynamicShapeLoop_axis_0, DynamicShapeLoopTest, + testing::Combine( + /* static_continue_cond */ testing::Values(true), + /* args_pack */ testing::ValuesIn(dynamic_loop_types_axis_0), + /* start_value */ testing::Values(0), + /* data_shape */ testing::ValuesIn(inputs_0), + /* data_prc */ testing::ValuesIn(netPrecisions), + /* device */ testing::Values(ov::test::utils::DEVICE_GPU), + /* configuration */ testing::Values(netConfigurations)), + DynamicShapeLoopTest::getTestCaseName); + +static const std::vector> dynamic_loop_types_1 { + // GCC4.8 limitation: have to specify type of each element in list + // static_trip_count | max | dynamic_exit | axis + std::tuple{ true , 5, -1, 1 }, // n_iter 5, no dynamic exit +}; + +std::vector inputs_1 = { + InputShape(ov::PartialShape({-1, 1, 4, -1}), {{2, 1, 4, 10}, {3, 1, 4, 14}, {6, 1, 4, 16}}), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_DynamicShapeLoop_axis_1, DynamicShapeLoopTest, + testing::Combine( + /* static_continue_cond */ testing::Values(true), + /* args_pack */ testing::ValuesIn(dynamic_loop_types_1), + /* 
start_value */ testing::Values(0), + /* data_shape */ testing::ValuesIn(inputs_1), + /* data_prc */ testing::ValuesIn(netPrecisions), + /* device */ testing::Values(ov::test::utils::DEVICE_GPU), + /* configuration */ testing::Values(netConfigurations)), + DynamicShapeLoopTest::getTestCaseName); + +static const std::vector> dynamic_loop_types_2 { + // GCC4.8 limitation: have to specify type of each element in list + // static_trip_count | max | dynamic_exit | axis + std::tuple{ true , 10, -1, 2 }, // n_iter 10, no dynamic exit +}; + +std::vector inputs_2 = { + InputShape(ov::PartialShape({-1, -1, 1, 6}), {{2, 4, 1, 6}, {10, 40, 1, 6}, {12, 16, 1, 6}}), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_DynamicShapeLoop_axis_2, DynamicShapeLoopTest, + testing::Combine( + /* static_continue_cond */ testing::Values(true), + /* args_pack */ testing::ValuesIn(dynamic_loop_types_2), + /* start_value */ testing::Values(0), + /* data_shape */ testing::ValuesIn(inputs_2), + /* data_prc */ testing::ValuesIn(netPrecisions), + /* device */ testing::Values(ov::test::utils::DEVICE_GPU), + /* configuration */ testing::Values(netConfigurations)), + DynamicShapeLoopTest::getTestCaseName); + +static const std::vector> dynamic_loop_types_no_auto_concat { + // GCC4.8 limitation: have to specify type of each element in list + // static_trip_count | max | dynamic_exit | axis + std::tuple{ true , 10, -1, -1 }, // n_iter 5, no dynamic exit +}; + +std::vector inputs_no_auto_concat = { + InputShape(ov::PartialShape({-1, 1, 6}), {{2, 1, 6}, {10, 1, 6}, {12, 1, 6}}), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_DynamicShapeLoop_no_auto_concat, DynamicShapeLoopTest, + testing::Combine( + /* static_continue_cond */ testing::Values(true), + /* args_pack */ testing::ValuesIn(dynamic_loop_types_no_auto_concat), + /* start_value */ testing::Values(0), + /* data_shape */ testing::ValuesIn(inputs_no_auto_concat), + /* data_prc */ testing::ValuesIn(netPrecisions), + /* device */ testing::Values(ov::test::utils::DEVICE_GPU), + /* configuration */ testing::Values(netConfigurations)), + DynamicShapeLoopTest::getTestCaseName); + +static const std::vector> dynamic_loop_types_dynamic_exit { + // GCC4.8 limitation: have to specify type of each element in list + // static_trip_count | max | dynamic_exit | axis + std::tuple{ true , 5, 3, -1 }, // n_iter 3, dynamic exit on 3 + std::tuple{ true , 5, 7, 1 }, // n_iter 5, dynamic exit not reached + std::tuple{ true , -1, 5, -1 }, // n_iter 5, inf loop with dynamic exit on 5 +}; + +std::vector inputs_dynamic_exit = { + InputShape(ov::PartialShape({-1, 1, 2}), {{4, 1, 2}, {10, 1, 2}, {12, 1, 2}}), +}; + +INSTANTIATE_TEST_SUITE_P(smoke_DynamicShapeLoop_dynamic_exit, DynamicShapeLoopTest, + testing::Combine( + /* static_continue_cond */ testing::Values(true), + /* args_pack */ testing::ValuesIn(dynamic_loop_types_dynamic_exit), + /* start_value */ testing::Values(0), + /* data_shape */ testing::ValuesIn(inputs_dynamic_exit), + /* data_prc */ testing::ValuesIn(netPrecisions), + /* device */ testing::Values(ov::test::utils::DEVICE_GPU), + /* configuration */ testing::Values(netConfigurations)), + DynamicShapeLoopTest::getTestCaseName); + +} // namespace GPULayerTestsDefinitions \ No newline at end of file diff --git a/src/plugins/intel_gpu/tests/unit/fusions/loop_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/loop_fusion_test.cpp index 6e980ac7d25d8b..a3635bb320f47f 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/loop_fusion_test.cpp +++ 
b/src/plugins/intel_gpu/tests/unit/fusions/loop_fusion_test.cpp @@ -31,6 +31,35 @@ struct loop_params { size_t expected_not_fused_primitives; }; + +program::ptr build_program(engine& engine, + topology& body_topology, + primitive_id initial_condition_id, + std::vector output_primitive_maps, + std::vector back_edges) { + std::vector output_names_vec; + for (auto out_map : output_primitive_maps) { + output_names_vec.push_back(out_map.internal_id.pid); + } + + // setup outputs for backedges + for (auto& back_edge : back_edges) { + output_names_vec.push_back(back_edge.from); + } + + // if execution_condition_id is specified, we need to add the id in build_option::outputs + if (!initial_condition_id.empty()) { + output_names_vec.push_back(initial_condition_id); + } + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::custom_outputs(output_names_vec)); + config.set_property(ov::intel_gpu::max_dynamic_batch(1)); + + return program::build_program(engine, body_topology, config, false, false, true); +} + class LoopFusingTest : public ::BaseFusingTest { public: @@ -71,6 +100,8 @@ TEST_P(permute_eltwise_loop, basic) { std::vector output_primitive_maps {loop::io_primitive_map("loop", "body_eltwise", 2)}; std::vector back_edges {loop::backedge_mapping("body_eltwise", "body_eltwise_operand")}; + auto body_program = build_program(engine, body, "", output_primitive_maps, back_edges); + create_topologies( input_layout("input", get_input_layout(p)), data("eltwise_data", get_mem(layout{p.data_type, p.default_format, p.loop_input_shape})), @@ -80,7 +111,7 @@ TEST_P(permute_eltwise_loop, basic) { data("trip_count", trip_count_mem), data("initial_condition", initial_condition_mem), mutable_data("num_iteration", num_iteration_mem), - loop("loop", { input_info("eltwise"), input_info("loop_eltwise_init_values") }, body, + loop("loop", { input_info("num_iteration"), input_info("eltwise"), input_info("loop_eltwise_init_values") }, body_program, "trip_count", "initial_condition", "num_iteration", input_primitive_maps, output_primitive_maps, back_edges, p.loop_trip_count), reorder("output", input_info("loop"), format::bfyx, p.default_type) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp index b42241b23f1e99..fda7c1c41e5c12 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp @@ -28,6 +28,7 @@ bool is_output_equal(const cldnn::memory::ptr mem, const std::vector& ref) return true; } + topology generate_simple_branch (bool branch_true_false, const primitive_id& id, const primitive_id& input_id, const data_types dt = data_types::f32) { topology branch; @@ -86,14 +87,14 @@ class condition_gpu_basic_test : public ::testing::Test { condition::branch branch_true; { cldnn::topology branch_true_topology = generate_simple_branch(true, cond_id, branch_input_id, dat_dt); - branch_true.inner_program = program::build_program(engine, branch_true_topology, config, true); + branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({input_id, branch_input_id}); branch_true.output_map.insert({0, "condi_when_true"}); } condition::branch branch_false; { cldnn::topology branch_false_topology = generate_simple_branch(false, cond_id, branch_input_id, dat_dt); - 
branch_false.inner_program = program::build_program(engine, branch_false_topology, config, true); + branch_false.inner_program = program::build_program(engine, branch_false_topology, config, false, false, true); branch_false.input_map.insert({input_id, branch_input_id}); branch_false.output_map.insert({0, "condi_when_false"}); } @@ -178,14 +179,14 @@ TEST(condition_gpu, basic_range_equal_comp) { condition::branch branch_true; { cldnn::topology branch_true_topology = generate_simple_branch(true, condi_id, branch_input_id); - branch_true.inner_program = program::build_program(engine, branch_true_topology, config, true); + branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({concat_id, branch_input_id}); branch_true.output_map.insert({0, "condi_when_true"}); } condition::branch branch_false; { cldnn::topology branch_false_topology = generate_simple_branch(false, condi_id, branch_input_id); - branch_false.inner_program = program::build_program(engine, branch_false_topology, config, true); + branch_false.inner_program = program::build_program(engine, branch_false_topology, config, false, false, true); branch_false.input_map.insert({concat_id, branch_input_id}); branch_false.output_map.insert({0, "condi_when_false"}); } @@ -258,8 +259,8 @@ TEST(condition_gpu, basic_stacked_ifs) { auto predicate2 = engine.allocate_memory({ data_types::f32, format::bfyx,{ 1, 1, 1, 1 } }); primitive_id input_id = "input"; - primitive_id pred_id = "predicate"; - primitive_id predicate2_id = "predicate2"; + primitive_id pred_id = "predicate"; + primitive_id predicate2_id = "predicate2"; primitive_id branch_input_id = "branch_input"; primitive_id cond_id = "condi"; primitive_id cond2_id = "condi2"; @@ -280,22 +281,22 @@ TEST(condition_gpu, basic_stacked_ifs) { ); condition::branch branch_condi_1_true; - branch_condi_1_true.inner_program = program::build_program(engine, condi_1_true, config, true); + branch_condi_1_true.inner_program = program::build_program(engine, condi_1_true, config, false, false, true); branch_condi_1_true.input_map.insert({input_id, branch_input_id}); branch_condi_1_true.output_map.insert({0, "condi_when_true"}); condition::branch branch_condi_1_false; - branch_condi_1_false.inner_program = program::build_program(engine, condi_1_false, config, true); + branch_condi_1_false.inner_program = program::build_program(engine, condi_1_false, config, false, false, true); branch_condi_1_false.input_map.insert({input_id, branch_input_id}); branch_condi_1_false.output_map.insert({0, "condi_when_false"}); condition::branch branch_condi_2_true; - branch_condi_2_true.inner_program = program::build_program(engine, condi_2_true, config, true); + branch_condi_2_true.inner_program = program::build_program(engine, condi_2_true, config, false, false, true); branch_condi_2_true.input_map.insert({cond_id, branch_input_id}); branch_condi_2_true.output_map.insert({0, "activ_when_true"}); condition::branch branch_condi_2_false; - branch_condi_2_false.inner_program = program::build_program(engine, condi_2_false, config, true); + branch_condi_2_false.inner_program = program::build_program(engine, condi_2_false, config, false, false, true); branch_condi_2_false.input_map.insert({cond_id, branch_input_id}); branch_condi_2_false.output_map.insert({0, "activ_when_false"}); @@ -373,7 +374,7 @@ TEST(condition_gpu, basic_nested_ifs) { data("scale_5_data", scale_5_mem), eltwise("scale_5", { input_info("branch_input1"), input_info("scale_5_data") 
}, eltwise_mode::prod) ); - nested_true.inner_program = program::build_program(engine, nested_true_topology, config, true); + nested_true.inner_program = program::build_program(engine, nested_true_topology, config, false, false, true); nested_true.input_map.insert({"pooling_when_true", "branch_input1"}); nested_true.output_map.insert({0, "scale_5"}); } @@ -385,7 +386,7 @@ TEST(condition_gpu, basic_nested_ifs) { data("scale_10_data", scale_10_mem), eltwise("scale_10", { input_info("branch_input2"), input_info("scale_10_data") }, eltwise_mode::prod) ); - nested_false.inner_program = program::build_program(engine, nested_false_topology, config, true); + nested_false.inner_program = program::build_program(engine, nested_false_topology, config, false, false, true); nested_false.input_map.insert({"pooling_when_true", "branch_input2"}); nested_false.output_map.insert({0, "scale_10"}); } @@ -399,7 +400,7 @@ TEST(condition_gpu, basic_nested_ifs) { input_layout("predicate2", predicate2->get_layout()), condition( "condi_nested", {input_info("predicate2"), input_info("pooling_when_true")}, nested_true, nested_false) ); - branch_true.inner_program = program::build_program(engine, branch_true_topology, config, true); + branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({"input", "branch_input3"}); branch_true.output_map.insert({0, "condi_nested"}); } @@ -411,7 +412,7 @@ TEST(condition_gpu, basic_nested_ifs) { input_layout("branch_input4", { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }), pooling("pooling_when_false", input_info("branch_input4"), cldnn::pooling_mode::average, { 1, 2 }, { 1, 2 }) ); - branch_false.inner_program = program::build_program(engine, branch_false_topology, config, true); + branch_false.inner_program = program::build_program(engine, branch_false_topology, config, false, false, true); branch_false.input_map.insert({"input", "branch_input4"}); branch_false.output_map.insert({0, "pooling_when_false"}); } @@ -460,21 +461,21 @@ TEST(condition_gpu, negative_predicate_wrong_layout) { auto predicate = engine.allocate_memory({ data_types::f32, format::bfyx,{ 1, 1, 5, 1 } }); primitive_id input_id = "input"; - primitive_id pred_id = "predicate"; + primitive_id pred_id = "predicate"; primitive_id branch_input_id = "branch_input"; primitive_id cond_id = "condi"; condition::branch branch_true; { cldnn::topology branch_true_topology = generate_simple_branch(true, cond_id, branch_input_id, data_types::f32); - branch_true.inner_program = program::build_program(engine, branch_true_topology, config, true); + branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({input_id, branch_input_id}); branch_true.output_map.insert({0, "condi_when_true"}); } condition::branch branch_false; { cldnn::topology branch_false_topology = generate_simple_branch(false, cond_id, branch_input_id, data_types::f32); - branch_false.inner_program = program::build_program(engine, branch_false_topology, config, true); + branch_false.inner_program = program::build_program(engine, branch_false_topology, config, false, false, true); branch_false.input_map.insert({input_id, branch_input_id}); branch_false.output_map.insert({0, "condi_when_false"}); } @@ -501,7 +502,7 @@ TEST(condition_gpu, negative_not_same_layouts) { auto predicate = engine.allocate_memory({ data_types::u8, format::bfyx,{ 1, 1, 1, 1 } }); primitive_id input_id = "input"; - primitive_id 
pred_id = "predicate"; + primitive_id pred_id = "predicate"; primitive_id branch_input_id = "branch_input"; primitive_id cond_id = "condi"; @@ -513,7 +514,7 @@ TEST(condition_gpu, negative_not_same_layouts) { input_layout(branch_input_id, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }), pooling(pool_id, input_info(branch_input_id), cldnn::pooling_mode::max, { 1, 2 }, { 1, 2 }) ); - branch_true.inner_program = program::build_program(engine, branch_true_topology, config, true); + branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({input_id, branch_input_id}); branch_true.output_map.insert({0, pool_id}); } @@ -526,7 +527,7 @@ TEST(condition_gpu, negative_not_same_layouts) { input_layout(branch_input_id, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }), pooling(pool_id, input_info(branch_input_id), cldnn::pooling_mode::max, { 1, 4 }, { 1, 4 }) ); - branch_false.inner_program = program::build_program(engine, branch_false_topology, config, true); + branch_false.inner_program = program::build_program(engine, branch_false_topology, config, false, false, true); branch_false.input_map.insert({input_id, branch_input_id}); branch_false.output_map.insert({0, pool_id}); } @@ -566,7 +567,7 @@ TEST(condition_gpu, negative_same_names_within_different_networks) { input_layout(branch_input_id, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }), pooling(duplicated_id, input_info(branch_input_id), cldnn::pooling_mode::max, { 2, 1 }, { 2, 1 }) ); - branch_true.inner_program = program::build_program(engine, branch_true_topology, config, true); + branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({input_id, branch_input_id}); branch_true.output_map.insert({0, duplicated_id}); } @@ -578,7 +579,7 @@ TEST(condition_gpu, negative_same_names_within_different_networks) { input_layout(branch_input_id, { data_types::f32, format::bfyx,{ 1, 1, 4, 1 } }), pooling("pooling_when_false", input_info(branch_input_id), cldnn::pooling_mode::max, { 2, 1 }, { 2, 1 }) ); - branch_false.inner_program = program::build_program(engine, branch_false_topology, config, true); + branch_false.inner_program = program::build_program(engine, branch_false_topology, config, false, false, true); branch_false.input_map.insert({input_id, branch_input_id}); branch_false.output_map.insert({0, "pooling_when_false"}); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp new file mode 100644 index 00000000000000..a13c1d1550882f --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp @@ -0,0 +1,146 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "random_generator.hpp" +#include +#include +#include "openvino/reference/group_normalization.hpp" +#include "compilation_context.hpp" + + +using namespace cldnn; +using namespace ::tests; + +namespace { + +typedef std::tuple< +std::vector, // Input shape +std::size_t, // Number of groups +double, // Epsilon +format // First input layout +> +GroupNormalizationParams; + +class GroupNormalizationGPUTest : public ::testing::TestWithParam { +public: + GroupNormalizationGPUTest() = default; + + void SetUp() override { + std::vector input_shape; + const auto& params = GetParam(); + std::tie(input_shape, num_groups_, 
epsilon_, format_) = params; + std::copy(std::begin(input_shape), std::end(input_shape), std::back_inserter(data_shape_)); + tests::random_generator rg{"GroupNormalizationGPUTest"}; + data_ = rg.generate_random_1d(ov::shape_size(input_shape), -1, 1); + scale_ = rg.generate_random_1d(input_shape[1], -1, 1); + bias_ = rg.generate_random_1d(input_shape[1], -1, 1); + const auto planar_format = format::dimension(format_) == 4 ? format::bfyx : format::bfzyx; + + topology tp; + auto &engine = get_test_engine(); + data_layout_ = layout{data_types::f32, planar_format, tensor{input_shape}}; + scale_bias_layout_ = layout{data_types::f32, planar_format, tensor{1, + static_cast(scale_.size()), 1, 1}}; + + primitive_id reordered_data_primitive = data_primitive_ + "_reordered"; + tp.add(input_layout{data_primitive_, data_layout_}); + tp.add(input_layout{scale_primitive_, scale_bias_layout_}); + tp.add(input_layout{bias_primitive_, scale_bias_layout_}); + tp.add(reorder{reordered_data_primitive, data_primitive_, format_, data_types::f32}); + + auto g = group_normalization{ + "group_normalization_output", + input_info{reordered_data_primitive}, + input_info{scale_primitive_}, + input_info{bias_primitive_}, + static_cast(num_groups_), + epsilon_ + }; + tp.add(g); + tp.add(reorder{"output", input_info("group_normalization_output"), planar_format, data_types::f32}); + + network_ = std::make_shared(engine, tp, get_test_default_config(engine)); + } + + void Test() { + auto &engine = get_test_engine(); + auto data_gpu_mem = engine.allocate_memory(data_layout_); + auto scale_gpu_mem = engine.allocate_memory(scale_bias_layout_); + auto bias_gpu_mem = engine.allocate_memory(scale_bias_layout_); + set_values(data_gpu_mem, data_); + set_values(scale_gpu_mem, scale_); + set_values(bias_gpu_mem, bias_); + network_->set_input_data(data_primitive_, data_gpu_mem); + network_->set_input_data(scale_primitive_, scale_gpu_mem); + network_->set_input_data(bias_primitive_, bias_gpu_mem); + auto outputs = network_->execute(); + auto output = outputs.at("output").get_memory(); + cldnn::mem_lock output_gpu_mem(output, get_test_stream()); + + std::vector reference_output(data_.size()); + ov::reference::group_normalization(data_.data(), scale_.data(), bias_.data(), reference_output.data(), + ov::Shape{data_shape_}, num_groups_, epsilon_); + + ASSERT_EQ(output_gpu_mem.size(), reference_output.size()); + for (std::size_t i = 0; i < reference_output.size(); i++) { + ASSERT_NEAR(output_gpu_mem[i], reference_output[i], 0.0001); + } + } + +private: + std::vector data_{}; + std::vector scale_{}; + std::vector bias_{}; + std::size_t num_groups_{}; + double epsilon_{}; + format format_{format::any}; + network::ptr network_{}; + layout data_layout_{}; + layout scale_bias_layout_{}; + std::vector data_shape_; + static const primitive_id data_primitive_; + static const primitive_id scale_primitive_; + static const primitive_id bias_primitive_; +}; + +const primitive_id GroupNormalizationGPUTest::data_primitive_{"data"}; +const primitive_id GroupNormalizationGPUTest::scale_primitive_{"scale"}; +const primitive_id GroupNormalizationGPUTest::bias_primitive_{"bias"}; + +TEST_P(GroupNormalizationGPUTest, blocked_layouts_support) { + Test(); +} + +const std::vector f_blocked_4d_formats { + format::b_fs_yx_fsv2, + format::b_fs_yx_fsv4, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32, +}; + +const std::vector f_blocked_5d_formats { + format::b_fs_zyx_fsv2, + format::b_fs_zyx_fsv4, + format::b_fs_zyx_fsv16, + format::b_fs_zyx_fsv32, +}; + 
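For intuition, GroupNormalization splits the C channels of each sample into num_groups groups, normalizes each group by its own mean and variance, then applies per-channel scale and bias. A naive scalar sketch of that reference math (illustrative only; the test itself compares against ov::reference::group_normalization):

```cpp
#include <cmath>
#include <cstddef>

// Naive GroupNormalization reference: x and y are NC(HW)-contiguous buffers,
// scale/bias have C elements, HW is the product of the spatial dimensions.
void group_norm_ref(const float* x, const float* scale, const float* bias, float* y,
                    std::size_t N, std::size_t C, std::size_t HW,
                    std::size_t num_groups, double eps) {
    const std::size_t group_c = C / num_groups;  // channels per group
    for (std::size_t n = 0; n < N; ++n) {
        for (std::size_t g = 0; g < num_groups; ++g) {
            const std::size_t base = (n * C + g * group_c) * HW;  // group is contiguous
            const std::size_t cnt = group_c * HW;
            double mean = 0.0, var = 0.0;
            for (std::size_t i = 0; i < cnt; ++i) mean += x[base + i];
            mean /= cnt;
            for (std::size_t i = 0; i < cnt; ++i) {
                const double d = x[base + i] - mean;
                var += d * d;
            }
            var /= cnt;
            const double inv_std = 1.0 / std::sqrt(var + eps);
            for (std::size_t c = 0; c < group_c; ++c) {
                const std::size_t ch = g * group_c + c;
                for (std::size_t i = 0; i < HW; ++i) {
                    const std::size_t idx = (n * C + ch) * HW + i;
                    y[idx] = static_cast<float>((x[idx] - mean) * inv_std * scale[ch] + bias[ch]);
                }
            }
        }
    }
}
```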
+INSTANTIATE_TEST_SUITE_P( + GroupNormalizationGPUTest_blocked_layouts_support_4d, GroupNormalizationGPUTest, + ::testing::Combine( + ::testing::Values(std::vector{3, 64, 32, 64}), + ::testing::Values(4), + ::testing::Values(0.0025), + ::testing::ValuesIn(f_blocked_4d_formats))); + +INSTANTIATE_TEST_SUITE_P( + GroupNormalizationGPUTest_blocked_layouts_support_5d, GroupNormalizationGPUTest, + ::testing::Combine( + ::testing::Values(std::vector{3, 64, 28, 32, 12}), + ::testing::Values(4), + ::testing::Values(0.0025), + ::testing::ValuesIn(f_blocked_5d_formats))); + +} // anonymous namespace diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp index 94fb17104275e0..240f0df9980e97 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,34 @@ using namespace cldnn; using namespace tests; using namespace testing; +static program::ptr build_program(engine& engine, + topology& body_topology, + primitive_id execution_condition_id, + std::vector output_primitive_maps, + std::vector back_edges) { + std::vector output_names_vec; + for (auto out_map : output_primitive_maps) { + output_names_vec.push_back(out_map.internal_id.pid); + } + + // setup outputs for backedges + for (auto& back_edge : back_edges) { + output_names_vec.push_back(back_edge.from); + } + + // if execution_condition_id is specified, we need to add the id in build_option::outputs + if (!execution_condition_id.empty()) { + output_names_vec.push_back(execution_condition_id); + } + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::custom_outputs(output_names_vec)); + config.set_property(ov::intel_gpu::max_dynamic_batch(1)); + + return program::build_program(engine, body_topology, config, false, false, true); +} + template void test_loop_gpu_basic_no_concat(bool is_caching_test) { @@ -52,23 +81,23 @@ void test_loop_gpu_basic_no_concat(bool is_caching_test) set_values(initial_condition_mem, {initial_condition}); topology body( + input_layout("input", input_mem->get_layout()), data("eltwise_operand", operand_mem), eltwise("eltwise", input_info("input"), input_info("eltwise_operand"), eltwise_mode::sum) ); std::vector input_primitive_maps { loop::io_primitive_map("input", "input") }; std::vector output_primitive_maps { loop::io_primitive_map("loop", "eltwise") }; + std::vector back_edges { loop::backedge_mapping("eltwise", "input") }; - std::vector back_edges { - loop::backedge_mapping("eltwise", "input") - }; + auto body_program = build_program(engine, body, "", output_primitive_maps, back_edges); topology topology( input_layout("input", input_mem->get_layout()), input_layout("trip_count", trip_count_mem->get_layout()), input_layout("initial_condition", initial_condition_mem->get_layout()), mutable_data("num_iteration", num_iteration_mem), - loop("loop", { input_info("input") }, body, + loop("loop", { input_info("num_iteration"), input_info("input") }, body_program, "trip_count", "initial_condition", "num_iteration", input_primitive_maps, output_primitive_maps, back_edges, 8) ); @@ -161,15 +190,16 @@ void test_loop_gpu_basic_concat(bool is_caching_test) std::vector input_primitive_maps { loop::io_primitive_map("input", "input", 2) }; std::vector output_primitive_maps { 
loop::io_primitive_map("loop", "eltwise", 2) }; - std::vector back_edges {}; + auto body_program = build_program(engine, body, "", output_primitive_maps, back_edges); + topology topology( input_layout("input", input_mem->get_layout()), input_layout("trip_count", trip_count_mem->get_layout()), input_layout("initial_condition", initial_condition_mem->get_layout()), mutable_data("num_iteration", num_iteration_mem), - loop("loop", { input_info("input") }, body, + loop("loop", { input_info("num_iteration"), input_info("input") }, body_program, "trip_count", "initial_condition", "num_iteration", input_primitive_maps, output_primitive_maps, back_edges, trip_count) ); @@ -274,6 +304,8 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test) std::vector inner_output_primitive_maps { loop::io_primitive_map("inner_loop", "inner_eltwise", 2) }; std::vector inner_back_edges {}; + auto inner_body_program = build_program(engine, inner_loop_body, "", inner_output_primitive_maps, inner_back_edges); + ///////////////////////////////// // set outer loop body ///////////////////////////////// @@ -282,8 +314,8 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test) input_layout("trip_count", inner_trip_count_mem->get_layout()), input_layout("initial_condition", inner_initial_condition_mem->get_layout()), mutable_data("inner_num_iteration", inner_num_iteration_mem), - loop("inner_loop", { input_info("inner_input"), input_info("trip_count"), input_info("initial_condition") }, - inner_loop_body, "trip_count", "initial_condition", "inner_num_iteration", + loop("inner_loop", { input_info("inner_num_iteration"), input_info("inner_input"), input_info("trip_count"), input_info("initial_condition") }, + inner_body_program, "trip_count", "initial_condition", "inner_num_iteration", inner_input_primitive_maps, inner_output_primitive_maps, inner_back_edges, inner_trip_count) ); std::vector outer_input_primitive_maps { @@ -296,6 +328,8 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test) }; std::vector outer_back_edges { {"inner_loop", "inner_input"} }; + auto outer_body_program = build_program(engine, outer_loop_body, "", outer_output_primitive_maps, outer_back_edges); + ///////////////////////////////// // set main topology ///////////////////////////////// @@ -306,8 +340,8 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test) mutable_data("num_iteration", num_iteration_mem), input_layout("inner_trip_count", inner_trip_count_mem->get_layout()), input_layout("inner_initial_condition", inner_initial_condition_mem->get_layout()), - loop("loop", { input_info("input"), input_info("inner_trip_count"), input_info("inner_initial_condition") }, - outer_loop_body, "trip_count", "initial_condition", "num_iteration", + loop("loop", { input_info("num_iteration"), input_info("input"), input_info("inner_trip_count"), input_info("inner_initial_condition") }, + outer_body_program, "trip_count", "initial_condition", "num_iteration", outer_input_primitive_maps, outer_output_primitive_maps, outer_back_edges, outer_trip_count) ); diff --git a/src/tests/functional/plugin/shared/include/single_layer_tests/group_normalization.hpp b/src/tests/functional/plugin/shared/include/single_layer_tests/group_normalization.hpp new file mode 100644 index 00000000000000..8f080764adbced --- /dev/null +++ b/src/tests/functional/plugin/shared/include/single_layer_tests/group_normalization.hpp @@ -0,0 +1,21 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include 
"shared_test_classes/single_layer/group_normalization.hpp" + +namespace ov { +namespace test { +namespace subgraph { + +TEST_P(GroupNormalizationTest, CompareWithRefs) { + run(); +} + +TEST_P(GroupNormalizationTest, CompareQueryModel) { + query_model(); +} + +} // namespace subgraph +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/include/single_op_tests/pooling.hpp b/src/tests/functional/plugin/shared/include/single_op_tests/pooling.hpp new file mode 100644 index 00000000000000..495c551e9764eb --- /dev/null +++ b/src/tests/functional/plugin/shared/include/single_op_tests/pooling.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/single_op/pooling.hpp" + +namespace ov { +namespace test { +TEST_P(PoolingLayerTest, Inference) { + run(); +} + +TEST_P(MaxPoolingV8LayerTest, Inference) { + run(); +} +} // namespace test +} // namespace ov diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/group_normalization.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/group_normalization.hpp new file mode 100644 index 00000000000000..759f47786d98be --- /dev/null +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/group_normalization.hpp @@ -0,0 +1,98 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "ov_models/builders.hpp" +#include "common_test_utils/common_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace ov { +namespace test { +namespace subgraph { + +using GroupNormalizationTestParams = std::tuple; + +class GroupNormalizationTest : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + ElementType netType, inType, outType; + InputShape shapes; + std::int64_t num_groups; + double epsilon; + TargetDevice targetDevice; + Config config; + std::tie(netType, inType, outType, shapes, num_groups, epsilon, targetDevice, config) = obj.param; + + std::ostringstream result; + result << "NetType=" << netType << "_"; + result << "InType=" << inType << "_"; + result << "OutType=" << outType << "_"; + result << "IS=" << ov::test::utils::partialShape2str({shapes.first}) << "_"; + result << "TS="; + for (const auto& item : shapes.second) { + result << ov::test::utils::vec2str(item) << "_"; + } + result << "NumGroups=" << num_groups << "_"; + result << "Epsilon=" << epsilon << "_"; + result << "Device=" << targetDevice; + + return result.str(); + } + +protected: + void SetUp() override { + InputShape shapes; + ElementType ngPrc; + std::int64_t num_groups; + double epsilon; + + std::tie(ngPrc, inType, outType, shapes, num_groups, epsilon, targetDevice, configuration) = this->GetParam(); + InputShape biasInputShape = ExtractBiasShape(shapes); + init_input_shapes({shapes, biasInputShape, biasInputShape}); + ov::ParameterVector params; + for (auto&& shape : inputDynamicShapes) { + params.push_back(std::make_shared(ngPrc, shape)); + } + const auto paramOuts = + ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); + + const auto groupNormalization = std::make_shared( + paramOuts.at(0), + paramOuts.at(1), + paramOuts.at(2), + num_groups, + epsilon); + const ngraph::ResultVector results{std::make_shared(groupNormalization)}; + + // TODO: This 
+        if (ngPrc == element::Type_t::f16) {
+            abs_threshold = 0.007;
+        }
+
+        function = std::make_shared<ov::Model>(results, params, "GroupNormalization");
+    }
+
+    InputShape ExtractBiasShape(const InputShape& shape) {
+        std::vector<ov::Shape> biasShape;
+        std::transform(shape.second.cbegin(), shape.second.cend(), std::back_inserter(biasShape),
+                       [](const ov::Shape& s) -> ov::Shape { return {s[1]}; });
+        InputShape biasInputShape {
+            shape.first.is_dynamic() ? ov::PartialShape{shape.first[1]} : shape.first,
+            std::move(biasShape)
+        };
+        return biasInputShape;
+    }
+};
+
+} // namespace subgraph
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/pooling.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/pooling.hpp
new file mode 100644
index 00000000000000..cfb57994b35b33
--- /dev/null
+++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/pooling.hpp
@@ -0,0 +1,71 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <tuple>
+#include <string>
+#include <vector>
+
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "common_test_utils/test_enums.hpp"
+
+namespace ov {
+namespace test {
+typedef std::tuple<
+        ov::test::utils::PoolingTypes,  // Pooling type, max or avg
+        std::vector<size_t>,            // Kernel size
+        std::vector<size_t>,            // Stride
+        std::vector<size_t>,            // Pad begin
+        std::vector<size_t>,            // Pad end
+        ov::op::RoundingType,           // Rounding type
+        ov::op::PadType,                // Pad type
+        bool                            // Exclude pad
+> poolSpecificParams;
+
+typedef std::tuple<
+        poolSpecificParams,
+        ov::element::Type,              // Model type
+        std::vector<InputShape>,        // Input shape
+        std::string                     // Device name
+> poolLayerTestParamsSet;
+
+typedef std::tuple<
+        std::vector<size_t>,            // Kernel size
+        std::vector<size_t>,            // Stride
+        std::vector<size_t>,            // Dilation
+        std::vector<size_t>,            // Pad begin
+        std::vector<size_t>,            // Pad end
+        ov::element::Type,              // Index element type
+        int64_t,                        // Axis
+        ov::op::RoundingType,           // Rounding type
+        ov::op::PadType                 // Pad type
> maxPoolV8SpecificParams;
+
+typedef std::tuple<
+        maxPoolV8SpecificParams,
+        ov::element::Type,              // Model type
+        std::vector<InputShape>,        // Input shape
+        std::string                     // Device name
> maxPoolV8LayerTestParamsSet;
+
+class PoolingLayerTest : public testing::WithParamInterface<poolLayerTestParamsSet>,
+                         virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<poolLayerTestParamsSet>& obj);
+
+protected:
+    void SetUp() override;
+};
+
+class MaxPoolingV8LayerTest : public testing::WithParamInterface<maxPoolV8LayerTestParamsSet>,
+                              virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<maxPoolV8LayerTestParamsSet>& obj);
+
+protected:
+    void SetUp() override;
+};
+} // namespace test
+} // namespace ov
diff --git a/src/tests/functional/shared_test_classes/src/single_op/pooling.cpp b/src/tests/functional/shared_test_classes/src/single_op/pooling.cpp
new file mode 100644
index 00000000000000..c86f8a2217fee5
--- /dev/null
+++ b/src/tests/functional/shared_test_classes/src/single_op/pooling.cpp
@@ -0,0 +1,170 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/single_op/pooling.hpp"
+
+#include "ov_models/builders.hpp"
+#include "openvino/op/parameter.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/result.hpp"
+#include "openvino/op/avg_pool.hpp"
+#include "openvino/op/max_pool.hpp"
+
+namespace ov {
+namespace test {
+using ov::test::utils::PoolingTypes;
+
+std::string PoolingLayerTest::getTestCaseName(const testing::TestParamInfo<poolLayerTestParamsSet>& obj) {
+    poolSpecificParams pool_params;
+    ov::element::Type model_type;
+    std::vector<InputShape> shapes;
+    std::string targetDevice;
+    std::tie(pool_params, model_type, shapes, targetDevice) = obj.param;
+    PoolingTypes pool_type;
+    std::vector<size_t> kernel, stride;
+    std::vector<size_t> pad_begin, pad_end;
+    ov::op::PadType pad_type;
+    ov::op::RoundingType rounding_type;
+    bool excludePad;
+    std::tie(pool_type, kernel, stride, pad_begin, pad_end, rounding_type, pad_type, excludePad) = pool_params;
+
+    std::ostringstream result;
+    result << "IS=(";
+    for (size_t i = 0lu; i < shapes.size(); i++) {
+        result << ov::test::utils::partialShape2str({shapes[i].first}) << (i < shapes.size() - 1lu ? "_" : "");
+    }
+    result << ")_TS=";
+    for (size_t i = 0lu; i < shapes.front().second.size(); i++) {
+        result << "{";
+        for (size_t j = 0lu; j < shapes.size(); j++) {
+            result << ov::test::utils::vec2str(shapes[j].second[i]) << (j < shapes.size() - 1lu ? "_" : "");
+        }
+        result << "}_";
+    }
+    switch (pool_type) {
+        case PoolingTypes::MAX:
+            result << "MaxPool_";
+            break;
+        case PoolingTypes::AVG:
+            result << "AvgPool_";
+            result << "ExcludePad=" << excludePad << "_";
+            break;
+    }
+    result << "K" << ov::test::utils::vec2str(kernel) << "_";
+    result << "S" << ov::test::utils::vec2str(stride) << "_";
+    result << "PB" << ov::test::utils::vec2str(pad_begin) << "_";
+    result << "PE" << ov::test::utils::vec2str(pad_end) << "_";
+    result << "Rounding=" << rounding_type << "_";
+    result << "AutoPad=" << pad_type << "_";
+    result << "modelType=" << model_type.get_type_name() << "_";
+    result << "trgDev=" << targetDevice;
+    return result.str();
+}
+
+void PoolingLayerTest::SetUp() {
+    poolSpecificParams pool_params;
+    std::vector<InputShape> shapes;
+    ov::element::Type model_type;
+    std::tie(pool_params, model_type, shapes, targetDevice) = this->GetParam();
+    PoolingTypes pool_type;
+    std::vector<size_t> kernel, stride;
+    std::vector<size_t> pad_begin, pad_end;
+    ov::op::PadType pad_type;
+    ov::op::RoundingType rounding_type;
+    bool excludePad;
+    std::tie(pool_type, kernel, stride, pad_begin, pad_end, rounding_type, pad_type, excludePad) = pool_params;
+    init_input_shapes(shapes);
+
+    auto param = std::make_shared<ov::op::v0::Parameter>(model_type, inputDynamicShapes.front());
+
+    std::shared_ptr<ov::Node> pooling = ngraph::builder::makePooling(param,
+                                                                     stride,
+                                                                     pad_begin,
+                                                                     pad_end,
+                                                                     kernel,
+                                                                     rounding_type,
+                                                                     pad_type,
+                                                                     excludePad,
+                                                                     pool_type);
+
+    auto result = std::make_shared<ov::op::v0::Result>(pooling);
+    function = std::make_shared<ov::Model>(result, ov::ParameterVector{param}, "pooling");
+}
+
+
+std::string MaxPoolingV8LayerTest::getTestCaseName(const testing::TestParamInfo<maxPoolV8LayerTestParamsSet>& obj) {
+    maxPoolV8SpecificParams pool_params;
+    ov::element::Type model_type;
+    std::vector<InputShape> shapes;
+    std::string target_device;
+    std::tie(pool_params, model_type, shapes, target_device) = obj.param;
+    std::vector<size_t> kernel, stride, dilation;
+    std::vector<size_t> pad_begin, pad_end;
+    ov::op::PadType pad_type;
+    ov::op::RoundingType rounding_type;
+    ov::element::Type index_element_type;
+    int64_t axis;
+    std::tie(kernel, stride, dilation, pad_begin, pad_end, index_element_type, axis, rounding_type, pad_type) = pool_params;
+
+    std::ostringstream result;
+    result << "IS=(";
+    for (size_t i = 0lu; i < shapes.size(); i++) {
+        result << ov::test::utils::partialShape2str({shapes[i].first}) << (i < shapes.size() - 1lu ? "_" : "");
"_" : ""); + } + result << ")_TS="; + for (size_t i = 0lu; i < shapes.front().second.size(); i++) { + result << "{"; + for (size_t j = 0lu; j < shapes.size(); j++) { + result << ov::test::utils::vec2str(shapes[j].second[i]) << (j < shapes.size() - 1lu ? "_" : ""); + } + result << "}_"; + } + result << "K" << ov::test::utils::vec2str(kernel) << "_"; + result << "S" << ov::test::utils::vec2str(stride) << "_"; + result << "D" << ov::test::utils::vec2str(dilation) << "_"; + result << "PB" << ov::test::utils::vec2str(pad_begin) << "_"; + result << "PE" << ov::test::utils::vec2str(pad_end) << "_"; + result << "IET" << index_element_type << "_"; + result << "A" << axis << "_"; + result << "Rounding=" << rounding_type << "_"; + result << "AutoPad=" << pad_type << "_"; + result << "modelType=" << model_type.get_type_name() << "_"; + result << "trgDev=" << target_device; + return result.str(); +} + +void MaxPoolingV8LayerTest::SetUp() { + maxPoolV8SpecificParams pool_params; + ov::element::Type model_type; + std::vector shapes; + std::tie(pool_params, model_type, shapes, targetDevice) = this->GetParam(); + std::vector kernel, stride, dilation; + std::vector pad_begin, pad_end; + ov::op::PadType pad_type; + ov::op::RoundingType rounding_type; + ov::element::Type index_element_type; + int64_t axis; + std::tie(kernel, stride, dilation, pad_begin, pad_end, index_element_type, axis, rounding_type, pad_type) = pool_params; + init_input_shapes(shapes); + + auto param = std::make_shared(model_type, inputDynamicShapes.front()); + + std::shared_ptr max_pool = ngraph::builder::makeMaxPoolingV8(param, stride, dilation, pad_begin, pad_end, + kernel, rounding_type, pad_type, + index_element_type, axis); + + const auto max_pool_v8_second_output_is_supported = targetDevice == ov::test::utils::DEVICE_GPU; + + ov::ResultVector results; + if (max_pool_v8_second_output_is_supported) { + results = {std::make_shared(max_pool->output(0)), + std::make_shared(max_pool->output(1))}; + } else { + results = { std::make_shared(max_pool->output(0)) }; + } + function = std::make_shared(max_pool->outputs(), ov::ParameterVector{param}, "MaxPoolV8"); +} + +} // namespace test +} // namespace ov