diff --git a/.ci/azure/ci_utils/onnxruntime/skip_tests b/.ci/azure/ci_utils/onnxruntime/skip_tests new file mode 100644 index 00000000000000..475b79b4acac66 --- /dev/null +++ b/.ci/azure/ci_utils/onnxruntime/skip_tests @@ -0,0 +1,6 @@ +TransposeOpTest.NHWC2NCHW +TransposeOpTest.NCHW2NHWC +TransposeOpTest.TwoDim_int16 +GatherOpTest.Gather_axis1_indices2d_int16 +SoftmaxOperator.ThreeDimsAxis1 +SoftmaxOperator.ThreeDimsAxis0 diff --git a/.ci/azure/ci_utils/onnxruntime/version b/.ci/azure/ci_utils/onnxruntime/version new file mode 100644 index 00000000000000..3abd49542da1e3 --- /dev/null +++ b/.ci/azure/ci_utils/onnxruntime/version @@ -0,0 +1 @@ +rel-1.7.1 diff --git a/.ci/azure/linux_onnxruntime.yml b/.ci/azure/linux_onnxruntime.yml new file mode 100644 index 00000000000000..37173a4560fcf9 --- /dev/null +++ b/.ci/azure/linux_onnxruntime.yml @@ -0,0 +1,156 @@ +jobs: +- job: onnxruntime + timeoutInMinutes: 90 + + pool: + name: LIN_VMSS_VENV_ONNX_WU2 + + variables: + system.debug: true + VSTS_HTTP_RETRY: 5 + VSTS_HTTP_TIMEOUT: 200 + WORKERS_NUMBER: 8 + BUILD_TYPE: Release + REPO_DIR: $(Build.Repository.LocalPath) + ONNXRUNTIME_REPO_DIR: $(REPO_DIR)/../onnxruntime + WORK_DIR: $(Pipeline.Workspace)/_w + MODELS_DIR: /mount/cinfsshare/onnxtestdata + TMP_DIR: /mnt/tmp + INSTALL_DIR: $(WORK_DIR)/install_pkg + BUILD_DIR: $(WORK_DIR)/build + ONNXRUNTIME_UTILS: $(REPO_DIR)/.ci/azure/ci_utils/onnxruntime + ONNXRUNTIME_BUILD_DIR: $(ONNXRUNTIME_REPO_DIR)/build + steps: + - script: | + curl -H Metadata:true --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2019-06-01" + whoami + uname -a + echo Python3 info ; which python3 ; python3 --version + echo Python info ; which python ; python --version + echo Java info ; which java ; java -version + echo gcc info ; which gcc ; gcc --version + lsb_release + env + cat /proc/cpuinfo + cat /proc/meminfo + cat /etc/fstab + vmstat -s + df + lsblk -o NAME,HCTL,SIZE,MOUNTPOINT | grep -i "sd" + free -h + displayName: 'System info' + + - script: | + rm -rf $(WORK_DIR) ; mkdir $(WORK_DIR) + sudo rm -rf $(TMP_DIR) ; sudo mkdir $(TMP_DIR) ; sudo chmod 777 -R $(TMP_DIR) + sudo mkdir -p $(MODELS_DIR) + sudo apt --assume-yes install nfs-common + sudo mount -vvv -t nfs cinfsshare.file.core.windows.net:/cinfsshare/onnxtestdata $(MODELS_DIR) -o vers=4,minorversion=1,sec=sys + displayName: 'Make dirs' + + - checkout: self + clean: true + lfs: false + submodules: recursive + path: openvino + + - script: | + branch=`tr -s '\n ' < $(ONNXRUNTIME_UTILS)/version` + git clone --branch $branch --single-branch --recursive https://github.com/microsoft/onnxruntime.git $(ONNXRUNTIME_REPO_DIR) + displayName: 'Clone onnxruntime' + + - script: | + sudo apt --assume-yes install libusb-1.0-0-dev + python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/requirements.txt + # For running Python API tests + python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/src/requirements-dev.txt + # Speed up build + wget https://github.com/ninja-build/ninja/releases/download/v1.10.0/ninja-linux.zip + unzip ninja-linux.zip + sudo cp -v ninja /usr/local/bin/ + # Speed up tests + git clone https://github.com/google/gtest-parallel.git + workingDirectory: $(WORK_DIR) + displayName: 'Install dependencies' + + - task: CMake@1 + inputs: + # CMake must get Python 3.x version by default + cmakeArgs: > + -GNinja + -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) + -DENABLE_PYTHON=ON + -DPYTHON_EXECUTABLE=/usr/bin/python3.6 + -DENABLE_VPU=OFF + -DENABLE_GNA=OFF + -DENABLE_OPENCV=OFF + 
-DENABLE_CPPLINT=OFF + -DENABLE_TESTS=OFF + -DENABLE_MKL_DNN=ON + -DENABLE_CLDNN=OFF + -DENABLE_PROFILING_ITT=OFF + -DENABLE_SAMPLES=OFF + -DENABLE_SPEECH_DEMO=OFF + -DENABLE_PYTHON=ON + -DNGRAPH_ONNX_IMPORT_ENABLE=ON + -DNGRAPH_ONNX_EDITOR_ENABLE=ON + -DNGRAPH_INTERPRETER_ENABLE=ON + -DNGRAPH_DEBUG_ENABLE=OFF + -DNGRAPH_DYNAMIC_COMPONENTS_ENABLE=ON + $(REPO_DIR) + workingDirectory: $(BUILD_DIR) + + - script: ninja + workingDirectory: $(BUILD_DIR) + displayName: 'Build Lin' + + - script: ls -alR $(REPO_DIR)/bin/ + displayName: 'List files' + + - script: cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_DIR) -P cmake_install.cmake + workingDirectory: $(BUILD_DIR) + displayName: 'Install' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + echo "2021.2" > $(INSTALL_DIR)/deployment_tools/inference_engine/version.txt + ./build.sh --config RelWithDebInfo --use_openvino CPU_FP32 --build_shared_lib --parallel --skip_tests --build_dir $(ONNXRUNTIME_BUILD_DIR) + workingDirectory: $(ONNXRUNTIME_REPO_DIR) + displayName: 'Build ONNX Runtime' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + skip_tests=`tr -s '\n ' ':' < $(ONNXRUNTIME_UTILS)/skip_tests` + ./onnxruntime_test_all --gtest_filter=-$skip_tests + workingDirectory: $(ONNXRUNTIME_BUILD_DIR)/RelWithDebInfo + displayName: 'Run onnxruntime_test_all' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + ./onnxruntime_shared_lib_test + workingDirectory: $(ONNXRUNTIME_BUILD_DIR)/RelWithDebInfo + displayName: 'Run onnxruntime_shared_lib_test' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + ./onnxruntime_global_thread_pools_test + workingDirectory: $(ONNXRUNTIME_BUILD_DIR)/RelWithDebInfo + displayName: 'Run onnxruntime_global_thread_pools_test' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + ./onnxruntime_api_tests_without_env + workingDirectory: $(ONNXRUNTIME_BUILD_DIR)/RelWithDebInfo + displayName: 'Run onnxruntime_api_tests_without_env' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + ./onnx_test_runner "$(ONNXRUNTIME_REPO_DIR)/cmake/external/onnx/onnx/backend/test/data/pytorch-converted" + workingDirectory: $(ONNXRUNTIME_BUILD_DIR)/RelWithDebInfo + displayName: 'Run pytorch-converted tests' + + - script: | + source $(INSTALL_DIR)/bin/setupvars.sh + ./onnx_test_runner "$(ONNXRUNTIME_REPO_DIR)/cmake/external/onnx/onnx/backend/test/data/pytorch-operator" + workingDirectory: $(ONNXRUNTIME_BUILD_DIR)/RelWithDebInfo + displayName: 'Run pytorch-operator tests' diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index e344f1d8243032..c805aff037c8aa 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -36,7 +36,7 @@ jobs: SETUPVARS: $(INSTALL_DIR)\bin\setupvars.bat IB_DIR: C:\Program Files (x86)\IncrediBuild IB_TESTCONSOLE: $(IB_DIR)\IBTestConsole.exe - TEST_ENV_PATH: $(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.1\opencv\bin;$(IB_DIR);%PATH% + TEST_ENV_PATH: $(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.2\opencv\bin;$(IB_DIR);%PATH% steps: - script: | diff --git a/.ci/azure/windows_conditional_compilation.yml b/.ci/azure/windows_conditional_compilation.yml index e627030a36223e..7352f9c758c79e 100644 --- a/.ci/azure/windows_conditional_compilation.yml +++ b/.ci/azure/windows_conditional_compilation.yml @@ -24,7 +24,7 @@ jobs: SETUPVARS: $(INSTALL_DIR)\bin\setupvars.bat IB_DIR: C:\Program Files (x86)\IncrediBuild IB_TESTCONSOLE: $(IB_DIR)\IBTestConsole.exe - TEST_ENV_PATH: 
$(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.1\opencv\bin;$(IB_DIR);%PATH% + TEST_ENV_PATH: $(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.2\opencv\bin;$(IB_DIR);%PATH% steps: - script: | diff --git a/.ci/openvino-onnx/Dockerfile b/.ci/openvino-onnx/Dockerfile index 6879c49182a8fb..ec78869b6d6585 100644 --- a/.ci/openvino-onnx/Dockerfile +++ b/.ci/openvino-onnx/Dockerfile @@ -1,6 +1,10 @@ FROM ubuntu:20.04 -LABEL version=2020.07.09.1 +LABEL version=2021.03.30.1 + +# Build configuration arguments +ARG BUILD_TYPE=Release +ARG PROTOBUF_LITE=OFF ARG http_proxy ARG https_proxy @@ -10,7 +14,6 @@ ENV https_proxy ${https_proxy} ENV CI=true ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED 1 -ARG PROTOBUF_LITE=OFF # Install base dependencies RUN apt-get update && apt-get install -y locales && apt-get clean autoclean && apt-get autoremove -y @@ -52,7 +55,7 @@ RUN apt-get update && apt-get -y --no-install-recommends install \ COPY . /openvino/ WORKDIR /openvino/build RUN cmake .. \ - -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DENABLE_VPU=OFF \ -DENABLE_GNA=OFF \ -DENABLE_OPENCV=OFF \ @@ -66,6 +69,7 @@ RUN cmake .. \ -DENABLE_PYTHON=ON \ -DPYTHON_EXECUTABLE=/usr/bin/python3 \ -DNGRAPH_ONNX_IMPORT_ENABLE=ON \ + -DNGRAPH_ONNX_EDITOR_ENABLE=ON \ -DNGRAPH_INTERPRETER_ENABLE=ON \ -DNGRAPH_DEBUG_ENABLE=OFF \ -DNGRAPH_DYNAMIC_COMPONENTS_ENABLE=ON \ @@ -75,7 +79,7 @@ RUN make -j $(nproc) install # Run tests via tox WORKDIR /openvino/ngraph/python -ENV NGRAPH_CPP_BUILD_PATH=/openvino/dist/deployment_tools/ngraph +ENV ngraph_DIR=/openvino/dist/deployment_tools/ngraph ENV LD_LIBRARY_PATH=/openvino/dist/deployment_tools/ngraph/lib -ENV PYTHONPATH=/openvino/bin/intel64/Release/lib/python_api/python3.8:${PYTHONPATH} +ENV PYTHONPATH=/openvino/bin/intel64/${BUILD_TYPE}/lib/python_api/python3.8:${PYTHONPATH} CMD tox diff --git a/.ci/openvino-onnx/Jenkinsfile b/.ci/openvino-onnx/Jenkinsfile index de359f0b21d026..f157392d471892 100644 --- a/.ci/openvino-onnx/Jenkinsfile +++ b/.ci/openvino-onnx/Jenkinsfile @@ -1,12 +1,13 @@ // Copyright (C) 2018-2020 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -DOCKER_CONTAINER_NAME= "openvino-onnx-ci-container" -DOCKER_IMAGE_TAG = "openvino-onnx-ci-image" +DOCKER_CONTAINER_PREFIX= "openvino-onnx-ci-container" +DOCKER_IMAGE_PREFIX= "openvino-onnx-ci-image" BACKEND_CONFIGURATIONS = [ - [ protobuf_lite : "ON" ], - [ protobuf_lite : "OFF" ] + [ name: "Release", build_type: "Release", protobuf_lite : "OFF" ], + [ name: "Debug", build_type: "Debug", protobuf_lite : "OFF" ], + [ name: "Rel_Lite", build_type: "Release", protobuf_lite : "ON" ], ] // workaround for aborting previous builds on PR update @@ -77,7 +78,7 @@ def gitSubmoduleUpdate(String repository_name) { git submodule init && git submodule update \ --init \ --no-fetch \ - --recursive + --recursive """ } } @@ -89,35 +90,59 @@ def prepare_repository() { } } +def get_commit_id(){ + dir("${WORKDIR}/PROJECT_NAME") { + GIT_COMMIT_ID = sh(returnStdout: true, script: "git log -n 1 --pretty=format:'%h'").trim() + } +} + def updateModels() { sh """ ./ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d ${HOME}/ONNX_CI/data -o """ } -def buildDockerImage(String protobuf_lite="OFF") { +def buildDockerImage(Map configuration) { updateModels() + get_commit_id() + DOCKER_IMAGE_TAG="${DOCKER_IMAGE_PREFIX}_${GIT_COMMIT_ID}_${configuration.name}".toLowerCase() sh """ - docker build --tag=${DOCKER_IMAGE_TAG} --build-arg 
PROTOBUF_LITE=${protobuf_lite} \ + docker build --tag=${DOCKER_IMAGE_TAG} \ + --build-arg BUILD_TYPE=${configuration.build_type} \ + --build-arg PROTOBUF_LITE=${configuration.protobuf_lite} \ --file=.ci/openvino-onnx/Dockerfile \ --build-arg http_proxy=http://proxy-chain.intel.com:911/ \ --build-arg https_proxy=http://proxy-chain.intel.com:912/ . """ } -def runTests() { - sh """ - docker run --name ${DOCKER_CONTAINER_NAME} \ - --volume ${HOME}/ONNX_CI/data/model_zoo:/root/.onnx/model_zoo \ - ${DOCKER_IMAGE_TAG} - """ + +def runTests(Map configuration) { + get_commit_id() + DOCKER_IMAGE_TAG="${DOCKER_IMAGE_PREFIX}_${GIT_COMMIT_ID}_${configuration.name}".toLowerCase() + DOCKER_CONTAINER_NAME="${DOCKER_CONTAINER_PREFIX}_${GIT_COMMIT_ID}_${configuration.name}" + // Run only basic unit tests in Debug configuration + if (configuration.build_type == "Debug") { + sh """ + docker run --name ${DOCKER_CONTAINER_NAME} ${DOCKER_IMAGE_TAG} + """ + } + + // Run unit-tests AND large model tests by default + else { + sh """ + docker run --name ${DOCKER_CONTAINER_NAME} \ + --volume ${HOME}/ONNX_CI/data/model_zoo:/root/.onnx/model_zoo \ + ${DOCKER_IMAGE_TAG} /bin/bash -c "tox && tox -e zoo_models" + """ + } } def getConfigurationsMap() { def configurationsMap = [:] for (backend in BACKEND_CONFIGURATIONS) { def configuration = backend.clone() - configuration.name = "protobuf-lite ${configuration.protobuf_lite}" + configuration.name = "${configuration.name}" configurationsMap[configuration.name] = { stage(configuration.name) { CONFIGURATION_WORKFLOW(configuration) } } @@ -143,12 +168,12 @@ CONFIGURATION_WORKFLOW = { configuration -> } stage("Prepare Docker environment") { dir("${WORKDIR}") { - buildDockerImage(configuration.protobuf_lite) + buildDockerImage(configuration) } } stage("Run tests") { timeout(time: 20, unit: 'MINUTES') { - runTests() + runTests(configuration) } } } @@ -165,9 +190,11 @@ CONFIGURATION_WORKFLOW = { configuration -> finally { stage("Cleanup") { deleteDir() + get_commit_id() + DOCKER_CONTAINER_NAME="${DOCKER_CONTAINER_PREFIX}_${GIT_COMMIT_ID}_${configuration.name}" sh """ - docker image prune -f docker rm -f ${DOCKER_CONTAINER_NAME} + docker image prune -f """ } } diff --git a/.github/workflows/mo.yml b/.github/workflows/mo.yml index 7dcb851466aae3..eb2ea91484e7ca 100644 --- a/.github/workflows/mo.yml +++ b/.github/workflows/mo.yml @@ -80,12 +80,22 @@ jobs: python3 setup.py sdist bdist_wheel working-directory: model-optimizer - - name: Test + - name: Test package content + run: | + echo "src = open('openvino_mo.egg-info/SOURCES.txt', 'rt').read().split()" | tee -a test_wheel.py + echo "ref = open('automation/package_BOM.txt', 'rt').read().split()" | tee -a test_wheel.py + echo "for name in ref:" | tee -a test_wheel.py + echo " if name.endswith('.py'):" | tee -a test_wheel.py + echo " assert name in src or './' + name in src, name + ' file missed'" | tee -a test_wheel.py + python3 test_wheel.py + working-directory: model-optimizer + + - name: Test conversion run: | wget -q http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz tar -xf mobilenet_v1_1.0_224.tgz python3 -m pip install model-optimizer/dist/*.whl - python3 -c "import sys, subprocess, mo_tf; subprocess.run([sys.executable, mo_tf.__file__, '--input_model', 'mobilenet_v1_1.0_224_frozen.pb', '--input_shape', '[1,224,224,3]'], check=True)" + python3 -m mo --input_model mobilenet_v1_1.0_224_frozen.pb --input_shape "[1,224,224,3]" - uses: actions/upload-artifact@v2 with: diff --git a/README.md 
b/README.md index b03a32b256b39b..7d54e9e8f9cd33 100644 --- a/README.md +++ b/README.md @@ -45,5 +45,4 @@ Please report questions, issues and suggestions using: [Open Model Zoo]:https://github.com/opencv/open_model_zoo [Inference Engine]:https://software.intel.com/en-us/articles/OpenVINO-InferEngine [Model Optimizer]:https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer -[tag on StackOverflow]:https://stackoverflow.com/search?q=%23openvino [nGraph]:https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_DevGuide.html diff --git a/cmake/developer_package/clang_format/clang_format.cmake b/cmake/developer_package/clang_format/clang_format.cmake index 1484e7038d789a..78114abeaf80aa 100644 --- a/cmake/developer_package/clang_format/clang_format.cmake +++ b/cmake/developer_package/clang_format/clang_format.cmake @@ -17,6 +17,9 @@ if (ENABLE_CLANG_FORMAT) set(ENABLE_CLANG_FORMAT OFF) endif() endif() + else() + message(WARNING "Supported clang-format version is not found!") + set(ENABLE_CLANG_FORMAT OFF) endif() endif() diff --git a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake index f7fe488366bc6a..610568e8bb1dd2 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake @@ -10,7 +10,7 @@ # XARCH_FUNC_NAME -- name of function to dispatch # XARCH_NAMESPACES -- full namespace used to keep ODR # XARCH_DISP_FILE -- dispatcher file name to generate -# XARCH_SET -- set of ARCH supported by dispatcher. space delimited +# XARCH_SET -- set of ARCH supported by dispatcher. semicolon-delimited # # ================================================================= @@ -24,7 +24,6 @@ function(_generate_dispatcher) _find_signature_in_file(${XARCH_API_HEADER} ${XARCH_FUNC_NAME} SIGNATURE) _generate_call_line_from_signature("${SIGNATURE}" CALL_LINE) - string(REPLACE " " ";" XARCH_SET "${XARCH_SET}") string(REPLACE "::" ";" XARCH_NAMESPACES "${XARCH_NAMESPACES}") list(GET XARCH_NAMESPACES -1 XARCH_CURRENT_NAMESPACE) diff --git a/cmake/developer_package/cross_compile/cross_compiled_func.cmake b/cmake/developer_package/cross_compile/cross_compiled_func.cmake index ed969a3869f29c..7d83b4dbd4a2d3 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_func.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_func.cmake @@ -117,17 +117,21 @@ function(_clone_source_to_target TARGET SOURCE ARCH_SET) ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE} ${CMAKE_CURRENT_BINARY_DIR}/${ARCH_SOURCE} DEPENDS ${SOURCE} + VERBATIM ) set(_ARCH_SPECIFIC_FLAGS ${_DEFINE_${_arch}} ${_FLAGS_${_arch}} "-DXARCH=${_arch}" ## to replace XARCH with direct ARCH name - "-I${CMAKE_CURRENT_SOURCE_DIR}/${ARCH_INCLUDE_DIR}" ## To make valid #include "some.hpp" ) _add_source_compile_flags(${ARCH_SOURCE} ${_ARCH_SPECIFIC_FLAGS}) + ## To make `#include "some.hpp"` valid + set_property(SOURCE ${ARCH_SOURCE} APPEND PROPERTY INCLUDE_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/${ARCH_INCLUDE_DIR}") + list(APPEND _ARCH_SOURCES ${ARCH_SOURCE}) endforeach() @@ -146,25 +150,26 @@ function(_add_dispatcher_to_target TARGET HEADER FUNC_NAME NAMESPACE ARCH_SET) set(DISPATCHER_SOURCE "cross-compiled/${DISPATCHER_NAME}_disp.cpp") set(DISPATCHER_OPT_HOLDER "cross-compiled/${DISPATCHER_NAME}_holder.txt") - set(_GEN_ARGS_LIST - -DXARCH_FUNC_NAME="${X_NAME}" - -DXARCH_NAMESPACES="${NAMESPACE}" - -DXARCH_API_HEADER="${CMAKE_CURRENT_SOURCE_DIR}/${HEADER}" - 
-DXARCH_DISP_FILE="${CMAKE_CURRENT_BINARY_DIR}/${DISPATCHER_SOURCE}" - -DXARCH_SET="${ARCH_SET}" - ) configure_file(${DISPATCHER_GEN_OPTIONS_HOLDER} ${DISPATCHER_OPT_HOLDER}) add_custom_command( OUTPUT ${DISPATCHER_SOURCE} - COMMAND ${CMAKE_COMMAND} ${_GEN_ARGS_LIST} + COMMAND ${CMAKE_COMMAND} + -D "XARCH_FUNC_NAME=${X_NAME}" + -D "XARCH_NAMESPACES=${NAMESPACE}" + -D "XARCH_API_HEADER=${CMAKE_CURRENT_SOURCE_DIR}/${HEADER}" + -D "XARCH_DISP_FILE=${CMAKE_CURRENT_BINARY_DIR}/${DISPATCHER_SOURCE}" + -D "XARCH_SET=${ARCH_SET}" -P ${DISPATCHER_GEN_SCRIPT} DEPENDS ${HEADER} ${DISPATCHER_GEN_SCRIPT} ${CMAKE_CURRENT_BINARY_DIR}/${DISPATCHER_OPT_HOLDER} ## Just to make run dependency on args value + VERBATIM ) - _add_source_compile_flags(${DISPATCHER_SOURCE} "-I${DISPATCHER_INCLUDE_DIR}") + set_property(SOURCE ${DISPATCHER_SOURCE} APPEND PROPERTY INCLUDE_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/${DISPATCHER_INCLUDE_DIR}") + _add_source_to_target(${TARGET} ${DISPATCHER_SOURCE}) endfunction() diff --git a/cmake/toolchains/ia32.linux.toolchain.cmake b/cmake/toolchains/ia32.linux.toolchain.cmake index 675b0b3c4613ad..3c821a04f0b33f 100644 --- a/cmake/toolchains/ia32.linux.toolchain.cmake +++ b/cmake/toolchains/ia32.linux.toolchain.cmake @@ -22,3 +22,4 @@ _set_if_not_defined(ENABLE_VPU OFF) # fix conversion from uint64_t / int64_t to size_t _set_if_not_defined(NGRAPH_ONNX_IMPORT_ENABLE OFF) +_set_if_not_defined(NGRAPH_ONNX_EDITOR_ENABLE OFF) diff --git a/docs/IE_DG/Samples_Overview.md b/docs/IE_DG/Samples_Overview.md index 1eeedca35b9f52..b59d5a576ae588 100644 --- a/docs/IE_DG/Samples_Overview.md +++ b/docs/IE_DG/Samples_Overview.md @@ -15,21 +15,25 @@ Inference Engine sample applications include the following: - **Hello Classification Sample** – Inference of image classification networks like AlexNet and GoogLeNet using Synchronous Inference Request API. Input of any size and layout can be set to an infer request which will be pre-processed automatically during inference (the sample supports only images as inputs and supports Unicode paths). - [Hello Classification C++ Sample](../../inference-engine/samples/hello_classification/README.md) - [Hello Classification C Sample](../../inference-engine/ie_bridges/c/samples/hello_classification/README.md) + - [Hello Classification Python Sample](../../inference-engine/ie_bridges/python/sample/hello_classification/README.md) - **Hello NV12 Input Classification Sample** – Input of any size and layout can be provided to an infer request. The sample transforms the input to the NV12 color format and pre-process it automatically during inference. The sample supports only images as inputs. - [Hello NV12 Input Classification C++ Sample](../../inference-engine/samples/hello_nv12_input_classification/README.md) - [Hello NV12 Input Classification C Sample](../../inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/README.md) - **Hello Query Device Sample** – Query of available Inference Engine devices and their metrics, configuration values. - [Hello Query Device C++ Sample](../../inference-engine/samples/hello_query_device/README.md) - [Hello Query Device Python* Sample](../../inference-engine/ie_bridges/python/sample/hello_query_device/README.md) -- **[Hello Reshape SSD C++ Sample**](../../inference-engine/samples/hello_reshape_ssd/README.md)** – Inference of SSD networks resized by ShapeInfer API according to an input size. +- **Hello Reshape SSD Sample** – Inference of SSD networks resized by ShapeInfer API according to an input size. 
+ - [Hello Reshape SSD C++ Sample](../../inference-engine/samples/hello_reshape_ssd/README.md) + - [Hello Reshape SSD Python Sample](../../inference-engine/ie_bridges/python/sample/hello_reshape_ssd/README.md) - **Image Classification Sample Async** – Inference of image classification networks like AlexNet and GoogLeNet using Asynchronous Inference Request API (the sample supports only images as inputs). - [Image Classification C++ Sample Async](../../inference-engine/samples/classification_sample_async/README.md) - [Image Classification Python* Sample Async](../../inference-engine/ie_bridges/python/sample/classification_sample_async/README.md) -- **[Image Classification Python* Sample](../../inference-engine/ie_bridges/python/sample/hello_classification/README.md)** – Inference of image classification networks like AlexNet and GoogLeNet using Synchronous Inference Request API (the sample supports only images as inputs). - **Neural Style Transfer Sample** – Style Transfer sample (the sample supports only images as inputs). - [Neural Style Transfer C++ Sample](../../inference-engine/samples/style_transfer_sample/README.md) - [Neural Style Transfer Python* Sample](../../inference-engine/ie_bridges/python/sample/style_transfer_sample/README.md) -- **[nGraph Function Creation C++ Sample](../../inference-engine/samples/ngraph_function_creation_sample/README.md)** – Construction of the LeNet network using the nGraph function creation sample. +- **nGraph Function Creation Sample** – Construction of the LeNet network using the nGraph function creation sample. + - [nGraph Function Creation C++ Sample](../../inference-engine/samples/ngraph_function_creation_sample/README.md) + - [nGraph Function Creation Python Sample](../../inference-engine/ie_bridges/python/sample/ngraph_function_creation_sample/README.md) - **Object Detection for SSD Sample** – Inference of object detection networks based on the SSD, this sample is simplified version that supports only images as inputs. - [Object Detection for SSD C++ Sample](../../inference-engine/samples/object_detection_sample_ssd/README.md) - [Object Detection for SSD C Sample](../../inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/README.md) @@ -39,7 +43,7 @@ Inference Engine sample applications include the following: ## Media Files Available for Samples -To run the sample applications, you can use images and videos from the media files collection available at https://github.com/intel-iot-devkit/sample-videos. +To run the sample applications, you can use images and videos from the media files collection available at https://storage.openvinotoolkit.org/data/test_data. ## Samples that Support Pre-Trained Models diff --git a/docs/IE_DG/supported_plugins/GNA.md b/docs/IE_DG/supported_plugins/GNA.md index 82e168997056d2..f47297571840a4 100644 --- a/docs/IE_DG/supported_plugins/GNA.md +++ b/docs/IE_DG/supported_plugins/GNA.md @@ -69,7 +69,7 @@ Limitations include: - Only 1D convolutions are natively supported. - The number of output channels for convolutions must be a multiple of 4. - Permute layer support is limited to the cases where no data reordering is needed or when reordering is happening for two dimensions, at least one of which is not greater than 8. -- Concatenations and splitting are supported only along the channel dimension (axis=1). +- Splits and concatenations are supported for continuous portions of memory (e.g., split of 1,2,3,4 to 1,1,3,4 and 1,1,3,4 or concats of 1,2,3,4 and 1,2,3,5 to 2,2,3,4).
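The phrase "continuous portions of memory" in the limitation above is easiest to see on a concrete layout. The snippet below is only an illustration of row-major memory contiguity using numpy (it is not GNA plugin code and does not claim to reproduce the plugin's exact rule): the `1,2,3,4 -> 1,1,3,4 + 1,1,3,4` split from the example yields contiguous pieces, while a split along the innermost axis does not.

```python
import numpy as np

# Row-major (C-order) tensor with the shape used in the example above.
x = np.arange(1 * 2 * 3 * 4, dtype=np.float32).reshape(1, 2, 3, 4)

# Split 1,2,3,4 into two pieces of 1,1,3,4 (the case from the limitation text):
# each piece covers one uninterrupted run of the underlying buffer.
a, b = np.split(x, 2, axis=1)
print(a.shape, a.flags['C_CONTIGUOUS'], b.flags['C_CONTIGUOUS'])  # (1, 1, 3, 4) True True

# Splitting along the innermost axis produces strided, non-contiguous views.
c, d = np.split(x, 2, axis=3)
print(c.shape, c.flags['C_CONTIGUOUS'], d.flags['C_CONTIGUOUS'])  # (1, 2, 3, 2) False False
```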
#### Experimental Support for 2D Convolutions diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_WideAndDeep_Family_Models.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_WideAndDeep_Family_Models.md index 7e28a7ac0533e3..84821d6b41c87c 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_WideAndDeep_Family_Models.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_WideAndDeep_Family_Models.md @@ -15,7 +15,7 @@ through Tensorflow* tf.feature_column API. Table below presents what feature typ **Step 1**. Clone the GitHub repository with TensorFlow models and move to the directory with an example of Wide and Deep model: ```sh -git clone https://github.com/tensorflow/models.git; +git clone https://github.com/tensorflow/models.git --branch r2.2.0; cd official/r1/wide_deep ``` diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml index 1049d4d328372f..a6f5dd3250c818 100644 --- a/docs/doxygen/ie_docs.xml +++ b/docs/doxygen/ie_docs.xml @@ -234,7 +234,7 @@ limitations under the License. - + diff --git a/docs/how_tos/POT_how_to_example.md b/docs/how_tos/POT_how_to_example.md new file mode 100644 index 00000000000000..571269a92ff437 --- /dev/null +++ b/docs/how_tos/POT_how_to_example.md @@ -0,0 +1,163 @@ +# Post-Training Optimization Tool - A real example + +This tutorial describes the example from the following YouTube* video: + +http://XXXXX + +Watch this video to learn the basics of Post-training Optimization Tool (POT): + https://www.youtube.com/watch?v=SvkI25Ca_SQ + +The example has been tested on OpenVINO™ 2021 on Ubuntu 18.04 Operating System. + + +## 1. Installation + +Install OpenVINO™ toolkit and Model Optimizer, Accuracy Checker, and Post-training Optimization Tool components. + +1. Define the OpenVINO™ install directory: +``` +export OV=/opt/intel/openvino_2021/ +``` +2. Install the Model Optimizer prerequisites: +``` +cd $OV/deployment_tools/model_optimizer/install_prerequisites +sudo ./install_prerequisites.sh +``` +3. Install the Accuracy Checker requirements: +``` +cd $OV/deployment_tools/open_model_zoo/tools/accuracy_checker +sudo python3 setup.py install +``` +4. Install the Post-training Optimization Tool: +``` +cd $OV/deployment_tools/tools/post_training_optimization_toolkit +sudo python3 setup.py install +``` + +## 2. Download Model + +This tutorial describes MobileNet v2 model from PyTorch* framework. You can choose any other model. + +Download the MobileNet v2 PyTorch* model using the commands below: +``` +mkdir ~/POT +``` +``` +cd ~/POT +``` +``` +python3 $OV/deployment_tools/tools/model_downloader/downloader.py --name mobilenet-v2-pytorch -o . +``` + +## 3. Prepare Model for Inference + +Install requirements for PyTorch using the commands below: +``` +cd $OV/deployment_tools/open_model_zoo/tools/downloader +``` +``` +python3 -mpip install --user -r ./requirements-pytorch.in +``` + +You can find the parameters for Mobilnet v2 conversion here: +``` +vi /opt/intel/openvino_2021/deployment_tools/open_model_zoo/models/public/mobilenet-v2-pytorch/model.yml +``` + +Convert the model from PyTorch to ONNX*: +``` +cd ~/POT/public/mobilenet-v2-pytorch +python3 /opt/intel/openvino_2021/deployment_tools/open_model_zoo/tools/downloader/pytorch_to_onnx.py \ + --model-name=MobileNetV2 \ + --model-path=. 
\ + --weights=mobilenet-v2.pth \ + --import-module=MobileNetV2 \ + --input-shape=1,3,224,224 \ + --output-file=mobilenet-v2.onnx \ + --input-names=data \ + --output-names=prob + +``` +Convert the model from ONNX to the OpenVINO™ Intermediate Representation (IR): +``` +mo_onnx.py \ + -m mobilenet-v2.onnx \ + --input=data \ + --mean_values=data[123.675,116.28,103.53] \ + --scale_values=data[58.624,57.12,57.375] \ + --reverse_input_channels \ + --output=prob +``` + +Move the IR files to your `~/POT` directory: + +``` +mv mobilenet-v2.xml ~/POT/model.xml +mv mobilenet-v2.bin ~/POT/model.bin +``` + +## 4. Edit Configurations + +Edit the configuration files: +``` +sudo vi $OV/deployment_tools/open_model_zoo/tools/accuracy_checker/dataset_definitions.yml +(edit imagenet_1000_classes) +``` +``` +export DEFINITIONS_FILE=/opt/intel/openvino_2021/deployment_tools/open_model_zoo/tools/accuracy_checker/dataset_definitions.yml +``` + +Copy the JSON file to your `~/POT` directory and edit it: + +``` +cp $OV/deployment_tools/tools/post_training_optimization_toolkit/configs/examples/quantization/classification/mobilenetV2_pytorch_int8.json ~/POT +``` +``` +vi mobilenetV2_pytorch_int8.json +``` + +Copy the YML file to your `~/POT` directory and edit it: + +``` +cp /opt/intel/openvino_2021/deployment_tools/open_model_zoo/tools/accuracy_checker/configs/mobilenet-v2.yml ~/POT +``` +``` +vi mobilenet-v2.yml +``` + +## 5. Run Baseline + +Run Accuracy Checker on the original model: + +``` +accuracy_check -c mobilenet-v2.yml +``` + +Install the Benchmark Tool first. To learn more about the Benchmark Tool, refer to [Benchmark C++ Tool](https://docs.openvinotoolkit.org/latest/openvino_inference_engine_samples_benchmark_app_README.html) + or [Benchmark Python* Tool](https://docs.openvinotoolkit.org/latest/openvino_inference_engine_tools_benchmark_tool_README.html). + +Run the performance benchmark: +``` +~/inference_engine_cpp_samples_build/intel64/Release/benchmark_app -m ~/POT/model.xml +``` + +## 6. Run Integer Calibration + +You can edit the JSON file to switch between two modes of calibration: + + - AccuracyAwareQuantization + - DefaultQuantization + + +``` +pot --config ~/POT/mobilenetV2_pytorch_int8.json \ + --output-dir ~/POT/ \ + --evaluate \ + --log-level INFO +``` + +Run the Benchmark Tool for the calibrated model. Make sure the name contains `DefaultQuantization/.../optimized/...` + +``` +~/inference_engine_cpp_samples_build/intel64/Release/benchmark_app -m mobilenetv2_DefaultQuantization/2021-03-07/optimized/mobilenetv2.xml +``` diff --git a/docs/install_guides/pypi-openvino-dev.md b/docs/install_guides/pypi-openvino-dev.md index 9504475f6b6466..3da7e3c1088fd8 100644 --- a/docs/install_guides/pypi-openvino-dev.md +++ b/docs/install_guides/pypi-openvino-dev.md @@ -51,7 +51,11 @@ python -m pip install --user virtualenv python -m venv openvino_env --system-site-packages ``` -Activate virtual environment:
+> **NOTE**: On Linux and macOS, you may need to type `python3` instead of +`python`. You may also need to [install pip](https://pip.pypa.io/en/stable/installing/). + +### Step 2. Activate Virtual Environment + On Linux and macOS: ```sh source openvino_env/bin/activate @@ -61,14 +65,14 @@ On Windows: openvino_env\Scripts\activate ``` -### Step 2. Set Up and Update pip to the Highest Version +### Step 3. Set Up and Update pip to the Highest Version Run the command below: ```sh python -m pip install --upgrade pip ``` -### Step 3. Install the Package +### Step 4. Install the Package Run the command below:
@@ -76,13 +80,13 @@ Run the command below:
pip install openvino-dev ``` -### Step 4. Verify that the Package is Installed +### Step 5. Verify that the Package is Installed -Run the command below: +Run the command below (this may take a few seconds): ```sh -python -c "pot -h" +pot -h ``` - + You will see the help message for Post-Training Optimization Tool if installation finished successfully. ## Additional Resources @@ -90,4 +94,3 @@ You will see the help message for Post-Training Optimization Tool if installatio - Intel® Distribution of OpenVINO™ toolkit home page: [https://software.intel.com/en-us/openvino-toolkit](https://software.intel.com/en-us/openvino-toolkit) - OpenVINO™ toolkit online documentation: [https://docs.openvinotoolkit.org](https://docs.openvinotoolkit.org) - diff --git a/docs/install_guides/pypi-openvino-rt.md b/docs/install_guides/pypi-openvino-rt.md index 1f44344b3e95f6..9b825c8651873a 100644 --- a/docs/install_guides/pypi-openvino-rt.md +++ b/docs/install_guides/pypi-openvino-rt.md @@ -48,7 +48,11 @@ python -m pip install --user virtualenv python -m venv openvino_env --system-site-packages ``` -Activate virtual environment:
+> **NOTE**: On Linux and macOS, you may need to type `python3` instead of +`python`. You may also need to [install pip](https://pip.pypa.io/en/stable/installing/). + +### Step 2. Activate Virtual Environment + On Linux and macOS: ```sh source openvino_env/bin/activate @@ -58,14 +62,14 @@ On Windows: openvino_env\Scripts\activate ``` -### Step 2. Set Up and Update pip to the Highest Version +### Step 3. Set Up and Update pip to the Highest Version Run the command below: ```sh python -m pip install --upgrade pip ``` -### Step 3. Install the Package +### Step 4. Install the Package Run the command below:
@@ -73,7 +77,7 @@ Run the command below:
pip install openvino ``` -### Step 4. Verify that the Package is Installed +### Step 5. Verify that the Package is Installed Run the command below: ```sh diff --git a/docs/ops/activation/Elu_1.md b/docs/ops/activation/Elu_1.md index 8d5d424d02312b..60a093c506fff6 100644 --- a/docs/ops/activation/Elu_1.md +++ b/docs/ops/activation/Elu_1.md @@ -8,29 +8,58 @@ **Detailed Description** -For each element from the input tensor calculates corresponding -element in the output tensor with the following formula: +*Elu* operation is introduced in this [article](https://arxiv.org/abs/1511.07289v3). +It performs element-wise activation function on a given input tensor, based on the following mathematical formula: + \f[ -elu(x) = \left\{\begin{array}{ll} - alpha(e^{x} - 1) \quad \mbox{if } x < 0 \\ - x \quad \mbox{if } x \geq 0 +Elu(x) = \left\{\begin{array}{r} + x \qquad \mbox{if } x > 0 \\ + \alpha(e^{x} - 1) \quad \mbox{if } x \leq 0 \end{array}\right. \f] +where α corresponds to *alpha* attribute. + +*Elu* is equivalent to *ReLU* operation when *alpha* is equal to zero. + **Attributes** * *alpha* * **Description**: scale for the negative factor - * **Range of values**: arbitrary floating point number - * **Type**: float + * **Range of values**: non-negative arbitrary floating-point number + * **Type**: `float` * **Default value**: none * **Required**: *yes* **Inputs**: -* **1**: Input tensor x of any floating point type. Required. +* **1**: A tensor of type `T` and arbitrary shape. **Required**. **Outputs**: -* **1**: Result of Elu function applied to the input tensor *x*. Floating point tensor with shape and type matching the input tensor. +* **1**: The result of element-wise *Elu* function applied to the input tensor. A tensor of type `T` and the same shape as input tensor. + +**Types** + +* *T*: arbitrary supported floating-point type. + +**Example** + +```xml + + + + + 1 + 128 + + + + + 1 + 128 + + + +``` diff --git a/docs/ops/activation/Exp_1.md b/docs/ops/activation/Exp_1.md index b5815a271603cb..fad5c24c3bd1bd 100644 --- a/docs/ops/activation/Exp_1.md +++ b/docs/ops/activation/Exp_1.md @@ -6,12 +6,43 @@ **Short description**: Exponential element-wise activation function. -**Attributes**: has no attributes +**Detailed description** -**Inputs**: +*Exp* performs element-wise exponential activation function on a given input tensor. The mathematical formula is as follows: -* **1**: Input tensor x of any floating point type. Required. +\f[ +exp(x) = e^{x} +\f] -**Outputs**: +**Attributes**: *Exp* operation has no attributes. -* **1**: Result of Exp function applied to the input tensor *x*. Floating point tensor with shape and type matching the input tensor. +**Inputs** + +* **1**: A tensor of type `T` and arbitrary shape. **Required**. + +**Outputs** + +* **1**: The result of element-wise *Exp* function applied to the input tensor. A tensor of type `T` and the same shape as input tensor. + +**Types** + +* *T*: arbitrary supported floating-point type. 
+ +**Example** + +```xml + + + + 1 + 256 + + + + + 1 + 256 + + + +``` \ No newline at end of file diff --git a/docs/ops/activation/GELU_2.md b/docs/ops/activation/GELU_2.md index 461defb02c9e56..9f48eba8791c06 100644 --- a/docs/ops/activation/GELU_2.md +++ b/docs/ops/activation/GELU_2.md @@ -2,35 +2,40 @@ **Versioned name**: *Gelu-2* -**Category**: *Activation* +**Category**: *Activation function* -**Short description**: [Reference](https://pytorch.org/docs/stable/nn.functional.html#gelu) +**Short description**: Gaussian error linear unit element-wise activation function. -**Detailed description**: [Reference](https://arxiv.org/abs/1606.08415) +**Detailed description** -**Attributes**: *Gelu* operation has no attributes. - -**Mathematical Formulation** -Gelu(x)=x*Φ(x), where Φ(x) is the Cumulative Distribution Function for Gaussian Distribution. -The following equivalent combination is recognized and fused into single Gelu op: +*Gelu* operation is introduced in this [article](https://arxiv.org/abs/1606.08415). +It performs element-wise activation function on a given input tensor, based on the following mathematical formula: \f[ - Gelu(x) = 0.5*x*(1.0 + erf((x) / \sqrt{2}) + Gelu(x) = x\cdot\Phi(x) = x\cdot\frac{1}{2}\cdot\left[1 + erf\left(x/\sqrt{2}\right)\right] \f] -Similarly, the following Gelu approximation (typical for the TensorFlow*) is recognized and fused into single Gelu op +where Φ(x) is the Cumulative Distribution Function for Gaussian Distribution. + +Additionally, *Gelu* function may be approximated as follows: \f[ - Gelu(x) \approx 0.5x(1.0 + tanh(\sqrt{2.0/pi} * (x + 0.044715 * x ^ 3)) + Gelu(x) \approx 0.5\cdot x\cdot \left(1 + \tanh\left[\sqrt{2/\pi} \cdot (x + 0.044715 \cdot x^3)\right]\right) \f] +**Attributes**: *Gelu* operation has no attributes. + **Inputs**: -* **1**: Multidimensional input tensor. Required. +* **1**: A tensor of type `T` and arbitrary shape. **Required**. **Outputs**: -* **1**: Floating point tensor with shape and type matching the input tensor. +* **1**: The result of element-wise *Gelu* function applied to the input tensor. A tensor of type `T` and the same shape as input tensor. + +**Types** + +* *T*: arbitrary supported floating-point type. **Example** diff --git a/docs/ops/activation/Mish_4.md b/docs/ops/activation/Mish_4.md index 8eda674f5039f4..30eebc71c643b3 100644 --- a/docs/ops/activation/Mish_4.md +++ b/docs/ops/activation/Mish_4.md @@ -2,35 +2,35 @@ **Versioned name**: *Mish-4* -**Category**: *Activation* +**Category**: *Activation function* -**Short description**: Mish is a Self Regularized Non-Monotonic Neural Activation Function. +**Short description**: *Mish* is a Self Regularized Non-Monotonic Neural Activation Function. -**Detailed description**: Mish is a self regularized non-monotonic neural activation function proposed in the [article](https://arxiv.org/abs/1908.08681). +**Detailed description** -**Attributes**: operation has no attributes. +*Mish* is a self regularized non-monotonic neural activation function proposed in this [article](https://arxiv.org/abs/1908.08681v2). + +*Mish* performs element-wise activation function on a given input tensor, based on the following mathematical formula: + +\f[ +Mish(x) = x\cdot\tanh\big(SoftPlus(x)\big) = x\cdot\tanh\big(\ln(1+e^{x})\big) +\f] + +**Attributes**: *Mish* operation has no attributes. **Inputs**: -* **1**: Input tensor *x* of any floating point type T. Required. +* **1**: A tensor of type `T` and arbitrary shape. **Required**. 
**Outputs**: -* **1**: Floating point tensor with shape and type matching the input tensor. +* **1**: The result of element-wise *Mish* function applied to the input tensor. A tensor of type `T` and the same shape as input tensor. **Types** -* *T*: any floating point type. - -**Mathematical Formulation** - - For each element from the input tensor calculates corresponding - element in the output tensor with the following formula: -\f[ -Mish(x) = x*tanh(ln(1.0+e^{x})) -\f] +* *T*: arbitrary supported floating-point type. -**Examples** +**Example** ```xml diff --git a/docs/ops/activation/PReLU_1.md b/docs/ops/activation/PReLU_1.md index 74920e1306be47..ea0d4d05b2f603 100644 --- a/docs/ops/activation/PReLU_1.md +++ b/docs/ops/activation/PReLU_1.md @@ -2,32 +2,114 @@ **Versioned name**: *PReLU-1* -**Category**: Activation function +**Category**: *Activation function* -**Short description**: *PReLU* performs element-wise parametric ReLU operation with negative slope defined by the second input. +**Short description**: Parametric rectified linear unit element-wise activation function. -**Attributes**: operation has no attributes. +**Detailed description** -**Inputs** +*PReLU* operation is introduced in this [article](https://arxiv.org/abs/1502.01852v1). + +*PReLU* performs element-wise parametric *ReLU* operation on a given input tensor, based on the following mathematical formula: + +\f[ +PReLU(x) = \left\{\begin{array}{r} + x \quad \mbox{if } x \geq 0 \\ + \alpha x \quad \mbox{if } x < 0 +\end{array}\right. +\f] + +where α is a learnable parameter and corresponds to the negative slope, per channel, defined by the second input `slope`. + +Another mathematical representation that may be found in other references: + +\f[ +PReLU(x) = \max(0, x) + \alpha\cdot\min(0, x) +\f] -* **1**: `X` - Input tensor of any supported floating point type T1. Required. -* **2**: `slope` - Tensor with negative slope values of type T2. The shape of the tensor should be broadcastable to input 1. Required. +**Attributes**: *PReLU* operation has no attributes. + +**Inputs** + +* **1**: `data`. A tensor of type `T` and arbitrary shape. **Required**. +* **2**: `slope`. 1D tensor of type `T`. Tensor with negative slope values, one per channel dimension of `data` input tensor. **Required**. +* **Note**: Channels dimension corresponds to second dimension of `data` input tensor. If `data` rank is less than 2, the number of channels is 1. **Outputs** -* **1**: The result of element-wise PReLU operation applied for tensor from input 1 with slope values from input 2. A tensor of type T1 and shape matching shape of input *x* tensor. +* **1**: The result of element-wise *PReLU* operation applied to `data` input tensor with negative slope values from `slope` input tensor. A tensor of type `T` and the same shape as `data` input tensor. **Types** -* *T1*: arbitrary supported floating point type. +* *T*: arbitrary supported floating-point type. -* *T2*: arbitrary supported floating point type. +**Examples** -**Detailed description** -Before performing addition operation, input tensor 2 with slope values is broadcasted to input 1. -The broadcasting rules are aligned with ONNX Broadcasting. Description is available in ONNX docs. 
+*Example: 1D input tensor `data`* + +```xml + + + + 128 + + + 1 + + + + + 128 + + + +``` + +*Example: 2D input tensor `data`* + +```xml + + + + 20 + 128 + + + 128 + + + + + 20 + 128 + + + +``` -After broadcasting *PReLU* does the following for each input 1 element x: +*Example: 4D input tensor `data`* - f(x) = slope * x for x < 0; x for x >= 0 \ No newline at end of file +```xml + + + + 1 + 20 + 128 + 128 + + + 20 + + + + + 1 + 20 + 128 + 128 + + + +``` diff --git a/docs/ops/activation/Selu_1.md b/docs/ops/activation/Selu_1.md new file mode 100644 index 00000000000000..0af534b8f56292 --- /dev/null +++ b/docs/ops/activation/Selu_1.md @@ -0,0 +1,71 @@ +## Selu {#openvino_docs_ops_activation_Selu_1} + +**Versioned name**: *Selu-1* + +**Category**: *Activation function* + +**Short description**: *Selu* is a scaled exponential linear unit element-wise activation function. + +**Detailed Description** + +*Selu* operation is introduced in this [article](https://arxiv.org/abs/1706.02515), as activation function for self-normalizing neural networks (SNNs). + +*Selu* performs element-wise activation function on a given input tensor `data`, based on the following mathematical formula: + +\f[ +Selu(x) = \lambda \left\{\begin{array}{r} + x \quad \mbox{if } x > 0 \\ + \alpha(e^{x} - 1) \quad \mbox{if } x \le 0 +\end{array}\right. +\f] + +where α and λ correspond to inputs `alpha` and `lambda` respectively. + +Another mathematical representation that may be found in other references: + +\f[ +Selu(x) = \lambda\cdot\big(\max(0, x) + \min(0, \alpha(e^{x}-1))\big) +\f] + +**Attributes**: *Selu* operation has no attributes. + +**Inputs** + +* **1**: `data`. A tensor of type `T` and arbitrary shape. **Required.** + +* **2**: `alpha`. 1D tensor with one element of type `T`. **Required.** + +* **3**: `lambda`. 1D tensor with one element of type `T`. **Required.** + +**Outputs** + +* **1**: The result of element-wise *Selu* function applied to `data` input tensor. A tensor of type `T` and the same shape as `data` input tensor. + +**Types** + +* *T*: arbitrary supported floating-point type. + +**Example** + +```xml + + + + 256 + 56 + + + 1 + + + 1 + + + + + 256 + 56 + + + +``` diff --git a/docs/ops/activation/SoftPlus_4.md b/docs/ops/activation/SoftPlus_4.md index 135c4cb9dccae4..8afc94684acb7a 100644 --- a/docs/ops/activation/SoftPlus_4.md +++ b/docs/ops/activation/SoftPlus_4.md @@ -2,15 +2,18 @@ **Versioned name**: *SoftPlus-4* -**Category**: *Activation* +**Category**: *Activation function* -**Short description**: SoftPlus takes one input tensor and produces output tensor where the softplus function is applied to the tensor elementwise. +**Short description**: *SoftPlus* is a rectified-based element-wise activation function. -**Detailed description**: For each element from the input tensor calculates corresponding -element in the output tensor with the following formula: +**Detailed description** + +*SoftPlus* operation is introduced in this [article](https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.6419). + +*SoftPlus* performs element-wise activation function on a given input tensor, based on the following mathematical formula: \f[ -SoftPlus(x) = ln(e^{x} + 1.0) +SoftPlus(x) = \ln(1+e^{x}) \f] **Attributes**: *SoftPlus* operation has no attributes. @@ -18,16 +21,15 @@ SoftPlus(x) = ln(e^{x} + 1.0) **Inputs**: -* **1**: Multidimensional input tensor of type *T*. **Required**. +* **1**: A tensor of type `T` and arbitrary shape. **Required**. 
**Outputs**: -* **1**: The resulting tensor of the same shape and type as input tensor. +* **1**: The result of element-wise *SoftPlus* function applied to the input tensor. A tensor of type `T` and the same shape as input tensor. **Types** -* *T*: arbitrary supported floating point type. - +* *T*: arbitrary supported floating-point type. **Example** @@ -46,4 +48,4 @@ SoftPlus(x) = ln(e^{x} + 1.0) -``` \ No newline at end of file +``` diff --git a/docs/ops/activation/Swish_4.md b/docs/ops/activation/Swish_4.md index 78bcb3866e7b91..1a8b7d1b51a4f9 100644 --- a/docs/ops/activation/Swish_4.md +++ b/docs/ops/activation/Swish_4.md @@ -2,38 +2,40 @@ **Versioned name**: *Swish-4* -**Category**: *Activation* +**Category**: *Activation function* -**Short description**: Swish takes one input tensor and produces output tensor where the Swish function is applied to the tensor elementwise. +**Short description**: *Swish* performs element-wise activation function on a given input tensor. -**Detailed description**: For each element from the input tensor calculates corresponding -element in the output tensor with the following formula: +**Detailed description** + +*Swish* operation is introduced in this [article](https://arxiv.org/abs/1710.05941). +It performs element-wise activation function on a given input tensor, based on the following mathematical formula: \f[ -Swish(x) = x / (1.0 + e^{-(beta * x)}) +Swish(x) = x\cdot \sigma(\beta x) = x \left(1 + e^{-(\beta x)}\right)^{-1} \f] -The Swish operation is introduced in the [article](https://arxiv.org/pdf/1710.05941.pdf). +where β corresponds to `beta` scalar input. -**Attributes**: +**Attributes**: *Swish* operation has no attributes. **Inputs**: -* **1**: Multidimensional input tensor of type *T*. **Required**. +* **1**: `data`. A tensor of type `T` and arbitrary shape. **Required**. -* **2**: Scalar with non-negative value of type *T*. Multiplication parameter *beta* for the sigmoid. If the input is not connected then the default value 1.0 is used. **Optional** +* **2**: `beta`. A non-negative scalar value of type `T`. Multiplication parameter for the sigmoid. Default value 1.0 is used. **Optional**. **Outputs**: -* **1**: The resulting tensor of the same shape and type as input tensor. +* **1**: The result of element-wise *Swish* function applied to the input tensor `data`. A tensor of type `T` and the same shape as `data` input tensor. **Types** -* *T*: arbitrary supported floating point type. - +* *T*: arbitrary supported floating-point type. -**Example** +**Examples** +*Example: Second input `beta` provided* ```xml @@ -41,13 +43,30 @@ The Swish operation is introduced in the [article](https://arxiv.org/pdf/1710.05 256 56 - + + - + 256 56 -``` \ No newline at end of file +``` + +*Example: Second input `beta` not provided* +```xml + + + + 128 + + + + + 128 + + + +``` diff --git a/docs/ops/arithmetic/FloorMod_1.md b/docs/ops/arithmetic/FloorMod_1.md index 26986df0f3c3dc..042ffb7f428696 100644 --- a/docs/ops/arithmetic/FloorMod_1.md +++ b/docs/ops/arithmetic/FloorMod_1.md @@ -13,7 +13,7 @@ As a first step input tensors *a* and *b* are broadcasted if their shapes differ o_{i} = a_{i} % b_{i} \f] -*FloorMod* operation computes a reminder of a floored division. It is the same behaviour like in Python programming language: `floor(x / y) * y + floor_mod(x, y) = x`. The sign of the result is equal to a sign of a dividend. The result of division by zero is undefined. +*FloorMod* operation computes a reminder of a floored division. 
It is the same behaviour like in Python programming language: `floor(x / y) * y + floor_mod(x, y) = x`. The sign of the result is equal to a sign of a divisor. The result of division by zero is undefined. **Attributes**: diff --git a/docs/ops/arithmetic/Selu_1.md b/docs/ops/arithmetic/Selu_1.md deleted file mode 100644 index 8d69d13fbf2e37..00000000000000 --- a/docs/ops/arithmetic/Selu_1.md +++ /dev/null @@ -1,65 +0,0 @@ -## Selu {#openvino_docs_ops_arithmetic_Selu_1} - -**Versioned name**: *Selu-1* - -**Category**: Arithmetic unary operation - -**Short description**: *Selu* calculates the SELU activation function (https://arxiv.org/abs/1706.02515) element-wise with given tensor. - -**Detailed Description** - -For each element from the input tensor calculates corresponding -element in the output tensor with the following formula: -\f[ -selu(x) = \lambda \left\{\begin{array}{ll} - \alpha(e^{x} - 1) \quad \mbox{if } x \le 0 \\ - x \quad \mbox{if } x > 0 -\end{array}\right. -\f] - -**Attributes**: - - No attributes available. - -**Inputs** - -* **1**: An tensor of type T. **Required.** - -* **2**: `alpha` 1D tensor with one element of type T. **Required.** - -* **3**: `lambda` 1D tensor with one element of type T. **Required.** - -**Outputs** - -* **1**: The result of element-wise operation. A tensor of type T. - -**Types** - -* *T*: any supported floating point type. - -**Examples** - -*Example 1* - -```xml - - - - 256 - 56 - - - 1 - - - 1 - - - - - 256 - 56 - - - -``` \ No newline at end of file diff --git a/docs/ops/convolution/BinaryConvolution_1.md b/docs/ops/convolution/BinaryConvolution_1.md index 6ab2458035c1c8..314e9aad029258 100644 --- a/docs/ops/convolution/BinaryConvolution_1.md +++ b/docs/ops/convolution/BinaryConvolution_1.md @@ -82,17 +82,17 @@ Computation algorithm for mode *xnor-popcount*: **Inputs**: -* **1**: Input tensor of type *T1* and rank 4. Layout is NCYX (number of batches, number of channels, spatial axes Y, X). Required. -* **2**: Kernel tensor of type *T2* and rank 4. Layout is OIYX (number of output channels, number of input channels, spatial axes Y, X). Required. +* **1**: Input tensor of type *T1* and rank 4. Layout is `[N, C_IN, Y, X]` (number of batches, number of channels, spatial axes Y, X). Required. +* **2**: Kernel tensor of type *T2* and rank 4. Layout is `[C_OUT, C_IN, Y, X]` (number of output channels, number of input channels, spatial axes Y, X). Required. * **Note**: Interpretation of tensor values is defined by *mode* attribute. **Outputs**: -* **1**: Output tensor of type *T3* and rank 4. Layout is NOYX (number of batches, number of kernel output channels, spatial axes Y, X). +* **1**: Output tensor of type *T3* and rank 4. Layout is `[N, C_OUT, Y, X]` (number of batches, number of kernel output channels, spatial axes Y, X). **Types**: -* *T1*: floating point type with values `0` or `1`. +* *T1*: any numeric type with values `0` or `1`. * *T2*: `u1` type with binary values `0` or `1`. * *T3*: *T1* type with full range of values. diff --git a/docs/ops/convolution/Convolution_1.md b/docs/ops/convolution/Convolution_1.md index ffdbbc508618a4..8e50b3ffada509 100644 --- a/docs/ops/convolution/Convolution_1.md +++ b/docs/ops/convolution/Convolution_1.md @@ -37,7 +37,7 @@ The receptive field in each layer is calculated using the formulas: * **Description**: *strides* is a distance (in pixels) to slide the filter on the feature map over the `(z, y, x)` axes for 3D convolutions and `(y, x)` axes for 2D convolutions. 
For example, *strides* equal `4,2,1` means sliding the filter 4 pixel at a time over depth dimension, 2 over height dimension and 1 over width dimension. * **Range of values**: integer values starting from 0 - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* @@ -45,7 +45,7 @@ The receptive field in each layer is calculated using the formulas: * **Description**: *pads_begin* is a number of pixels to add to the beginning along each axis. For example, *pads_begin* equal `1,2` means adding 1 pixel to the top of the input and 2 to the left of the input. * **Range of values**: integer values starting from 0 - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* * **Note**: the attribute is ignored when *auto_pad* attribute is specified. @@ -54,7 +54,7 @@ The receptive field in each layer is calculated using the formulas: * **Description**: *pads_end* is a number of pixels to add to the ending along each axis. For example, *pads_end* equal `1,2` means adding 1 pixel to the bottom of the input and 2 to the right of the input. * **Range of values**: integer values starting from 0 - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* * **Note**: the attribute is ignored when *auto_pad* attribute is specified. @@ -63,7 +63,7 @@ The receptive field in each layer is calculated using the formulas: * **Description**: *dilations* denotes the distance in width and height between elements (weights) in the filter. For example, *dilation* equal `1,1` means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation* equal `2,2` means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1. * **Range of values**: integer value starting from 0 - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* @@ -74,15 +74,15 @@ The receptive field in each layer is calculated using the formulas: * *same_upper* - the input is padded to match the output size. In case of odd padding value an extra padding is added at the end. * *same_lower* - the input is padded to match the output size. In case of odd padding value an extra padding is added at the beginning. * *valid* - do not use padding. - * **Type**: string + * **Type**: `string` * **Default value**: explicit * **Required**: *no* * **Note**: *pads_begin* and *pads_end* attributes are ignored when *auto_pad* is specified. **Inputs**: -* **1**: Input tensor of type *T* and rank 3, 4 or 5. Layout is NCZYX (number of batches, number of channels, spatial axes Z, Y, X). Required. -* **2**: Kernel tensor of type *T* and rank 3, 4 or 5. Layout is OIZYX (number of output channels, number of input channels, spatial axes Z, Y, X). Required. +* **1**: Input tensor of type *T* and rank 3, 4 or 5. Layout is `[N, C_IN, Z, Y, X]` (number of batches, number of channels, spatial axes Z, Y, X). Required. +* **2**: Kernel tensor of type *T* and rank 3, 4 or 5. Layout is `[C_OUT, C_IN, Z, Y, X]` (number of output channels, number of input channels, spatial axes Z, Y, X). Required. 
* **Note**: Type of the convolution (1D, 2D or 3D) is derived from the rank of the input tensors and not specified by any attribute: * 1D convolution (input tensors rank 3) means that there is only one spatial axis X * 2D convolution (input tensors rank 4) means that there are two spatial axes Y, X @@ -90,11 +90,11 @@ The receptive field in each layer is calculated using the formulas: **Outputs**: -* **1**: Output tensor of type *T* and rank 3, 4 or 5. Layout is NOZYX (number of batches, number of kernel output channels, spatial axes Z, Y, X). +* **1**: Output tensor of type *T* and rank 3, 4 or 5. Layout is `[N, C_OUT, Z, Y, X]` (number of batches, number of kernel output channels, spatial axes Z, Y, X). **Types**: -* *T*: any floating point type. +* *T*: any numeric type. **Example**: diff --git a/docs/ops/convolution/DeformableConvolution_1.md b/docs/ops/convolution/DeformableConvolution_1.md index 446d6fd07bb8ac..2cba8d84039fe9 100644 --- a/docs/ops/convolution/DeformableConvolution_1.md +++ b/docs/ops/convolution/DeformableConvolution_1.md @@ -88,7 +88,7 @@ **Types**: -* *T*: Any floating point type. +* *T*: Any numeric type. **Example** diff --git a/docs/ops/convolution/GroupConvolutionBackpropData_1.md b/docs/ops/convolution/GroupConvolutionBackpropData_1.md index 9c041a891499b1..2d76aa905ea25d 100644 --- a/docs/ops/convolution/GroupConvolutionBackpropData_1.md +++ b/docs/ops/convolution/GroupConvolutionBackpropData_1.md @@ -66,9 +66,9 @@ **Inputs**: -* **1**: Input tensor of type `T1` and rank 3, 4 or 5. Layout is `NCZYX` (number of batches, number of channels, spatial axes Z, Y, X). Required. +* **1**: Input tensor of type `T1` and rank 3, 4 or 5. Layout is `[N, GROUPS * C_IN, Z, Y, X]` (number of batches, number of channels, spatial axes Z, Y, X). Required. -* **2**: Kernel tensor of type `T1` and rank 4, 5 or 6. Layout is `GOIZYX` (number of groups, number of output channels, number of input channels, spatial axes Z, Y, X). Required. +* **2**: Kernel tensor of type `T1` and rank 4, 5 or 6. Layout is `[GROUPS, C_IN, C_OUT, X, Y, Z]` (number of groups, number of input channels, number of output channels, spatial axes X, Y, Z). Required. * **3**: Output shape tensor of type `T2` and rank 1. It specifies spatial shape of the output. Optional. * **Note** Number of groups is derived from the shape of the kernel and not specified by any attribute. @@ -79,11 +79,11 @@ **Outputs**: -* **1**: Output tensor of type `T1` and rank 3, 4 or 5 (the same as input *1*). Layout is `NOZYX` (number of batches, number of kernel output channels, spatial axes Z, Y, X). +* **1**: Output tensor of type `T1` and rank 3, 4 or 5 (the same as input *1*). Layout is `[N, GROUPS * C_OUT, Z, Y, X]` (number of batches, number of kernel output channels, spatial axes Z, Y, X). **Types**: -* *T1*: any floating point type. +* *T1*: any numeric type. * *T2*: any integer type. **Example** diff --git a/docs/ops/convolution/GroupConvolution_1.md b/docs/ops/convolution/GroupConvolution_1.md index 33a34c6fa2ed4b..49d6d9c05a84fb 100644 --- a/docs/ops/convolution/GroupConvolution_1.md +++ b/docs/ops/convolution/GroupConvolution_1.md @@ -15,7 +15,7 @@ Neural Networks](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76 * **Description**: *strides* is a distance (in pixels) to slide the filter on the feature map over the `(z, y, x)` axes for 3D convolutions and `(y, x)` axes for 2D convolutions. 
For example, *strides* equal `4,2,1` means sliding the filter 4 pixel at a time over depth dimension, 2 over height dimension and 1 over width dimension. * **Range of values**: positive integer numbers - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* @@ -23,7 +23,7 @@ Neural Networks](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76 * **Description**: *pads_begin* is a number of pixels to add to the beginning along each axis. For example, *pads_begin* equal `1,2` means adding 1 pixel to the top of the input and 2 to the left of the input. * **Range of values**: positive integer numbers - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* * **Note**: the attribute is ignored when *auto_pad* attribute is specified. @@ -32,7 +32,7 @@ Neural Networks](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76 * **Description**: *pads_end* is a number of pixels to add to the ending along each axis. For example, *pads_end* equal `1,2` means adding 1 pixel to the bottom of the input and 2 to the right of the input. * **Range of values**: positive integer numbers - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* * **Note**: the attribute is ignored when *auto_pad* attribute is specified. @@ -41,7 +41,7 @@ Neural Networks](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76 * **Description**: *dilations* denotes the distance in width and height between elements (weights) in the filter. For example, *dilation* equal `1,1` means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation* equal `2,2` means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1. * **Range of values**: positive integer numbers - * **Type**: int[] + * **Type**: `int[]` * **Default value**: None * **Required**: *yes* @@ -52,15 +52,15 @@ Neural Networks](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76 * *same_upper* - the input is padded to match the output size. In case of odd padding value an extra padding is added at the end. * *same_lower* - the input is padded to match the output size. In case of odd padding value an extra padding is added at the beginning. * *valid* - do not use padding. - * **Type**: string + * **Type**: `string` * **Default value**: explicit * **Required**: *no* * **Note**: *pads_begin* and *pads_end* attributes are ignored when *auto_pad* is specified. **Inputs**: -* **1**: Input tensor of type *T* and rank 3, 4 or 5. Layout is NCZYX (number of batches, number of channels, spatial axes Z, Y, X). Required. -* **2**: Convolution kernel tensor of type *T* and rank 4, 5 or 6. Layout is GOIZYX (number of groups, number of output channels, number of input channels, spatial axes Z, Y, X), +* **1**: Input tensor of type *T* and rank 3, 4 or 5. Layout is `[N, GROUPS * C_IN, Z, Y, X]` (number of batches, number of channels, spatial axes Z, Y, X). Required. +* **2**: Convolution kernel tensor of type *T* and rank 4, 5 or 6. Layout is `[GROUPS, C_OUT, C_IN, Z, Y, X]` (number of groups, number of output channels, number of input channels, spatial axes Z, Y, X), * **Note** Number of groups is derived from the shape of the kernel and not specified by any attribute. 
* **Note**: Type of the convolution (1D, 2D or 3D) is derived from the rank of the input tensors and not specified by any attribute: * 1D convolution (input tensors rank 3) means that there is only one spatial axis X @@ -69,11 +69,11 @@ Neural Networks](https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76 **Outputs**: -* **1**: Output tensor of type *T* and rank 3, 4 or 5. Layout is NOZYX (number of batches, number of kernel output channels, spatial axes Z, Y, X). +* **1**: Output tensor of type *T* and rank 3, 4 or 5. Layout is `[N, GROUPS * C_OUT, Z, Y, X]` (number of batches, number of output channels, spatial axes Z, Y, X). **Types**: -* *T*: any floating point type. +* *T*: any numeric type. **Example**: 1D GroupConvolution diff --git a/docs/ops/detection/ExperimentalDetectronDetectionOutput_6.md b/docs/ops/detection/ExperimentalDetectronDetectionOutput_6.md index 4ef48bb0cce8a6..69411e3f31f63b 100644 --- a/docs/ops/detection/ExperimentalDetectronDetectionOutput_6.md +++ b/docs/ops/detection/ExperimentalDetectronDetectionOutput_6.md @@ -4,13 +4,13 @@ **Category**: Object detection -**Short description**: An operation *ExperimentalDetectronDetectionOutput* performs non-maximum suppression to generate +**Short description**: The *ExperimentalDetectronDetectionOutput* operation performs non-maximum suppression to generate the detection output using information on location and score predictions. -**Detailed description**: Operation doing next steps: +**Detailed description**: The operation performs the following steps: 1. Applies deltas to boxes sizes [x1, y1, x2, y2] and takes coordinates of -refined boxes according to formulas: +refined boxes according to the formulas: `x1_new = ctr_x + (dx - 0.5 * exp(min(d_log_w, max_delta_log_wh))) * box_w` @@ -20,20 +20,20 @@ refined boxes according to formulas: `y1_new = ctr_y + (dy + 0.5 * exp(min(d_log_h, max_delta_log_wh))) * box_h - 1.0` -* `box_w` and `box_h` are width and height of box: +* `box_w` and `box_h` are width and height of box, respectively: `box_w = x1 - x0 + 1.0` `box_h = y1 - y0 + 1.0` -* `ctr_x` and `ctr_y` are center location of box: +* `ctr_x` and `ctr_y` are center location of a box: `ctr_x = x0 + 0.5f * box_w` `ctr_y = y0 + 0.5f * box_h` -* `dx`, `dy`, `d_log_w` and `d_log_h` are deltas calculated according to next formulas and `deltas_tensor` is second -input: +* `dx`, `dy`, `d_log_w` and `d_log_h` are deltas calculated according to the formulas below, and `deltas_tensor` is a +second input: `dx = deltas_tensor[roi_idx, 4 * class_idx + 0] / deltas_weights[0]` @@ -43,21 +43,21 @@ input: `d_log_h = deltas_tensor[roi_idx, 4 * class_idx + 3] / deltas_weights[3]` -2. If *class_agnostic_box_regression* is `true` then operation removes predictions for background classes; -3. Clips boxes to image; -4. Applies *score_threshold* on detection scores; +2. If *class_agnostic_box_regression* is `true` removes predictions for background classes. +3. Clips boxes to the image. +4. Applies *score_threshold* on detection scores. 5. Applies non-maximum suppression class-wise with *nms_threshold* and returns *post_nms_count* or less detections per -class; -6. Operation returns *max_detections_per_image* detections if total number of detections is more than it, otherwise -returns total number of detections and the output tensor is filled with undefined values for rest output tensor -elements. +class. +6. 
Returns *max_detections_per_image* detections if total number of detections is more than *max_detections_per_image*; +otherwise, returns total number of detections and the output tensor is filled with undefined values for rest output +tensor elements. **Attributes**: * *score_threshold* - * **Description**: *score_threshold* attribute specifies threshold to consider only detections whose score are - larger than a threshold. + * **Description**: The *score_threshold* attribute specifies a threshold to consider only detections whose score are + larger than the threshold. * **Range of values**: non-negative floating point number * **Type**: float * **Default value**: None @@ -65,7 +65,7 @@ elements. * *nms_threshold* - * **Description**: *nms_threshold* attribute specifies threshold to be used in the NMS stage. + * **Description**: The *nms_threshold* attribute specifies a threshold to be used in the NMS stage. * **Range of values**: non-negative floating point number * **Type**: float * **Default value**: None @@ -73,7 +73,7 @@ elements. * *num_classes* - * **Description**: *num_classes* attribute specifies number of detected classes. + * **Description**: The *num_classes* attribute specifies the number of detected classes. * **Range of values**: non-negative integer number * **Type**: int * **Default value**: None @@ -81,7 +81,7 @@ elements. * *post_nms_count* - * **Description**: *post_nms_count* attribute specifies the maximal number of detections per class. + * **Description**: The *post_nms_count* attribute specifies the maximal number of detections per class. * **Range of values**: non-negative integer number * **Type**: int * **Default value**: None @@ -89,7 +89,7 @@ elements. * *max_detections_per_image* - * **Description**: *max_detections_per_image* attribute specifies maximal number of detections per image. + * **Description**: The *max_detections_per_image* attribute specifies maximal number of detections per image. * **Range of values**: non-negative integer number * **Type**: int * **Default value**: None @@ -101,14 +101,14 @@ elements. classes or not. * **Range of values**: * `true` means background classes should be deleted - * `false` means background classes shouldn't be deleted + * `false` means background classes should not be deleted * **Type**: boolean * **Default value**: false * **Required**: *no* * *max_delta_log_wh* - * **Description**: *max_delta_log_wh* attribute specifies maximal delta of logarithms for width and height. + * **Description**: The *max_delta_log_wh* attribute specifies maximal delta of logarithms for width and height. * **Range of values**: floating point number * **Type**: float * **Default value**: None @@ -116,7 +116,7 @@ elements. * *deltas_weights* - * **Description**: *deltas_weights* attribute specifies weights for bounding boxes sizes deltas. + * **Description**: The *deltas_weights* attribute specifies weights for bounding boxes sizes deltas. * **Range of values**: a list of non-negative floating point numbers * **Type**: float[] * **Default value**: None @@ -124,27 +124,25 @@ elements. **Inputs** -* **1**: A 2D tensor of type *T* with input ROIs, with shape `[number_of_ROIs, 4]` describing the ROIs as 4-tuples: -[x1, y1, x2, y2]. The batch dimension of first, second and third inputs +* **1**: A 2D tensor of type *T* with input ROIs, with shape `[number_of_ROIs, 4]` providing the ROIs as 4-tuples: +[x1, y1, x2, y2]. The batch dimension of first, second, and third inputs should be the same. 
**Required.** -* **2**: A 2D tensor of type *T* with shape `[number_of_ROIs, num_classes * 4]` describing deltas for input boxes. +* **2**: A 2D tensor of type *T* with shape `[number_of_ROIs, num_classes * 4]` providing deltas for input boxes. **Required.** -* **3**: A 2D tensor of type *T* with shape `[number_of_ROIs, num_classes]` describing detections scores. **Required.** +* **3**: A 2D tensor of type *T* with shape `[number_of_ROIs, num_classes]` providing detections scores. **Required.** -* **4**: A 2D tensor of type *T* with shape `[1, 3]` contains 3 elements - `[image_height, image_width, scale_height_and_width]` describing input image size info. **Required.** +* **4**: A 2D tensor of type *T* with shape `[1, 3]` contains three elements + `[image_height, image_width, scale_height_and_width]` providing input image size info. **Required.** **Outputs** -* **1**: A 2D tensor of type *T* with shape `[max_detections_per_image, 4]` describing boxes indices. +* **1**: A 2D tensor of type *T* with shape `[max_detections_per_image, 4]` providing boxes indices. -* **2**: A 1D tensor of type *T_IND* with shape `[max_detections_per_image]` describing classes indices. +* **2**: A 1D tensor of type *T_IND* with shape `[max_detections_per_image]` providing classes indices. -* **3**: A 1D tensor of type *T* with shape `[max_detections_per_image]` describing scores indices. - -* **4**: A 1D tensor of type *T_IND* with shape `[max_detections_per_image]` describing batches indices. +* **3**: A 1D tensor of type *T* with shape `[max_detections_per_image]` providing scores indices. **Types** diff --git a/docs/ops/detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md b/docs/ops/detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md index 7f8726f20d3ff8..ce1513ed2bb8d5 100644 --- a/docs/ops/detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md +++ b/docs/ops/detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md @@ -4,26 +4,25 @@ **Category**: Object detection -**Short description**: An operation *ExperimentalDetectronGenerateProposalsSingleImage* computes ROIs and their scores +**Short description**: The *ExperimentalDetectronGenerateProposalsSingleImage* operation computes ROIs and their scores based on input data. -**Detailed description**: Operation doing next steps: +**Detailed description**: The operation performs the following steps: -1. Transposes and reshape predicted bounding boxes deltas and scores to get them into the same order as the anchors; -2. Transforms anchors into proposals using deltas and clips proposals to image; -3. Removes predicted boxes with either height or width < *min_size*; -4. Sorts all `(proposal, score)` pairs by score from highest to lowest, order of pairs with equal scores is undefined; -5. Takes top *pre_nms_count* proposals, if total number of proposals is less than *pre_nms_count* then operation takes -all proposals; -6. Applies non-maximum suppression with *nms_threshold*; -7. Takes top *post_nms_count* proposals and return these top proposals and their scores. If total number of proposals -is less than *post_nms_count* then operation returns output tensors filled by zeroes. +1. Transposes and reshapes predicted bounding boxes deltas and scores to get them into the same order as the anchors. +2. Transforms anchors into proposals using deltas and clips proposals to an image. +3. Removes predicted boxes with either height or width < *min_size*. +4. 
Sorts all `(proposal, score)` pairs by score from highest to lowest; order of pairs with equal scores is undefined. +5. Takes top *pre_nms_count* proposals, if total number of proposals is less than *pre_nms_count* takes all proposals. +6. Applies non-maximum suppression with *nms_threshold*. +7. Takes top *post_nms_count* proposals and returns these top proposals and their scores. If total number of proposals +is less than *post_nms_count* returns output tensors filled with zeroes. **Attributes**: * *min_size* - * **Description**: *min_size* attribute specifies minimum box width and height. + * **Description**: The *min_size* attribute specifies minimum box width and height. * **Range of values**: non-negative floating point number * **Type**: float * **Default value**: None @@ -31,7 +30,7 @@ is less than *post_nms_count* then operation returns output tensors filled by ze * *nms_threshold* - * **Description**: *nms_threshold* attribute specifies threshold to be used in the NMS stage. + * **Description**: The *nms_threshold* attribute specifies threshold to be used in the NMS stage. * **Range of values**: non-negative floating point number * **Type**: float * **Default value**: None @@ -39,7 +38,7 @@ is less than *post_nms_count* then operation returns output tensors filled by ze * *pre_nms_count* - * **Description**: *pre_nms_count* attribute specifies number of top-n proposals before NMS. + * **Description**: The *pre_nms_count* attribute specifies number of top-n proposals before NMS. * **Range of values**: non-negative integer number * **Type**: int * **Default value**: None @@ -47,7 +46,7 @@ is less than *post_nms_count* then operation returns output tensors filled by ze * *post_nms_count* - * **Description**: *post_nms_count* attribute specifies number of top-n proposals after NMS. + * **Description**: The *post_nms_count* attribute specifies number of top-n proposals after NMS. * **Range of values**: non-negative integer number * **Type**: int * **Default value**: None @@ -55,22 +54,22 @@ is less than *post_nms_count* then operation returns output tensors filled by ze **Inputs** -* **1**: A 1D tensor of type *T* with 3 elements `[image_height, image_width, scale_height_and_width]` describing input +* **1**: A 1D tensor of type *T* with 3 elements `[image_height, image_width, scale_height_and_width]` providing input image size info. **Required.** -* **2**: A 2D tensor of type *T* with shape `[height * width * number_of_channels, 4]` describing anchors. **Required.** +* **2**: A 2D tensor of type *T* with shape `[height * width * number_of_channels, 4]` providing anchors. **Required.** -* **3**: A 3D tensor of type *T* with shape `[number_of_channels * 4, height, width]` describing deltas for anchors. +* **3**: A 3D tensor of type *T* with shape `[number_of_channels * 4, height, width]` providing deltas for anchors. Height and width for third and fourth inputs should be equal. **Required.** -* **4**: A 3D tensor of type *T* with shape `[number_of_channels, height, width]` describing proposals scores. +* **4**: A 3D tensor of type *T* with shape `[number_of_channels, height, width]` providing proposals scores. **Required.** **Outputs** -* **1**: A 2D tensor of type *T* with shape `[post_nms_count, 4]` describing ROIs. +* **1**: A 2D tensor of type *T* with shape `[post_nms_count, 4]` providing ROIs. -* **2**: A 1D tensor of type *T* with shape `[post_nms_count]` describing ROIs scores. +* **2**: A 1D tensor of type *T* with shape `[post_nms_count]` providing ROIs scores. 
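The ordering of steps 4-7 above is easy to lose track of, so here is a minimal NumPy sketch of that post-processing stage. It assumes boxes are already in `[x1, y1, x2, y2]` form after steps 1-3 and uses a plain greedy NMS; it is only an illustration of the described order of operations, not the plugin implementation.

```python
import numpy as np

def iou(box, boxes):
    # Intersection over union of one [x1, y1, x2, y2] box against a set of boxes.
    x1 = np.maximum(box[0], boxes[:, 0])
    y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2])
    y2 = np.minimum(box[3], boxes[:, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)

    def area(b):
        return (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])

    return inter / (area(box) + area(boxes) - inter + 1e-9)

def top_proposals(proposals, scores, pre_nms_count, post_nms_count, nms_threshold):
    # Step 4: sort all (proposal, score) pairs by score, highest first.
    order = np.argsort(-scores)
    # Step 5: keep at most pre_nms_count proposals.
    order = order[:pre_nms_count]
    proposals, scores = proposals[order], scores[order]
    # Step 6: greedy non-maximum suppression with nms_threshold.
    keep = []
    for i in range(len(proposals)):
        if np.all(iou(proposals[i], proposals[keep]) <= nms_threshold):
            keep.append(i)
    proposals, scores = proposals[keep], scores[keep]
    # Step 7: top post_nms_count proposals, zero-filled if there are fewer.
    out_rois = np.zeros((post_nms_count, 4), dtype=np.float32)
    out_scores = np.zeros(post_nms_count, dtype=np.float32)
    n = min(post_nms_count, len(proposals))
    out_rois[:n], out_scores[:n] = proposals[:n], scores[:n]
    return out_rois, out_scores
```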
**Types** diff --git a/docs/ops/detection/ExperimentalDetectronPriorGridGenerator_6.md b/docs/ops/detection/ExperimentalDetectronPriorGridGenerator_6.md index 5a474c7efc782e..bcf09c46c890ea 100644 --- a/docs/ops/detection/ExperimentalDetectronPriorGridGenerator_6.md +++ b/docs/ops/detection/ExperimentalDetectronPriorGridGenerator_6.md @@ -4,34 +4,33 @@ **Category**: Object detection -**Short description**: An operation *ExperimentalDetectronPriorGridGenerator* generates prior grids of -specified sizes. +**Short description**: The *ExperimentalDetectronPriorGridGenerator* operation generates prior grids of specified sizes. -**Detailed description**: Operation takes coordinates of centres of boxes and add strides with offset `0.5` to them to +**Detailed description**: The operation takes coordinates of centres of boxes and adds strides with offset `0.5` to them to calculate coordinates of prior grids. -Numbers of generated cells is `featmap_height` and `featmap_width` if *h* and *w* are zeroes, otherwise *h* and *w* +Numbers of generated cells is `featmap_height` and `featmap_width` if *h* and *w* are zeroes; otherwise, *h* and *w*, respectively. Steps of generated grid are `image_height` / `layer_height` and `image_width` / `layer_width` if -*stride_h* and *stride_w* are zeroes, otherwise *stride_h* and *stride_w* respectively. +*stride_h* and *stride_w* are zeroes; otherwise, *stride_h* and *stride_w*, respectively. `featmap_height`, `featmap_width`, `image_height` and `image_width` are spatial dimensions values from second and third -inputs respectively. +inputs, respectively. **Attributes**: * *flatten* - * **Description**: *flatten* attribute specifies whether the output tensor should be 2D or 4D. + * **Description**: The *flatten* attribute specifies whether the output tensor should be 2D or 4D. * **Range of values**: - * `true` - the output tensor should be 2D tensor - * `false` - the output tensor should be 4D tensor + * `true` - the output tensor should be a 2D tensor + * `false` - the output tensor should be a 4D tensor * **Type**: boolean * **Default value**: true * **Required**: *no* * *h* - * **Description**: *h* attribute specifies number of cells of the generated grid with respect to height. + * **Description**: The *h* attribute specifies number of cells of the generated grid with respect to height. * **Range of values**: non-negative integer number less or equal than `featmap_height` * **Type**: int * **Default value**: 0 @@ -39,7 +38,7 @@ inputs respectively. * *w* - * **Description**: *w* attribute specifies number of cells of the generated grid with respect to width. + * **Description**: The *w* attribute specifies number of cells of the generated grid with respect to width. * **Range of values**: non-negative integer number less or equal than `featmap_width` * **Type**: int * **Default value**: 0 @@ -47,7 +46,7 @@ inputs respectively. * *stride_x* - * **Description**: *stride_x* attribute specifies the step of generated grid with respect to x coordinate. + * **Description**: The *stride_x* attribute specifies the step of generated grid with respect to x coordinate. * **Range of values**: non-negative float number * **Type**: float * **Default value**: 0.0 @@ -55,7 +54,7 @@ inputs respectively. * *stride_y* - * **Description**: *stride_y* attribute specifies the step of generated grid with respect to y coordinate. + * **Description**: The *stride_y* attribute specifies the step of generated grid with respect to y coordinate. 
* **Range of values**: non-negative float number * **Type**: float * **Default value**: 0.0 @@ -75,8 +74,8 @@ not its data. **Required.** **Outputs** * **1**: A tensor of type *T* with priors grid with shape `[featmap_height * featmap_width * number_of_priors, 4]` -if flatten is `true` or `[featmap_height, featmap_width, number_of_priors, 4]` otherwise. -In case then 0 < *h* < `featmap_height` and/or 0 < *w* < `featmap_width` the output data size is less than +if flatten is `true` or `[featmap_height, featmap_width, number_of_priors, 4]`, otherwise. +If 0 < *h* < `featmap_height` and/or 0 < *w* < `featmap_width` the output data size is less than `featmap_height` * `featmap_width` * `number_of_priors` * 4 and the output tensor is filled with undefined values for rest output tensor elements. diff --git a/docs/ops/detection/ExperimentalDetectronROIFeatureExtractor_6.md b/docs/ops/detection/ExperimentalDetectronROIFeatureExtractor_6.md index d4e93a188fc204..407c4301dc4b7c 100644 --- a/docs/ops/detection/ExperimentalDetectronROIFeatureExtractor_6.md +++ b/docs/ops/detection/ExperimentalDetectronROIFeatureExtractor_6.md @@ -30,7 +30,7 @@ For more details please see the following source: * *output_size* - * **Description**: *output_size* attribute specifies the width and height of the output tensor. + * **Description**: The *output_size* attribute specifies the width and height of the output tensor. * **Range of values**: a positive integer number * **Type**: int * **Default value**: None @@ -38,7 +38,7 @@ For more details please see the following source: * *sampling_ratio* - * **Description**: *sampling_ratio* attribute specifies the number of sampling points per the output value. If 0, + * **Description**: The *sampling_ratio* attribute specifies the number of sampling points per the output value. If 0, then use adaptive number computed as `ceil(roi_width / output_width)`, and likewise for height. * **Range of values**: a non-negative integer number * **Type**: int @@ -47,7 +47,7 @@ For more details please see the following source: * *pyramid_scales* - * **Description**: *pyramid_scales* enlists `image_size / layer_size[l]` ratios for pyramid layers `l=1,...,L`, + * **Description**: The *pyramid_scales* enlists `image_size / layer_size[l]` ratios for pyramid layers `l=1,...,L`, where `L` is the number of pyramid layers, and `image_size` refers to network's input image. Note that pyramid's largest layer may have smaller size than input image, e.g. `image_size` is `800 x 1344` in the XML example below. * **Range of values**: a list of positive integer numbers @@ -57,7 +57,7 @@ For more details please see the following source: * *aligned* - * **Description**: *aligned* attribute specifies add offset (`-0.5`) to ROIs sizes or not. + * **Description**: The *aligned* attribute specifies add offset (`-0.5`) to ROIs sizes or not. * **Range of values**: * `true` - add offset to ROIs sizes * `false` - do not add offset to ROIs sizes @@ -67,7 +67,7 @@ For more details please see the following source: **Inputs**: -* **1**: 2D input tensor of type *T* with shape `[number_of_ROIs, 4]` describing the ROIs as 4-tuples: +* **1**: 2D input tensor of type *T* with shape `[number_of_ROIs, 4]` providing the ROIs as 4-tuples: [x1, y1, x2, y2]. Coordinates *x* and *y* are refer to the network's input *image_size*. **Required**. 
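Tying the *ExperimentalDetectronPriorGridGenerator* description above together, the sketch below shows one way the grid could be produced with NumPy. It assumes that the priors input holds `[x1, y1, x2, y2]` boxes centred around the origin and that `layer_height`/`layer_width` refer to the feature-map dimensions; it is a reading aid for the attribute semantics, not the reference implementation.

```python
import numpy as np

def prior_grid(priors, featmap_h, featmap_w, image_h, image_w,
               h=0, w=0, stride_x=0.0, stride_y=0.0, flatten=True):
    # Number of generated cells: feature-map size unless h / w override it.
    grid_h = h if h else featmap_h
    grid_w = w if w else featmap_w
    # Grid step: image size divided by layer size unless stride_x / stride_y override it.
    step_x = stride_x if stride_x else image_w / featmap_w
    step_y = stride_y if stride_y else image_h / featmap_h
    out = np.empty((grid_h, grid_w, len(priors), 4), dtype=np.float32)
    for y in range(grid_h):
        for x in range(grid_w):
            # Each prior is shifted so that it is centred on the cell centre,
            # i.e. strides are added with an offset of 0.5.
            shift = np.array([(x + 0.5) * step_x, (y + 0.5) * step_y,
                              (x + 0.5) * step_x, (y + 0.5) * step_y], dtype=np.float32)
            out[y, x] = priors + shift
    # [grid_h * grid_w * number_of_priors, 4] when flatten is true,
    # [grid_h, grid_w, number_of_priors, 4] otherwise.
    return out.reshape(-1, 4) if flatten else out
```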
diff --git a/docs/ops/movement/Pad_1.md b/docs/ops/movement/Pad_1.md index 79684706675d9a..853d94eac522d1 100644 --- a/docs/ops/movement/Pad_1.md +++ b/docs/ops/movement/Pad_1.md @@ -153,8 +153,7 @@ OUTPUT = 8 37 48 - -``` \ No newline at end of file +``` diff --git a/docs/ops/movement/ScatterNDUpdate_3.md b/docs/ops/movement/ScatterNDUpdate_3.md index 93398fa3f98dbc..5dd1ed9a462957 100644 --- a/docs/ops/movement/ScatterNDUpdate_3.md +++ b/docs/ops/movement/ScatterNDUpdate_3.md @@ -48,7 +48,7 @@ output = [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], * **2**: `indices` tensor with indices of arbitrary rank `q` >= 1 and of type *T_IND*. All index values `i_j` in index entry `(i_0, i_1, ...,i_k)` (where `k = indices.shape[-1]`) must be within bounds `[0, s_j - 1]` where `s_j = data.shape[j]`. `k` must be at most `r`. Required. -* **3**: `updates` tensor of rank `r - indices.shape[-1] + q - 1` of type *T*. Required. +* **3**: `updates` tensor of rank `r - indices.shape[-1] + q - 1` of type *T*. If expected `updates` rank is 0D it can be a tensor with single element. Required. **Outputs**: diff --git a/docs/ops/opset1.md b/docs/ops/opset1.md index 73da245d2dc541..eec109ad9c3877 100644 --- a/docs/ops/opset1.md +++ b/docs/ops/opset1.md @@ -93,7 +93,7 @@ declared in `namespace opset1`. * [Result](infrastructure/Result_1.md) * [ReverseSequence](movement/ReverseSequence_1.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_1.md) * [Sigmoid](activation/Sigmoid_1.md) * [Sign](arithmetic/Sign_1.md) diff --git a/docs/ops/opset2.md b/docs/ops/opset2.md index bfee6cee9c45a8..67c51385a78a80 100644 --- a/docs/ops/opset2.md +++ b/docs/ops/opset2.md @@ -98,7 +98,7 @@ declared in `namespace opset2`. * [ReverseSequence](movement/ReverseSequence_1.md) * [ROIPooling](detection/ROIPooling_1.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_1.md) * [Sigmoid](activation/Sigmoid_1.md) * [Sign](arithmetic/Sign_1.md) diff --git a/docs/ops/opset3.md b/docs/ops/opset3.md index e36d4be27c5227..52af67efc0f0de 100644 --- a/docs/ops/opset3.md +++ b/docs/ops/opset3.md @@ -113,7 +113,7 @@ declared in `namespace opset3`. * [ScatterElementsUpdate](movement/ScatterElementsUpdate_3.md) * [ScatterUpdate](movement/ScatterUpdate_3.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_3.md) * [ShuffleChannels](movement/ShuffleChannels_1.md) * [Sigmoid](activation/Sigmoid_1.md) diff --git a/docs/ops/opset4.md b/docs/ops/opset4.md index 709319f0640d16..71607453ee983c 100644 --- a/docs/ops/opset4.md +++ b/docs/ops/opset4.md @@ -121,7 +121,7 @@ declared in `namespace opset4`. * [ScatterNDUpdate](movement/ScatterNDUpdate_3.md) * [ScatterUpdate](movement/ScatterUpdate_3.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_3.md) * [ShuffleChannels](movement/ShuffleChannels_1.md) * [Sigmoid](activation/Sigmoid_1.md) diff --git a/docs/ops/opset5.md b/docs/ops/opset5.md index 7db25f894d5d32..6c79caca19b0a4 100644 --- a/docs/ops/opset5.md +++ b/docs/ops/opset5.md @@ -129,7 +129,7 @@ declared in `namespace opset5`. 
* [ScatterNDUpdate](movement/ScatterNDUpdate_3.md) * [ScatterUpdate](movement/ScatterUpdate_3.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_3.md) * [ShuffleChannels](movement/ShuffleChannels_1.md) * [Sigmoid](activation/Sigmoid_1.md) diff --git a/docs/ops/opset6.md b/docs/ops/opset6.md index dbe17d468611d2..92deb4a6fbea3d 100644 --- a/docs/ops/opset6.md +++ b/docs/ops/opset6.md @@ -135,7 +135,7 @@ declared in `namespace opset6`. * [ScatterNDUpdate](movement/ScatterNDUpdate_3.md) * [ScatterUpdate](movement/ScatterUpdate_3.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_3.md) * [ShuffleChannels](movement/ShuffleChannels_1.md) * [Sigmoid](activation/Sigmoid_1.md) diff --git a/docs/ops/opset7.md b/docs/ops/opset7.md index 242b1e029cf887..c04b90e81a0391 100644 --- a/docs/ops/opset7.md +++ b/docs/ops/opset7.md @@ -138,7 +138,7 @@ declared in `namespace opset7`. * [ScatterNDUpdate](movement/ScatterNDUpdate_3.md) * [ScatterUpdate](movement/ScatterUpdate_3.md) * [Select](condition/Select_1.md) -* [Selu](arithmetic/Selu_1.md) +* [Selu](activation/Selu_1.md) * [ShapeOf](shape/ShapeOf_3.md) * [ShuffleChannels](movement/ShuffleChannels_1.md) * [Sigmoid](activation/Sigmoid_1.md) diff --git a/docs/ops/shape/Squeeze_1.md b/docs/ops/shape/Squeeze_1.md index 9dff893cd8a419..4510748ca17551 100644 --- a/docs/ops/shape/Squeeze_1.md +++ b/docs/ops/shape/Squeeze_1.md @@ -4,15 +4,19 @@ **Category**: Shape manipulation -**Short description**: *Squeeze* removes specified dimensions (second input) equal to 1 of the first input tensor. If the second input is omitted then all dimensions equal to 1 are removed. If the specified dimension is not equal to one then error is raised. +**Short description**: *Squeeze* removes dimensions equal to 1 from the first input tensor. + +**Detailed description**: *Squeeze* can be used with or without the second input tensor. +* If only the first input is provided, every dimension that is equal to 1 will be removed from it. +* With the second input provided, each value is an index of a dimension from the first tensor that is to be removed. Specified dimension has to be equal to 1, otherwise an error will be raised. Dimension indices can be specified directly, or by negative indices (counting dimensions from the end). **Attributes**: *Squeeze* operation doesn't have attributes. **Inputs**: -* **1**: Multidimensional input tensor of type *T*. *Required*. +* **1**: Multidimensional input tensor of type *T*. **Required**. -* **2**: 0D or 1D tensor of type *T_SHAPE* with dimensions indices to squeeze. Values could be negative. *Optional*. +* **2**: Scalar or 1D tensor of type *T_INT* with indices of dimensions to squeeze. Values could be negative (have to be from range `[-R, R-1]`, where `R` is the rank of the first input). **Optional**. **Outputs**: @@ -20,13 +24,13 @@ **Types** -* *T*: supported type. +* *T*: any numeric type. -* *T_SHAPE*: supported integer type. +* *T_INT*: any supported integer type. 
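The behaviour described above matches NumPy's `squeeze`, which can serve as a quick mental model (an analogy only, not the OpenVINO implementation):

```python
import numpy as np

data = np.zeros((1, 3, 1, 2))

# Without the second input: every dimension equal to 1 is removed.
print(np.squeeze(data).shape)                 # (3, 2)

# With the second input: only the listed dimensions are removed;
# negative indices count from the end, so -2 refers to dimension 2 here.
print(np.squeeze(data, axis=(0, -2)).shape)   # (3, 2)

# Requesting a dimension that is not equal to 1 is an error,
# as in the specification above.
try:
    np.squeeze(data, axis=1)
except ValueError as err:
    print(err)
```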
**Example** -*Example 1:* +*Example 1: squeeze 4D tensor to a 2D tensor* ```xml diff --git a/docs/ops/sort/ExperimentalDetectronTopKROIs_6.md b/docs/ops/sort/ExperimentalDetectronTopKROIs_6.md index a378d3b378d315..6b496fbc93a0b4 100644 --- a/docs/ops/sort/ExperimentalDetectronTopKROIs_6.md +++ b/docs/ops/sort/ExperimentalDetectronTopKROIs_6.md @@ -4,19 +4,19 @@ **Category**: Sort -**Short description**: An operation *ExperimentalDetectronTopKROIs* is TopK operation applied to probabilities of input +**Short description**: The *ExperimentalDetectronTopKROIs* operation is TopK operation applied to probabilities of input ROIs. -**Detailed description**: Operation performs probabilities descending sorting for input ROIs and returns *max_rois* -number of ROIs. Order of sorted ROIs with equal probabilities is undefined. If number of ROIs is less than *max_rois* -then operation returns all ROIs descended sorted and the output tensor is filled with undefined values for rest output -tensor elements. +**Detailed description**: The operation performs probabilities descending sorting for input ROIs and returns *max_rois* +number of ROIs. Order of sorted ROIs with equal probabilities is undefined. If the number of ROIs is less than *max_rois* +then operation returns all ROIs descended sorted and the output tensor is filled with undefined values for the rest of +output tensor elements. **Attributes**: * *max_rois* - * **Description**: *max_rois* attribute specifies maximal numbers of output ROIs. + * **Description**: The *max_rois* attribute specifies maximal numbers of output ROIs. * **Range of values**: non-negative integer number * **Type**: int * **Default value**: 0 diff --git a/docs/template_plugin/src/template_plugin.cpp b/docs/template_plugin/src/template_plugin.cpp index 50e1d828f73f20..ca3dbbdacfed52 100644 --- a/docs/template_plugin/src/template_plugin.cpp +++ b/docs/template_plugin/src/template_plugin.cpp @@ -118,14 +118,13 @@ InferenceEngine::ExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(const // ! [plugin:load_exe_network_impl] // ! [plugin:import_network_impl] -InferenceEngine::ExecutableNetwork Plugin::ImportNetworkImpl(std::istream& model, const std::map& config) { +InferenceEngine::ExecutableNetworkInternal::Ptr +Plugin::ImportNetworkImpl(std::istream& model, const std::map& config) { OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetworkImpl"); Configuration cfg(config); - auto exec_network_impl = std::make_shared(model, cfg, + return std::make_shared(model, cfg, std::static_pointer_cast(shared_from_this())); - - return make_executable_network(exec_network_impl); } // ! 
[plugin:import_network_impl] diff --git a/docs/template_plugin/src/template_plugin.hpp b/docs/template_plugin/src/template_plugin.hpp index fd520767444b0c..10b68d7af42f10 100644 --- a/docs/template_plugin/src/template_plugin.hpp +++ b/docs/template_plugin/src/template_plugin.hpp @@ -30,7 +30,7 @@ class Plugin : public InferenceEngine::InferencePluginInternal { void AddExtension(InferenceEngine::IExtensionPtr extension) override; InferenceEngine::Parameter GetConfig(const std::string& name, const std::map & options) const override; InferenceEngine::Parameter GetMetric(const std::string& name, const std::map & options) const override; - InferenceEngine::ExecutableNetwork ImportNetworkImpl(std::istream& model, const std::map& config) override; + InferenceEngine::ExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& model, const std::map& config) override; private: friend class ExecutableNetwork; diff --git a/inference-engine/cmake/dependencies.cmake b/inference-engine/cmake/dependencies.cmake index 7c6428b669eae9..8dd7d3e164d1ad 100644 --- a/inference-engine/cmake/dependencies.cmake +++ b/inference-engine/cmake/dependencies.cmake @@ -186,9 +186,9 @@ endif () if (ENABLE_OPENCV) reset_deps_cache(OpenCV_DIR) - set(OPENCV_VERSION "4.5.1") - set(OPENCV_BUILD "044") - set(OPENCV_BUILD_YOCTO "337") + set(OPENCV_VERSION "4.5.2") + set(OPENCV_BUILD "076") + set(OPENCV_BUILD_YOCTO "708") if (AARCH64) if(DEFINED ENV{THIRDPARTY_SERVER_PATH}) @@ -208,7 +208,7 @@ if (ENABLE_OPENCV) TARGET_PATH "${TEMP}/opencv_${OPENCV_VERSION}_${OPENCV_SUFFIX}/opencv" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*" - SHA256 "b5239e0e50b9009f95a29cb11f0840ec085fa07f6c4d3349adf090f1e51b0787") + SHA256 "ee3e5255f381b8de5e6fffe4e43dae8c99035377d0380f9183bd7341f1d0f204") unset(IE_PATH_TO_DEPS) endif() @@ -219,37 +219,37 @@ if (ENABLE_OPENCV) TARGET_PATH "${TEMP}/opencv_${OPENCV_VERSION}/opencv" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*" - SHA256 "5250bfe5860c15eb1b31963c78804ee9b301a19d8d6e920c06ef41de681cb99e") + SHA256 "a14f872e6b63b6ac12c7ff47fa49e578d14c14433b57f5d85ab5dd48a079938c") elseif(APPLE AND X86_64) RESOLVE_DEPENDENCY(OPENCV ARCHIVE_MAC "opencv/opencv_${OPENCV_VERSION}-${OPENCV_BUILD}_osx.txz" TARGET_PATH "${TEMP}/opencv_${OPENCV_VERSION}_osx/opencv" ENVIRONMENT "OpenCV_DIR" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*" - SHA256 "f3ebc5cc72c86106c30cc711ac689e02281556bb43c09a89cd45cb99b6bef9a8") + SHA256 "3e162f96e86cba8836618134831d9cf76df0438778b3e27e261dedad9254c514") elseif(LINUX) if (AARCH64) set(OPENCV_SUFFIX "yocto_kmb") set(OPENCV_BUILD "${OPENCV_BUILD_YOCTO}") elseif (ARM) set(OPENCV_SUFFIX "debian9arm") - set(OPENCV_HASH "0e787d6738092993bc92bb55975f52caabae45dc73473b5196d15e65e87d6b9d") + set(OPENCV_HASH "4274f8c40b17215f4049096b524e4a330519f3e76813c5a3639b69c48633d34e") elseif ((LINUX_OS_NAME STREQUAL "CentOS 7" OR CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9") AND X86_64) set(OPENCV_SUFFIX "centos7") - set(OPENCV_HASH "9b813af064d463b31fa1603b11b6559532a031d59bb0782d234380955fd397e0") + set(OPENCV_HASH "5fa76985c84fe7c64531682ef0b272510c51ac0d0565622514edf1c88b33404a") elseif (LINUX_OS_NAME MATCHES "CentOS 8" AND X86_64) set(OPENCV_SUFFIX "centos8") - set(OPENCV_HASH "8ec3e3552500dee334162386b98cc54a5608de1f1a18f283523fc0cc13ee2f83") + set(OPENCV_HASH "db087dfd412eedb8161636ec083ada85ff278109948d1d62a06b0f52e1f04202") elseif (LINUX_OS_NAME STREQUAL "Ubuntu 16.04" AND X86_64) set(OPENCV_SUFFIX "ubuntu16") set(OPENCV_HASH 
"cd46831b4d8d1c0891d8d22ff5b2670d0a465a8a8285243059659a50ceeae2c3") elseif (LINUX_OS_NAME STREQUAL "Ubuntu 18.04" AND X86_64) set(OPENCV_SUFFIX "ubuntu18") - set(OPENCV_HASH "8ec3e3552500dee334162386b98cc54a5608de1f1a18f283523fc0cc13ee2f83") + set(OPENCV_HASH "db087dfd412eedb8161636ec083ada85ff278109948d1d62a06b0f52e1f04202") elseif ((LINUX_OS_NAME STREQUAL "Ubuntu 20.04" OR LINUX_OS_NAME STREQUAL "LinuxMint 20.1") AND X86_64) set(OPENCV_SUFFIX "ubuntu20") - set(OPENCV_HASH "2b7808d002864acdc5fc0b19cd30dadc31a37cc267931cad605f23f2383bfc21") + set(OPENCV_HASH "2fe7bbc40e1186eb8d099822038cae2821abf617ac7a16fadf98f377c723e268") elseif(NOT DEFINED OpenCV_DIR AND NOT DEFINED ENV{OpenCV_DIR}) message(FATAL_ERROR "OpenCV is not available on current platform (${LINUX_OS_NAME})") endif() diff --git a/inference-engine/cmake/vpu_dependencies.cmake b/inference-engine/cmake/vpu_dependencies.cmake index 86add651af1970..f01d9b4e23fde3 100644 --- a/inference-engine/cmake/vpu_dependencies.cmake +++ b/inference-engine/cmake/vpu_dependencies.cmake @@ -6,14 +6,14 @@ include_guard(GLOBAL) set(VPU_SUPPORTED_FIRMWARES usb-ma2x8x pcie-ma2x8x) set(VPU_SUPPORTED_FIRMWARES_HASH - "d0f6aaaf71a595963e6013ef59045e20b07324f1a47deaa3f906419d39b2bd5a" - "18d3cd10cf6cc36ff58001812d3d215c0bbb2de09a8832128592401c8f959358") + "11a6db07d3a17c9c0fc4247fce47c942e0dcd59f8d70665a96bae0d7b7121fe9" + "43f3dc0f0a8114ca34226167970aafdc869600929d6e3761c1eaa6eec71f2237") # # Default packages # -set(FIRMWARE_PACKAGE_VERSION 1642) +set(FIRMWARE_PACKAGE_VERSION 1658) set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.09.2") # diff --git a/inference-engine/ie_bridges/c/samples/hello_classification/README.md b/inference-engine/ie_bridges/c/samples/hello_classification/README.md index 6bf0ddf0b6369b..b090b648f2716d 100644 --- a/inference-engine/ie_bridges/c/samples/hello_classification/README.md +++ b/inference-engine/ie_bridges/c/samples/hello_classification/README.md @@ -1,31 +1,104 @@ # Hello Classification C Sample {#openvino_inference_engine_ie_bridges_c_samples_hello_classification_README} -This topic describes how to run the Hello Classification C sample application. +Inference of image classification networks like AlexNet and GoogLeNet using Synchronous Inference Request API and input auto-resize feature. -It demonstrates how to use the following Inference Engine C API in applications: -* Synchronous Infer Request API -* Input auto-resize API. It allows to set image of the original size as input for a network with other input size. - Resize will be performed automatically by the corresponding plugin just before inference. +Hello Classification C sample application demonstrates how to use the following Inference Engine C API in applications: -There is also an API introduced to crop a ROI object and set it as input without additional memory re-allocation. -To properly demonstrate this API, it is required to run several networks in pipeline which is out of scope of this sample. 
+| Feature | API | Description | +|:--- |:--- |:--- +| Basic Infer Flow | [ie_core_create], [ie_core_read_network], [ie_core_load_network], [ie_exec_network_create_infer_request], [ie_infer_request_set_blob], [ie_infer_request_get_blob] | Common API to do inference: configure input and output blobs, loading model, create infer request +| Synchronous Infer | [ie_infer_request_infer] | Do synchronous inference +| Network Operations | [ie_network_get_input_name], [ie_network_get_inputs_number], [ie_network_get_outputs_number], [ie_network_set_input_precision], [ie_network_get_output_name], [ie_network_get_output_precision] | Managing of network +| Blob Operations| [ie_blob_make_memory_from_preallocated], [ie_blob_get_dims], [ie_blob_get_cbuffer] | Work with memory container for storing inputs, outputs of the network, weights and biases of the layers +| Input auto-resize | [ie_network_set_input_resize_algorithm], [ie_network_set_input_layout] | Set image of the original size as input for a network with other input size. Resize and layout conversions will be performed automatically by the corresponding plugin just before inference -> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](../../../../../docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). +| Options | Values | +|:--- |:--- +| Validated Models | AlexNet and GoogLeNet (image classification networks) +| Model Format | Inference Engine Intermediate Representation (.xml + .bin), ONNX (.onnx) +| Validated images | The sample uses OpenCV\* to [read input image](https://docs.opencv.org/master/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56) (\*.bmp, \*.png) +| Supported devices | [All](../../../../../docs/IE_DG/supported_plugins/Supported_Devices.md) | +| Other language realization | [C++](../../../../samples/hello_classification/README.md), [Python](../../../python/sample/hello_classification/README.md) | + +## How It Works + +Upon the start-up, the sample application reads command line parameters, loads specified network and an image to the Inference Engine plugin. +Then, the sample creates an synchronous inference request object. When inference is done, the application outputs data to the standard output stream. + +You can see the explicit description of +each sample step at [Integration Steps](../../../../../docs/IE_DG/Integrate_with_customer_application_new_API.md) section of "Integrate the Inference Engine with Your Application" guide. + +## Building + +To build the sample, please use instructions available at [Build the Sample Applications](../../../../../docs/IE_DG/Samples_Overview.md) section in Inference Engine Samples guide. ## Running -To run the sample, you can use [public](@ref omz_models_public_index) or [Intel's](@ref omz_models_intel_index) pre-trained models from the Open Model Zoo. The models can be downloaded using the [Model Downloader](@ref omz_tools_downloader_README). 
+To run the sample, you need specify a model and image: +- you can use [public](@ref omz_models_public_index) or [Intel's](@ref omz_models_intel_index) pre-trained models from the Open Model Zoo. The models can be downloaded using the [Model Downloader](@ref omz_tools_downloader_README). +- you can use images from the media files collection available at https://storage.openvinotoolkit.org/data/test_data. -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +> **NOTES**: +> +> - By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](../../../../../docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). > -> The sample accepts models in ONNX format (.onnx) that do not require preprocessing. +> - Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +> +> - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. You can do inference of an image using a trained AlexNet network on a GPU using the following command: ```sh -./hello_classification_c /alexnet_fp32.xml /cat.bmp GPU +./hello_classification_c /alexnet_fp32.xml /cat.png GPU ``` ## Sample Output The application outputs top-10 inference results. 
+ +```sh +Top 10 results: + +Image /opt/intel/openvino/deployment_tools/demo/car.png + +classid probability +------- ----------- +479 0.7562205 +511 0.0760381 +436 0.0724111 +817 0.0462140 +656 0.0301231 +661 0.0056171 +581 0.0031622 +468 0.0029917 +717 0.0023081 +627 0.0016193 + +This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool +``` + +## See Also + +- [Integrate the Inference Engine with Your Application](../../../../../docs/IE_DG/Integrate_with_customer_application_new_API.md) +- [Using Inference Engine Samples](../../../../../docs/IE_DG/Samples_Overview.md) +- [Model Downloader](@ref omz_tools_downloader_README) +- [Model Optimizer](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) + +[ie_core_create]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Core.html#gaab73c7ee3704c742eaac457636259541 +[ie_core_read_network]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Core.html#gaa40803295255b3926a3d1b8924f26c29 +[ie_network_get_input_name]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga36b0c28dfab6db2bfcc2941fd57fbf6d +[ie_network_set_input_precision]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#gadd99b7cc98b3c33daa2095b8a29f66d7 +[ie_network_get_output_name]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga1feabc49576db24d9821a150b2b50a6c +[ie_network_get_output_precision]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#gaeaa7f1fb8f56956fc492cd9207235984 +[ie_core_load_network]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Core.html#ga318d4b0214b8a3fd33f9e44170befcc5 +[ie_exec_network_create_infer_request]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__ExecutableNetwork.html#gae72247391c1429a18c367594a4b7db9f +[ie_blob_make_memory_from_preallocated]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Blob.html#ga7a874d46375e10fa1a7e8e3d7e1c9c9c +[ie_infer_request_set_blob]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__InferRequest.html#ga891c2d475501bba761148a0c3faca196 +[ie_infer_request_infer]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__InferRequest.html#gac6c6fcb67ccb4d0ec9ad1c63a5bee7b6 +[ie_infer_request_get_blob]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__InferRequest.html#ga6cd04044ea95987260037bfe17ce1a2d +[ie_blob_get_dims]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Blob.html#ga25d93efd7ec1052a8896ac61cc14c30a +[ie_blob_get_cbuffer]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Blob.html#gaf6b4a110b4c5723dcbde135328b3620a +[ie_network_set_input_resize_algorithm]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga46ab3b3a06359f2b77f58bdd6e8a5492 +[ie_network_set_input_layout]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga27ea9f92290e0b2cdedbe8a85feb4c01 +[ie_network_get_inputs_number]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga6a3349bca66c4ba8b41a434061fccf52 +[ie_network_get_outputs_number]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga869b8c309797f1e09f73ddffd1b57509 diff --git a/inference-engine/ie_bridges/c/samples/hello_classification/main.c b/inference-engine/ie_bridges/c/samples/hello_classification/main.c index e17107f8213e92..86d8125a1b0cad 100644 --- a/inference-engine/ie_bridges/c/samples/hello_classification/main.c +++ b/inference-engine/ie_bridges/c/samples/hello_classification/main.c @@ -2,17 +2,28 @@ // 
SPDX-License-Identifier: Apache-2.0 // -#include +#include #include +#include #include -#include + #include +#include +/** +* @brief Struct to store classification results +*/ struct classify_res { size_t class_id; float probability; }; +/** +* @brief Sort result of image classification by probability +* @param struct with classification results to sort +* @param size of the struct +* @return none +*/ void classify_res_sort(struct classify_res *res, size_t n) { size_t i, j; for (i = 0; i < n; ++i) { @@ -30,6 +41,12 @@ void classify_res_sort(struct classify_res *res, size_t n) { } } +/** +* @brief Convert output blob to classify struct for processing results +* @param blob of output data +* @param size of the blob +* @return struct classify_res +*/ struct classify_res *output_blob_to_classify_res(ie_blob_t *blob, size_t *n) { dimensions_t output_dim; IEStatusCode status = ie_blob_get_dims(blob, &output_dim); @@ -60,6 +77,13 @@ struct classify_res *output_blob_to_classify_res(ie_blob_t *blob, size_t *n) { return cls; } +/** +* @brief Print results of classification +* @param struct of the classification results +* @param size of the struct of classification results +* @param string image path +* @return none +*/ void print_classify_res(struct classify_res *cls, size_t n, const char *img_path) { printf("\nImage %s\n", img_path); printf("\nclassid probability\n"); @@ -68,6 +92,7 @@ void print_classify_res(struct classify_res *cls, size_t n, const char *img_path for (i = 0; i < n; ++i) { printf("%zu %f\n", cls[i].class_id, cls[i].probability); } + printf("\nThis sample is an API example, for any performance measurements please use the dedicated benchmark_app tool\n"); } int main(int argc, char **argv) { @@ -86,22 +111,36 @@ int main(int argc, char **argv) { ie_infer_request_t *infer_request = NULL; char *input_name = NULL, *output_name = NULL; ie_blob_t *imgBlob = NULL, *output_blob = NULL; + size_t network_input_size; + size_t network_output_size; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 1. Load inference engine instance ------------------------------------- + // --------------------------- Step 1. Initialize inference engine core ------------------------------------- IEStatusCode status = ie_core_create("", &core); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format + // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format status = ie_core_read_network(core, input_model, NULL, &network); if (status != OK) goto err; + // check the network topology + status = ie_network_get_inputs_number(network, &network_input_size); + if (status != OK || network_input_size != 1) { + printf("Sample supports topologies with 1 input only\n"); + goto err; + } + + status = ie_network_get_outputs_number(network, &network_output_size); + if (status != OK || network_output_size != 1) { + printf("Sample supports topologies with 1 output only\n"); + goto err; + } // ----------------------------------------------------------------------------------------------------- - // --------------------------- 3. Configure input & output --------------------------------------------- + // --------------------------- Step 3. 
Configure input & output --------------------------------------------- // --------------------------- Prepare input blobs ----------------------------------------------------- status = ie_network_get_input_name(network, 0, &input_name); @@ -124,20 +163,20 @@ int main(int argc, char **argv) { // ----------------------------------------------------------------------------------------------------- - // --------------------------- 4. Loading model to the device ------------------------------------------ + // --------------------------- Step 4. Loading model to the device ------------------------------------------ ie_config_t config = {NULL, NULL, NULL}; status = ie_core_load_network(core, network, device_name, &config, &exe_network); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 5. Create infer request ------------------------------------------------- + // --------------------------- Step 5. Create infer request ------------------------------------------------- status = ie_exec_network_create_infer_request(exe_network, &infer_request); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 6. Prepare input -------------------------------------------------------- + // --------------------------- Step 6. Prepare input -------------------------------------------------------- /* Read input image to a blob and set it to an infer request without resize and layout conversions. */ c_mat_t img; image_read(input_image_path, &img); @@ -158,14 +197,14 @@ int main(int argc, char **argv) { goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 7. Do inference -------------------------------------------------------- + // --------------------------- Step 7. Do inference -------------------------------------------------------- /* Running the request synchronously */ status = ie_infer_request_infer(infer_request); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 8. Process output ------------------------------------------------------ + // --------------------------- Step 8. 
Process output ------------------------------------------------------ status = ie_infer_request_get_blob(infer_request, output_name, &output_blob); if (status != OK) { image_free(&img); diff --git a/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/CMakeLists.txt b/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/CMakeLists.txt index d0452b283558d9..ddffe4686a7cc3 100644 --- a/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/CMakeLists.txt +++ b/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/CMakeLists.txt @@ -3,5 +3,4 @@ # ie_add_sample(NAME hello_nv12_input_classification_c - SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/main.c" - DEPENDENCIES opencv_c_wraper) + SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/main.c") diff --git a/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/README.md b/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/README.md index a9e1e20056b049..ba7b58ad473ba7 100644 --- a/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/README.md +++ b/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/README.md @@ -1,51 +1,104 @@ # Hello NV12 Input Classification C Sample {#openvino_inference_engine_ie_bridges_c_samples_hello_nv12_input_classification_README} -This topic describes how to run the Hello NV12 Input Classification sample application. -The sample demonstrates how to use the new NV12 automatic input pre-processing API of the Inference Engine in your applications. -Refer to [Integrate the Inference Engine New Request API with Your Application](../../../../../docs/IE_DG/Integrate_with_customer_application_new_API.md) for details. +Inference of image classification networks like AlexNet with images in NV12 color format using Synchronous Inference Request API. + +Hello NV12 Input Classification C Sample demonstrates how to use the NV12 automatic input pre-processing API of the Inference Engine in your applications: + +| Feature | API | Description | +|:--- |:--- |:--- +| Blob Operations| [ie_blob_make_memory_nv12] | Create a NV12 blob +| Input in N12 color format |[ie_network_set_color_format]| Change the color format of the input data +Basic Inference Engine API is covered by [Hello Classification C sample](../hello_classification/README.md). + +| Options | Values | +|:--- |:--- +| Validated Models | AlexNet (image classification network) +| Model Format | Inference Engine Intermediate Representation (\*.xml + \*.bin), ONNX (\*.onnx) +| Validated images | An uncompressed image in the NV12 color format - \*.yuv +| Supported devices | [All](../../../../../docs/IE_DG/supported_plugins/Supported_Devices.md) | +| Other language realization | [C++](../../../../samples/hello_nv12_input_classification/README.md) | ## How It Works -Upon the start-up, the sample application reads command-line parameters, loads a network and sets an -image in the NV12 color format to an Inference Engine plugin. When inference is done, the +Upon the start-up, the sample application reads command-line parameters, loads specified network and an +image in the NV12 color format to an Inference Engine plugin. Then, the sample creates an synchronous inference request object. When inference is done, the application outputs data to the standard output stream. 
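The key NV12-specific call from the feature table above is `ie_blob_make_memory_nv12`, which wraps separately created Y and UV plane blobs into one compound input blob. The fragment below is a hedged sketch of that step only; the variable names, surrounding setup, and error handling are illustrative assumptions rather than the sample's exact code:

```c
/* Sketch: y_blob and uv_blob are assumed to have been created beforehand
 * from the raw .yuv buffer (for example with
 * ie_blob_make_memory_from_preallocated), which is how the sample prepares them. */
ie_blob_t *nv12_blob = NULL;
IEStatusCode status = ie_blob_make_memory_nv12(y_blob, uv_blob, &nv12_blob);
if (status != OK)
    goto err;

/* The compound NV12 blob is then set as the request's input; the conversion
 * to the network's expected color format happens automatically because the
 * input color format was set to NV12 with ie_network_set_color_format. */
status = ie_infer_request_set_blob(infer_request, input_name, nv12_blob);
if (status != OK)
    goto err;
```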
+You can see the explicit description of +each sample step at [Integration Steps](https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_Integrate_with_customer_application_new_API.html) section of "Integrate the Inference Engine with Your Application" guide. + +## Building + +To build the sample, please use instructions available at [Build the Sample Applications](../../../../../docs/IE_DG/Samples_Overview.md) section in Inference Engine Samples guide. + +## Running + +To run the sample, you need specify a model and image: + +- you can use [public](@ref omz_models_public_index) or [Intel's](@ref omz_models_intel_index) pre-trained models from the Open Model Zoo. The models can be downloaded using the [Model Downloader](@ref omz_tools_downloader_README). +- you can use images from the media files collection available at https://storage.openvinotoolkit.org/data/test_data. + The sample accepts an uncompressed image in the NV12 color format. To run the sample, you need to convert your BGR/RGB image to NV12. To do this, you can use one of the widely available tools such as FFmpeg\* or GStreamer\*. The following command shows how to convert an ordinary image into an uncompressed NV12 image using FFmpeg: + ```sh ffmpeg -i cat.jpg -pix_fmt nv12 cat.yuv ``` -> **NOTE**: +> **NOTES**: > -> * Because the sample reads raw image files, you should provide a correct image size along with the +> - Because the sample reads raw image files, you should provide a correct image size along with the > image path. The sample expects the logical size of the image, not the buffer size. For example, > for 640x480 BGR/RGB image the corresponding NV12 logical image size is also 640x480, whereas the > buffer size is 640x720. -> * The sample uses input autoresize API of the Inference Engine to simplify user-side -> pre-processing. -> * By default, this sample expects that network input has BGR channels order. If you trained your +> - By default, this sample expects that network input has BGR channels order. If you trained your > model to work with RGB order, you need to reconvert your model using the Model Optimizer tool > with `--reverse_input_channels` argument specified. For more information about the argument, > refer to **When to Reverse Input Channels** section of > [Converting a Model Using General Conversion Parameters](../../../../../docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). - -## Running - -To run the sample, you can use [public](@ref omz_models_public_index) or [Intel's](@ref omz_models_intel_index) pre-trained models from the Open Model Zoo. The models can be downloaded using the [Model Downloader](@ref omz_tools_downloader_README). - -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the -> Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +> - Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). > -> The sample accepts models in ONNX format (.onnx) that do not require preprocessing. +> - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. 
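The relationship between the logical size and the raw buffer size noted above is simple arithmetic: NV12 stores a full-resolution Y plane plus a half-resolution interleaved UV plane, i.e. `width * height * 3 / 2` bytes in total. A small self-contained check (for illustration only, mirroring the `img_size` computation in the sample's `main.c`):

```c
#include <stdio.h>

/* NV12 layout: width*height bytes of Y followed by width*height/2 bytes
 * of interleaved UV, so the raw buffer is width * height * 3 / 2 bytes. */
static size_t nv12_buffer_size(size_t width, size_t height) {
    return width * (height * 3 / 2);
}

int main(void) {
    /* A 640x480 logical image needs a 640x720 buffer: 460800 bytes. */
    printf("640x480 NV12 buffer: %zu bytes\n", nv12_buffer_size(640, 480));
    return 0;
}
```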
+ +You can perform inference on an NV12 image using a trained AlexNet network on a CPU with the following command: -You can perform inference on an NV12 image using a trained AlexNet network on CPU with the following command: ```sh -./hello_nv12_input_classification_c /alexnet_fp32.xml /cat.yuv 640x480 CPU +./hello_nv12_input_classification_c /alexnet_fp32.xml /cat.yuv 300x300 CPU ``` ## Sample Output The application outputs top-10 inference results. + +```sh +Top 10 results: + +Image ./cat.yuv + +classid probability +------- ----------- +435 0.091733 +876 0.081725 +999 0.069305 +587 0.043726 +666 0.038957 +419 0.032892 +285 0.030309 +700 0.029941 +696 0.021628 +855 0.020339 + +This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool +``` + +## See Also + +- [Integrate the Inference Engine with Your Application](../../../../../docs/IE_DG/Integrate_with_customer_application_new_API.md) +- [Using Inference Engine Samples](../../../../../docs/IE_DG/Samples_Overview.md) +- [Model Downloader](@ref omz_tools_downloader_README) +- [Model Optimizer](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) + +[ie_network_set_color_format]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga85f3251f1f7b08507c297e73baa58969 +[ie_blob_make_memory_nv12]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Blob.html#ga0a2d97b0d40a53c01ead771f82ae7f4a diff --git a/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/main.c b/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/main.c index 55343c5b7ff07c..d5384a79bf6c71 100644 --- a/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/main.c +++ b/inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/main.c @@ -2,16 +2,27 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include #include +#include #include + #include +/** +* @brief Struct to store classification results +*/ struct classify_res { size_t class_id; float probability; }; +/** +* @brief Sort result of image classification by probability +* @param struct with classification results to sort +* @param size of the struct +* @return none +*/ void classify_res_sort(struct classify_res *res, size_t n) { size_t i, j; for (i = 0; i < n; ++i) { @@ -29,6 +40,12 @@ void classify_res_sort(struct classify_res *res, size_t n) { } } +/** +* @brief Convert output blob to classify struct for processing results +* @param blob of output data +* @param size of the blob +* @return struct classify_res +*/ struct classify_res *output_blob_to_classify_res(ie_blob_t *blob, size_t *n) { dimensions_t output_dim; IEStatusCode status = ie_blob_get_dims(blob, &output_dim); @@ -59,6 +76,13 @@ struct classify_res *output_blob_to_classify_res(ie_blob_t *blob, size_t *n) { return cls; } +/** +* @brief Print results of classification +* @param struct of the classification results +* @param size of the struct of classification results +* @param string image path +* @return none +*/ void print_classify_res(struct classify_res *cls, size_t n, const char *img_path) { printf("\nImage %s\n", img_path); printf("\nclassid probability\n"); @@ -67,8 +91,16 @@ void print_classify_res(struct classify_res *cls, size_t n, const char *img_path for (i = 0; i < n; ++i) { printf("%zu %f\n", cls[i].class_id, cls[i].probability); } + printf("\nThis sample is an API example, for any performance measurements please use the dedicated benchmark_app tool\n"); } +/** +* @brief Read image data +* 
@param string image path +* @param pointer to store image data +* @param size bytes of image +* @return total number of elements successfully read, in case of error it doesn't equal to size param +*/ size_t read_image_from_file(const char *img_path, unsigned char *img_data, size_t size) { FILE *fp = fopen(img_path, "rb+"); size_t read_size = 0; @@ -84,7 +116,14 @@ size_t read_image_from_file(const char *img_path, unsigned char *img_data, size_ return read_size; } -size_t parse_image_size(const char *size_str, size_t *width, size_t *height) { +/** +* @brief Check image has supported width and height +* @param string image size in WIDTHxHEIGHT format +* @param pointer to image width +* @param pointer to image height +* @return bool status True(success) or False(fail) +*/ +bool is_supported_image_size(const char *size_str, size_t *width, size_t *height) { const char *_size = size_str; size_t _width = 0, _height = 0; while (_size && *_size != 'x' && *_size != '\0') { @@ -112,10 +151,10 @@ size_t parse_image_size(const char *size_str, size_t *width, size_t *height) { if (_width % 2 == 0 && _height % 2 == 0) { *width = _width; *height = _height; - return 0; + return true; } else { printf("Unsupported image size, width and height must be even numbers \n"); - return -1; + return false; } } else { goto err; @@ -123,7 +162,7 @@ size_t parse_image_size(const char *size_str, size_t *width, size_t *height) { err: printf("Incorrect format of image size parameter, expected WIDTHxHEIGHT, " "actual: %s\n", size_str); - return -1; + return false; } int main(int argc, char **argv) { @@ -134,7 +173,7 @@ int main(int argc, char **argv) { } size_t input_width = 0, input_height = 0, img_size = 0; - if (parse_image_size(argv[3], &input_width, &input_height) == -1) + if (!is_supported_image_size(argv[3], &input_width, &input_height)) return EXIT_FAILURE; const char *input_model = argv[1]; @@ -149,28 +188,30 @@ int main(int argc, char **argv) { ie_blob_t *y_blob = NULL, *uv_blob = NULL, *nv12_blob = NULL, *output_blob = NULL; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 1. Load inference engine instance ------------------------------------- + // --------------------------- Step 1. Initialize inference engine core ------------------------------------- IEStatusCode status = ie_core_create("", &core); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format + // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format status = ie_core_read_network(core, input_model, NULL, &network); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 3. Configure input & output --------------------------------------------- + // --------------------------- Step 3. Configure input & output --------------------------------------------- // --------------------------- Prepare input blobs ----------------------------------------------------- status = ie_network_get_input_name(network, 0, &input_name); if (status != OK) goto err; + /* Mark input as resizable by setting of a resize algorithm. + * In this case we will be able to set an input blob of any shape to an infer request. 
+ * Resize and layout conversions are executed automatically during inference */ + status |= ie_network_set_input_resize_algorithm(network, input_name, RESIZE_BILINEAR); status |= ie_network_set_input_layout(network, input_name, NCHW); status |= ie_network_set_input_precision(network, input_name, U8); - // set input resize algorithm to enable input autoresize - status |= ie_network_set_input_resize_algorithm(network, input_name, RESIZE_BILINEAR); // set input color format to NV12 to enable automatic input color format pre-processing status |= ie_network_set_color_format(network, input_name, NV12); @@ -185,20 +226,20 @@ int main(int argc, char **argv) { // ----------------------------------------------------------------------------------------------------- - // --------------------------- 4. Loading model to the device ------------------------------------------ + // --------------------------- Step 4. Loading model to the device ------------------------------------------ ie_config_t config = {NULL, NULL, NULL}; status = ie_core_load_network(core, network, device_name, &config, &exe_network); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 5. Create infer request ------------------------------------------------- + // --------------------------- Step 5. Create infer request ------------------------------------------------- status = ie_exec_network_create_infer_request(exe_network, &infer_request); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 6. Prepare input -------------------------------------------------------- + // --------------------------- Step 6. Prepare input -------------------------------------------------------- // read image with size converted to NV12 data size: height(NV12) = 3 / 2 * logical height img_size = input_width * (input_height * 3 / 2); img_data = (unsigned char *)calloc(img_size, sizeof(unsigned char)); @@ -230,14 +271,14 @@ int main(int argc, char **argv) { goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 7. Do inference -------------------------------------------------------- + // --------------------------- Step 7. Do inference -------------------------------------------------------- /* Running the request synchronously */ status = ie_infer_request_infer(infer_request); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 8. Process output ------------------------------------------------------ + // --------------------------- Step 8. 
Process output ------------------------------------------------------ status = ie_infer_request_get_blob(infer_request, output_name, &output_blob); if (status != OK) goto err; diff --git a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/README.md b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/README.md index 55916a129f9473..e9736f3385dfb8 100644 --- a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/README.md +++ b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/README.md @@ -1,21 +1,50 @@ # Object Detection C Sample SSD {#openvino_inference_engine_ie_bridges_c_samples_object_detection_sample_ssd_README} -This topic demonstrates how to run the Object Detection C sample application, which does inference using object detection -networks like SSD-VGG on Intel® Processors and Intel® HD Graphics. +Inference of object detection networks like SSD-VGG using Asynchronous Inference Request API and [input reshape feature](../../../../../docs/IE_DG/ShapeInference.md). -> **NOTE:** This topic describes usage of C implementation of the Object Detection Sample SSD. For the C++* implementation, refer to [Object Detection C++* Sample SSD](../../../../samples/object_detection_sample_ssd/README.md) and for the Python* implementation, refer to [Object Detection Python* Sample SSD](../../../python/sample/object_detection_sample_ssd/README.md). +Object Detection C sample SSD application demonstrates how to use the following Inference Engine C API in applications: + +| Feature | API | Description | +|:--- |:--- |:--- +|Asynchronous Infer |[ie_infer_request_infer_async][ie_infer_request_wait]| Do Asynchronous inference +|Inference Engine Version| [ie_c_api_version] | Get Inference Engine API version +|Available Devices| [ie_core_get_versions] | Get version information of the devices for inference +|Custom Extension Kernels|[ie_core_add_extension] [ie_core_set_config]| Load extension library and config to the device +|Network Operations|[ie_network_get_inputs_number] [ie_network_get_input_dims] [ie_network_get_input_shapes] [ie_network_get_outputs_number] [ie_network_get_output_dims]| Managing of network +|Blob Operations|[ie_blob_get_buffer]| Work with memory container for storing inputs, outputs of the network, weights and biases of the layers +|Input Reshape|[ie_network_reshape]| Set the batch size equal to the number of input images + +Basic Inference Engine API is covered by [Hello Classification C sample](../hello_classification/README.md). + +> **NOTE**: This sample uses `ie_network_reshape()` to set the batch size. While supported by SSD networks, reshape may not work with arbitrary topologies. See [Shape Inference Guide](../../../../../docs/IE_DG/ShapeInference.md) for more info. 
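For orientation, the batch-size reshape mentioned in the note boils down to reading the current input shapes, overwriting the batch dimension, and calling `ie_network_reshape`. The fragment below is a hedged sketch of that sequence using the C API types visible in the sample (`input_shapes_t`, `ie_network_input_shapes_free`); the error handling and the assumption that the batch dimension is `dims[0]` of the first input are illustrative:

```c
/* Sketch: set the network batch size to the number of input images. */
input_shapes_t shapes;
IEStatusCode status = ie_network_get_input_shapes(network, &shapes);
if (status != OK)
    goto err;

shapes.shapes[0].shape.dims[0] = image_num;   /* batch dimension of input 0 */
status = ie_network_reshape(network, shapes); /* propagate the new shape    */
ie_network_input_shapes_free(&shapes);
if (status != OK)
    goto err;
```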
+ +| Options | Values | +|:--- |:--- +| Validated Models | Person detection SSD (object detection network) +| Model Format | Inference Engine Intermediate Representation (.xml + .bin), ONNX (.onnx) +| Validated images | The sample uses OpenCV* to [read input image](https://docs.opencv.org/master/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56) (.bmp, .png, .jpg) +| Supported devices | [All](../../../../../docs/IE_DG/supported_plugins/Supported_Devices.md) | +| Other language realization | [C++](../../../../samples/object_detection_sample_ssd/README.md), [Python](../../../python/sample/object_detection_sample_ssd/README.md) | ## How It Works -Upon the start-up the sample application reads command line parameters and loads a network and an image to the Inference -Engine device. When inference is done, the application creates output images and outputs data to the standard output stream. +Upon the start-up the sample application reads command line parameters, loads specified network and image(s) to the Inference +Engine plugin. Then, the sample creates an synchronous inference request object. When inference is done, the application creates output image(s) and output data to the standard output stream. + +You can see the explicit description of +each sample step at [Integration Steps](../../../../../docs/IE_DG/Integrate_with_customer_application_new_API.md) section of "Integrate the Inference Engine with Your Application" guide. -> **NOTE**: By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](../../../../../docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). +## Building -> **NOTE**: This sample uses `ie_network_reshape()` to set the batch size. While supported by SSD networks, reshape may not work with arbitrary topologies. See [Shape Inference Guide](https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_ShapeInference.html) for more info. +To build the sample, please use instructions available at [Build the Sample Applications](../../../../../docs/IE_DG/Samples_Overview.md) section in Inference Engine Samples guide. ## Running +To run the sample, you need specify a model and image: + +- you can use [public](@ref omz_models_public_index) or [Intel's](@ref omz_models_intel_index) pre-trained models from the Open Model Zoo. The models can be downloaded using the [Model Downloader](@ref omz_tools_downloader_README). +- you can use images from the media files collection available at https://storage.openvinotoolkit.org/data/test_data. + Running the application with the -h option yields the following usage message: ```sh @@ -28,39 +57,43 @@ object_detection_sample_ssd_c [OPTION] Options: -h Print a usage message. - -i "" Required. Path to one or more .bmp images. -m "" Required. Path to an .xml file with a trained model. - -l "" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations. + -i "" Required. Path to one or more images or folder with images. + -l "" Required for CPU plugin custom layers. Absolute path to a shared library with the kernels implementations. 
Or - -c "" Required for GPU custom kernels. Absolute path to the .xml file with the kernels descriptions. - -d "" Optional. Specify the target device to infer on (the list of available devices is shown below). Default value is CPU. Use "-d HETERO:" format to specify HETERO plugin. Sample will look for a suitable plugin for device specified + -c "" Required for GPU, MYRIAD, HDDL custom kernels. Absolute path to the .xml config file + with the kernels descriptions. + -d "" Optional. Specify the target device to infer. Default value is CPU. + Use "-d HETERO:" format to specify HETERO plugin. Sample will look for a suitable plugin for device specified -g Path to the configuration file. Default value: "config". ``` -Running the application with the empty list of options yields the usage message given above and an error message. - -To run the sample, you can use [public](@ref omz_models_public_index) or [Intel's](@ref omz_models_intel_index) pre-trained models from the Open Model Zoo. The models can be downloaded using the [Model Downloader](@ref omz_tools_downloader_README). - -> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +> **NOTES**: > -> The sample accepts models in ONNX format (.onnx) that do not require preprocessing. +> - By default, Inference Engine samples and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](../../../../../docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md). +> +> - Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). +> +> - The sample accepts models in ONNX format (.onnx) that do not require preprocessing. For example, to do inference on a CPU with the OpenVINO™ toolkit person detection SSD models, run one of the following commands: +- with one image and [person-detection-retail-0013](https://docs.openvinotoolkit.org/latest/omz_models_intel_person_detection_retail_0013_description_person_detection_retail_0013.html) model + ```sh -./object_detection_sample_ssd_c -i /inputImage.bmp -m person-detection-retail-0013.xml -d CPU +./object_detection_sample_ssd_c -i /inputImage.bmp -m /person-detection-retail-0013.xml -d CPU ``` -or +- with some images and [person-detection-retail-0013](https://docs.openvinotoolkit.org/latest/omz_models_intel_person_detection_retail_0013_description_person_detection_retail_0013.html) model ```sh -./object_detection_sample_ssd_c -i /inputImage1.bmp /inputImage2.bmp ... -m person-detection-retail-0013.xml -d CPU +./object_detection_sample_ssd_c -i /inputImage1.bmp /inputImage2.bmp ... 
-m /person-detection-retail-0013.xml -d CPU ``` -or +- with [person-detection-retail-0002](https://docs.openvinotoolkit.org/latest/omz_models_intel_person_detection_retail_0002_description_person_detection_retail_0002.html) model ```sh -./object_detection_sample_ssd_c -i /inputImage.jpg -m person-detection-retail-0002.xml -d CPU +./object_detection_sample_ssd_c -i -m /person-detection-retail-0002.xml -d CPU ``` ## Sample Output @@ -68,7 +101,59 @@ or The application outputs several images (`out_0.bmp`, `out_1.bmp`, ... ) with detected objects enclosed in rectangles. It outputs the list of classes of the detected objects along with the respective confidence values and the coordinates of the rectangles to the standard output stream. +```sh +object_detection_sample_ssd_c -m person-detection-retail-0013.xml -i image_1.png image_2.jpg + +[ INFO ] InferenceEngine: + +[ INFO ] Parsing input parameters +[ INFO ] Files were added: 2 +[ INFO ] image_1.png +[ INFO ] image_2.jpg +[ INFO ] Loading Inference Engine +[ INFO ] Device info: + CPU + MKLDNNPlugin version ......... + Build ......... +[ INFO ] Loading network: + person-detection-retail-0013.xml +[ INFO ] Preparing input blobs +[ WARNING ] Image is resized from (1699, 960) to (544, 320) +[ WARNING ] Image is resized from (614, 346) to (544, 320) +[ INFO ] Batch size is 2 +[ INFO ] Preparing output blobs +[ INFO ] Loading model to the device +[ INFO ] Create infer request +[ INFO ] Start inference +[ INFO ] Processing output blobs +[0, 1] element, prob = 0.999090 (370, 201)-(634, 762) batch id : 0 WILL BE PRINTED! +[1, 1] element, prob = 0.997386 (836, 192)-(999, 663) batch id : 0 WILL BE PRINTED! +[2, 1] element, prob = 0.314753 (192, 2)-(265, 172) batch id : 0 +... +[ INFO ] Image out_0.bmp created! +[ INFO ] Image out_1.bmp created! 
+[ INFO ] Execution successful + +This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool +``` ## See Also -* [Model Optimizer](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) -* [Model Downloader](@ref omz_tools_downloader_README) + +- [Integrate the Inference Engine with Your Application](../../../../../docs/IE_DG/Integrate_with_customer_application_new_API.md) +- [Using Inference Engine Samples](../../../../../docs/IE_DG/Samples_Overview.md) +- [Model Downloader](@ref omz_tools_downloader_README) +- [Model Optimizer](../../../../../docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md) + +[ie_infer_request_infer_async]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__InferRequest.html#gad2351010e292b6faec959a3d5a8fb60e +[ie_infer_request_wait]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__InferRequest.html#ga0c05e63e63c8d9cdd92900e82b0137c9 +[ie_c_api_version]:https://docs.openvinotoolkit.org/latest/ie_c_api/ie__c__api_8h.html#a8fe3efe9cc606dcc7bec203102043e68 +[ie_core_get_versions]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Core.html#ga2932e188a690393f5d594572ac5d237b +[ie_core_add_extension]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Core.html#gadded2444ba81d2d396516b72c2478f8e +[ie_core_set_config]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Core.html#gaf09d1e77cc264067e4e22ddf99f21ec1 +[ie_network_get_inputs_number]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga6a3349bca66c4ba8b41a434061fccf52 +[ie_network_get_input_dims]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#gac621a654b89d413041cbc2288627f6a5 +[ie_network_get_input_shapes]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga5409734f25ffbb1379e876217c0bc6f3 +[ie_network_get_outputs_number]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga869b8c309797f1e09f73ddffd1b57509 +[ie_network_get_output_dims]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#ga8de7bf2f626f19eba08a2f043fc1b5d2 +[ie_network_reshape]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Network.html#gac4f690afd0c2221f7db2ff9be4aa0637 +[ie_blob_get_buffer]:https://docs.openvinotoolkit.org/latest/ie_c_api/group__Blob.html#ga948e0186cea6a393c113d5c399cfcb4c diff --git a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/c_w_dirent.h b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/c_w_dirent.h index 56c7005b2ea937..c6dc9ee442de0f 100644 --- a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/c_w_dirent.h +++ b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/c_w_dirent.h @@ -42,10 +42,18 @@ #define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) #define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +/// @brief structure to store directory names typedef struct dirent { char *d_name; }dirent; +/** +* @brief Add directory to directory names struct +* @param int argc - count of args +* @param char *argv[] - array values of args +* @param char *opts - array of options +* @return pointer to directory names struct +*/ static dirent *createDirent(const wchar_t *wsFilePath) { dirent *d = (dirent *)malloc(sizeof(dirent)); size_t i; @@ -55,6 +63,11 @@ static dirent *createDirent(const wchar_t *wsFilePath) { return d; } +/** +* @brief Free directory names struct +* @param point to directory names structure +* @return none +*/ static void freeDirent(dirent **d) { free((*d)->d_name); 
(*d)->d_name = NULL; @@ -62,12 +75,19 @@ static void freeDirent(dirent **d) { *d = NULL; } +/// @brief structure to store directory data (files meta) typedef struct DIR { WIN32_FIND_DATAA FindFileData; HANDLE hFind; dirent *next; }DIR; +/** +* @brief Compare two string, second string is the end of the first +* @param string to compare +* @param end string to find +* @return status 1(success) or 0(fail) +*/ static int endsWith(const char *src, const char *with) { int wl = (int)(strlen(with)); int so = (int)(strlen(with)) - wl; @@ -77,6 +97,12 @@ static int endsWith(const char *src, const char *with) { else return 0; } + +/** +* @brief Check file handler is valid +* @param struct of directory data +* @return status 1(success) or 0(fail) +*/ static int isValid(DIR* dp) { if (dp->hFind != INVALID_HANDLE_VALUE && dp->FindFileData.dwReserved0) { return 1; @@ -84,6 +110,12 @@ static int isValid(DIR* dp) { return 0; } } + +/** +* @brief Create directory data struct element +* @param string directory path +* @return pointer to directory data struct element +*/ static DIR *opendir(const char *dirPath) { DIR *dp = (DIR *)malloc(sizeof(DIR)); dp->next = NULL; @@ -103,6 +135,11 @@ static DIR *opendir(const char *dirPath) { return dp; } +/** +* @brief Walk throw directory data struct +* @param pointer to directory data struct +* @return pointer to directory data struct next element +*/ static struct dirent *readdir(DIR *dp) { if (dp->next != NULL) freeDirent(&(dp->next)); @@ -117,6 +154,11 @@ static struct dirent *readdir(DIR *dp) { return dp->next; } +/** +* @brief Remove directory data struct +* @param pointer to struct directory data +* @return none +*/ static void closedir(DIR *dp){ if (dp->next) { freeDirent(&(dp->next)); diff --git a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/main.c b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/main.c index 33099ac95e8e27..a482632399f481 100644 --- a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/main.c +++ b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/main.c @@ -6,6 +6,7 @@ #include #include #include + #include #include "object_detection_sample_ssd.h" #include @@ -21,8 +22,8 @@ static const char *img_msg = NULL; static const char *input_model = NULL; static const char *device_name = "CPU"; -static const char *custom_cldnn_msg = NULL; -static const char *custom_cpu_library_msg = NULL; +static const char *custom_plugin_cfg_msg = NULL; +static const char *custom_ex_library_msg = NULL; static const char *config_msg = NULL; static int file_num = 0; static char **file_paths = NULL; @@ -30,6 +31,12 @@ static char **file_paths = NULL; const char *info = "[ INFO ] "; const char *warn = "[ WARNING ] "; +/** +* @brief Parse and check command line arguments +* @param int argc - count of args +* @param char *argv[] - array values of args +* @return int - status 1(success) or -1(fail) +*/ int ParseAndCheckCommandLine(int argc, char *argv[]) { int opt = 0; int help = 0; @@ -53,12 +60,12 @@ int ParseAndCheckCommandLine(int argc, char *argv[]) { device_name = optarg; break; case 'c': - custom_cldnn_msg = optarg; + custom_plugin_cfg_msg = optarg; break; case 'l': - custom_cpu_library_msg = optarg; + custom_ex_library_msg = optarg; break; - case 'f': + case 'g': config_msg = optarg; break; default: @@ -69,11 +76,11 @@ int ParseAndCheckCommandLine(int argc, char *argv[]) { if (help) return -1; if (input_model == NULL) { - printf("Model is required but not set. Please set -m option. 
\n"); + printf("Model is required but not set. Please set -m option.\n"); return -1; } if (img_msg == NULL) { - printf("Input is required but not set.Please set - i option.\n"); + printf("Input is required but not set.Please set -i option.\n"); return -1; } @@ -138,15 +145,6 @@ void readInputFilesArgument(const char *arg) { } file_paths[file_num++] = file_path; } - - if (file_num) { - printf("%sFiles were added: %d\n", info, file_num); - for (i = 0; i < file_num; ++i) { - printf("%s %s\n", info, file_paths[i]); - } - } else { - printf("%sFiles were added: %d. Too many to display each of them.\n", info, file_num); - } } /** @@ -168,10 +166,19 @@ void parseInputFilesArguments(int argc, char **argv) { } readInputFilesArgument(argv[i]); } + + if (file_num) { + printf("%sFiles were added: %d\n", info, file_num); + for (i = 0; i < file_num; ++i) { + printf("%s %s\n", info, file_paths[i]); + } + } else { + printf("%sFiles were added: %d. Too many to display each of them.\n", info, file_num); + } } /** -* @brief Convert the contents of configuration file to the ie_config_t type. +* @brief Convert the contents of configuration file to the ie_config_t struct. * @param config_file File path. * @param comment Separator symbol. * @return A pointer to the ie_config_t instance. @@ -274,11 +281,14 @@ void int2str(char *str, int num) { int main(int argc, char **argv) { /** This sample covers certain topology and cannot be generalized for any object detection one **/ + // ------------------------------ Get Inference Engine API version --------------------------------- ie_version_t version = ie_c_api_version(); printf("%sInferenceEngine: \n", info); printf("%s\n", version.api_version); ie_version_free(&version); + // ------------------------------ Parsing and validation of input args --------------------------------- + char **argv_temp =(char **)calloc(argc, sizeof(char *)); if (!argv_temp) { return EXIT_FAILURE; @@ -296,14 +306,13 @@ int main(int argc, char **argv) { ie_infer_request_t *infer_request = NULL; ie_blob_t *imageInput = NULL, *output_blob = NULL; - // --------------------------- 1. Parsing and validation of input args --------------------------------- if (ParseAndCheckCommandLine(argc, argv) < 0) { free(argv_temp); return EXIT_FAILURE; } // ----------------------------------------------------------------------------------------------------- - // --------------------------- 2. Read input ----------------------------------------------------------- + // --------------------------- Read input ----------------------------------------------------------- /** This file_paths stores paths to the processed images **/ parseInputFilesArguments(argc, argv_temp); if (!file_num) { @@ -313,12 +322,14 @@ int main(int argc, char **argv) { } // ----------------------------------------------------------------------------------------------------- - // --------------------------- 3. Load inference engine ------------------------------------------------ + // --------------------------- Step 1. 
Initialize inference engine core ------------------------------------- + printf("%sLoading Inference Engine\n", info); IEStatusCode status = ie_core_create("", &core); if (status != OK) goto err; + // ------------------------------ Get Available Devices ------------------------------------------------------ ie_core_versions_t ver; printf("%sDevice info: \n", info); status = ie_core_get_versions(core, device_name, &ver); @@ -331,25 +342,25 @@ int main(int argc, char **argv) { } ie_core_versions_free(&ver); - if (custom_cpu_library_msg) { - // CPU(MKLDNN) extensions are loaded as a shared library and passed as a pointer to base extension - status = ie_core_add_extension(core, custom_cpu_library_msg, "CPU"); + if (custom_ex_library_msg) { + // Custom CPU extension is loaded as a shared library and passed as a pointer to base extension + status = ie_core_add_extension(core, custom_ex_library_msg, "CPU"); if (status != OK) goto err; - printf("%sCPU Extension loaded: %s\n", info, custom_cpu_library_msg); + printf("%sCustom extension loaded: %s\n", info, custom_ex_library_msg); } - if (custom_cldnn_msg) { - // clDNN Extensions are loaded from an .xml description and OpenCL kernel files - ie_config_t cfg = {"CONFIG_FILE", custom_cldnn_msg, NULL}; - status = ie_core_set_config(core, &cfg, "GPU"); + if (custom_plugin_cfg_msg && (device_name == "GPU" || device_name == "MYRIAD" || device_name == "HDDL")) { + // Config for device plugin custom extension is loaded from an .xml description + ie_config_t cfg = {"CONFIG_FILE", custom_plugin_cfg_msg, NULL}; + status = ie_core_set_config(core, &cfg, device_name); if (status != OK) goto err; - printf("%sGPU Extension loaded: %s\n", info, custom_cldnn_msg); + printf("%sConfig for device plugin custom extension loaded: %s\n", info, custom_plugin_cfg_msg); } // ----------------------------------------------------------------------------------------------------- - // 4. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format + // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format printf("%sLoading network:\n", info); printf("\t%s\n", input_model); status = ie_core_read_network(core, input_model, NULL, &network); @@ -357,7 +368,8 @@ int main(int argc, char **argv) { goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 5. Prepare input blobs -------------------------------------------------- + // --------------------------- Step 3. Configure input & output --------------------------------------------- + // --------------------------- Prepare input blobs ----------------------------------------------------- printf("%sPreparing input blobs\n", info); /** SSD network has one input and one output **/ @@ -494,9 +506,8 @@ int main(int argc, char **argv) { size_t batchSize = shapes2.shapes[0].shape.dims[0]; ie_network_input_shapes_free(&shapes2); printf("%sBatch size is %zu\n", info, batchSize); - // ----------------------------------------------------------------------------------------------------- - // --------------------------- 6. 
Prepare output blobs ------------------------------------------------- + // --------------------------- Prepare output blobs ---------------------------------------------------- printf("%sPreparing output blobs\n", info); size_t output_num = 0; @@ -534,7 +545,7 @@ int main(int argc, char **argv) { goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 7. Loading model to the device ------------------------------------------ + // --------------------------- Step 4. Loading model to the device ------------------------------------------ printf("%sLoading model to the device\n", info); if (config_msg) { ie_config_t * config = parseConfig(config_msg, '#'); @@ -552,15 +563,14 @@ int main(int argc, char **argv) { // ----------------------------------------------------------------------------------------------------- - // --------------------------- 8. Create infer request ------------------------------------------------- + // --------------------------- Step 5. Create infer request ------------------------------------------------- printf("%sCreate infer request\n", info); status = ie_exec_network_create_infer_request(exe_network, &infer_request); if (status != OK) goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 9. Prepare input -------------------------------------------------------- - + // --------------------------- Step 6. Prepare input -------------------------------------------------------- /** Creating input blob **/ status = ie_infer_request_get_blob(infer_request, imageInputName, &imageInput); @@ -624,7 +634,7 @@ int main(int argc, char **argv) { } // ----------------------------------------------------------------------------------------------------- - // --------------------------- 10. Do inference --------------------------------------------------------- + // --------------------------- Step 7. Do inference -------------------------------------------------------- printf("%sStart inference\n", info); status = ie_infer_request_infer_async(infer_request); status |= ie_infer_request_wait(infer_request, -1); @@ -632,7 +642,7 @@ int main(int argc, char **argv) { goto err; // ----------------------------------------------------------------------------------------------------- - // --------------------------- 11. Process output ------------------------------------------------------- + // --------------------------- Step 8. 
Process output ------------------------------------------------------ printf("%sProcessing output blobs\n", info); status = ie_infer_request_get_blob(infer_request, output_name, &output_blob); @@ -706,6 +716,7 @@ int main(int argc, char **argv) { // ----------------------------------------------------------------------------------------------------- printf("%sExecution successful\n", info); + printf("\nThis sample is an API example, for any performance measurements please use the dedicated benchmark_app tool\n"); for (i = 0; i < image_num; ++i) { free(classes[i]); diff --git a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/object_detection_sample_ssd.h b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/object_detection_sample_ssd.h index c097aaf968b8cb..e8e1c126e797e8 100644 --- a/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/object_detection_sample_ssd.h +++ b/inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/object_detection_sample_ssd.h @@ -13,19 +13,19 @@ static const char *help_message = "Print a usage message."; static const char* model_message = "Required. Path to an .xml file with a trained model."; /// @brief message for images argument -static const char *image_message = "Required. Path to one or more .bmp images."; +static const char *image_message = "Required. Path to one or more images or folder with images."; /// @brief message for assigning cnn calculation to device -static const char *target_device_message = "Optional. Specify the target device to infer on (the list of available devices is shown below). " \ +static const char *target_device_message = "Optional. Specify the target device to infer. " \ "Default value is CPU. Use \"-d HETERO:\" format to specify HETERO plugin. " \ -"Sample will look for a suitable plugin for device specified"; +"Sample will look for a suitable plugin for device specified."; -/// @brief message for clDNN custom kernels desc -static const char *custom_cldnn_message = "Required for GPU custom kernels. "\ -"Absolute path to the .xml file with the kernels descriptions."; +/// @brief message for plugin custom kernels desc +static const char *custom_plugin_config_message = "Required for GPU, MYRIAD, HDDL custom kernels. "\ +"Absolute path to the .xml config file with the kernels descriptions."; -/// @brief message for user library argument -static const char *custom_cpu_library_message = "Required for CPU custom layers. " \ +/// @brief message for user extension library argument +static const char *custom_ex_library_message = "Required for CPU plugin custom layers. " \ "Absolute path to a shared library with the kernels implementations."; /// @brief message for config argument @@ -34,14 +34,14 @@ static const char *config_message = "Path to the configuration file. 
Default val * \brief This function show a help message */ static void showUsage() { - printf("\nobject_detection_sample_ssd [OPTION]\n"); + printf("\nobject_detection_sample_ssd_c [OPTION]\n"); printf("Options:\n\n"); printf(" -h %s\n", help_message); printf(" -m \"\" %s\n", model_message); printf(" -i \"\" %s\n", image_message); - printf(" -l \"\" %s\n", custom_cpu_library_message); + printf(" -l \"\" %s\n", custom_ex_library_message); printf(" Or\n"); - printf(" -c \"\" %s\n", custom_cldnn_message); + printf(" -c \"\" %s\n", custom_plugin_config_message); printf(" -d \"\" %s\n", target_device_message); printf(" -g %s\n", config_message); } @@ -58,6 +58,13 @@ char *optarg; fputc(c, stderr);\ fputs("\'\n", stderr);} +/** +* @brief Check command line arguments with available options +* @param int argc - count of args +* @param char *argv[] - array values of args +* @param char *opts - array of options +* @return option name or -1(fail) +*/ static int getopt(int argc, char **argv, char *opts) { static int sp = 1; register int c = 0; diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp index 29cb1acfd61fe4..0faab4baccd721 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp @@ -23,6 +23,7 @@ #include #include +#include typedef std::chrono::high_resolution_clock Time; typedef std::chrono::nanoseconds ns; diff --git a/inference-engine/ie_bridges/python/tests/test_Blob.py b/inference-engine/ie_bridges/python/tests/test_Blob.py index 91c16102b5046b..1f0d00519ab02c 100644 --- a/inference-engine/ie_bridges/python/tests/test_Blob.py +++ b/inference-engine/ie_bridges/python/tests/test_Blob.py @@ -39,7 +39,22 @@ def test_get_buffer(): blob = Blob(tensor_desc, array) assert np.array_equal(blob.buffer, array) -def write_to_buffer(precision, numpy_precision): + +@pytest.mark.parametrize("precision, numpy_precision", [ + ("FP32", np.float32), + ("FP64", np.float64), + ("FP16", np.float16), + ("I8", np.int8), + ("U8", np.uint8), + ("I32", np.int32), + ("I16", np.int16), + ("U16", np.uint16), + ("I64", np.int64), + ("BOOL", np.uint8), + ("BIN", np.int8), + ("BF16", np.float16), +]) +def test_write_to_buffer(precision, numpy_precision): tensor_desc = TensorDesc(precision, [1, 3, 127, 127], "NCHW") array = np.zeros(shape=(1, 3, 127, 127), dtype=numpy_precision) blob = Blob(tensor_desc, array) @@ -47,53 +62,6 @@ def write_to_buffer(precision, numpy_precision): blob.buffer[:] = ones_arr assert np.array_equal(blob.buffer, ones_arr) -def test_write_to_buffer_fp32(): - write_to_buffer("FP32", np.float32) - - -def test_write_to_buffer_fp64(): - write_to_buffer("FP64", np.float64) - - -def test_write_to_buffer_fp16(): - write_to_buffer("FP16", np.float16) - - -def test_write_to_buffer_int8(): - write_to_buffer("I8", np.int8) - - -def test_write_to_buffer_uint8(): - write_to_buffer("U8", np.uint8) - - -def test_write_to_buffer_int32(): - write_to_buffer("I32", np.int32) - - -def test_write_to_buffer_int16(): - write_to_buffer("I16", np.int16) - - -def test_write_to_buffer_uint16(): - write_to_buffer("U16", np.uint16) - - -def test_write_to_buffer_int64(): - write_to_buffer("I64", np.int64) - - -def test_write_to_buffer_bool(): - write_to_buffer("BOOL", np.uint8) - - -def test_write_to_buffer_bin(): - write_to_buffer("BIN", np.int8) - - -def test_write_to_buffer_bf16(): 
- write_to_buffer("BF16", np.float16) - def test_write_numpy_scalar_int64(): tensor_desc = TensorDesc("I64", [], "SCALAR") diff --git a/inference-engine/ie_bridges/python/tests/test_ExecutableNetwork.py b/inference-engine/ie_bridges/python/tests/test_ExecutableNetwork.py index d722608696cd82..2193a6501c24bb 100644 --- a/inference-engine/ie_bridges/python/tests/test_ExecutableNetwork.py +++ b/inference-engine/ie_bridges/python/tests/test_ExecutableNetwork.py @@ -234,10 +234,11 @@ def test_plugin_accessible_after_deletion(device): del ie_core -@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") == "ARM", - reason=f"Cannot run test on device {os.environ.get('TEST_DEVICE')}") def test_exec_graph(device): ie_core = ie.IECore() + if device == "CPU": + if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to get_exec_graph_info method isn't implemented") net = ie_core.read_network(model=test_net_xml, weights=test_net_bin) exec_net = ie_core.load_network(net, device) img = read_image() @@ -294,9 +295,11 @@ def test_get_metric(device): assert network_name == "test_model" -@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason="Device independent test") +@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason="Device dependent test") def test_get_config(device): ie_core = ie.IECore() + if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to CPU dependent test") net = ie_core.read_network(model=test_net_xml, weights=test_net_bin) exec_net = ie_core.load_network(net, device) config = exec_net.get_config("PERF_COUNT") diff --git a/inference-engine/ie_bridges/python/tests/test_IECore.py b/inference-engine/ie_bridges/python/tests/test_IECore.py index 2684def8b41752..41d28f1c41b1f6 100644 --- a/inference-engine/ie_bridges/python/tests/test_IECore.py +++ b/inference-engine/ie_bridges/python/tests/test_IECore.py @@ -4,7 +4,6 @@ import os import pytest from sys import platform -import numpy as np from pathlib import Path from openvino.inference_engine import IENetwork, IECore, ExecutableNetwork @@ -61,8 +60,11 @@ def test_load_network_wrong_device(): def test_query_network(device): - import ngraph as ng ie = IECore() + if device == "CPU": + if ie.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to ngraph") + import ngraph as ng net = ie.read_network(model=test_net_xml, weights=test_net_bin) query_res = ie.query_network(net, device) func_net = ng.function_from_cnn(net) @@ -73,18 +75,22 @@ def test_query_network(device): assert next(iter(set(query_res.values()))) == device, "Wrong device for some layers" -@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason="Device independent test") +@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason="Device dependent test") def test_register_plugin(): ie = IECore() + if ie.get_metric("CPU", "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to MKLDNNPlugin specific test") ie.register_plugin("MKLDNNPlugin", "BLA") net = ie.read_network(model=test_net_xml, weights=test_net_bin) exec_net = ie.load_network(net, "BLA") assert isinstance(exec_net, ExecutableNetwork), "Cannot load the network to the registered plugin with name 'BLA'" -@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason="Device independent test") 
+@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason="Device dependent test") def test_register_plugins(): ie = IECore() + if ie.get_metric("CPU", "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to MKLDNNPlugin specific test") if platform == "linux" or platform == "linux2": ie.register_plugins(plugins_xml) elif platform == "darwin": @@ -126,11 +132,12 @@ def test_get_metric_list_of_str(): "metric are strings!" - @pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", reason=f"Cannot run test on device {os.environ.get('TEST_DEVICE')}, Plugin specific test") def test_get_metric_tuple_of_two_ints(): ie = IECore() + if ie.get_metric("CPU", "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to unsupported device metric") param = ie.get_metric("CPU", "RANGE_FOR_STREAMS") assert isinstance(param, tuple), "Parameter value for 'RANGE_FOR_STREAMS' " \ f"metric must be tuple but {type(param)} is returned" @@ -142,6 +149,8 @@ def test_get_metric_tuple_of_two_ints(): reason=f"Cannot run test on device {os.environ.get('TEST_DEVICE')}, Plugin specific test") def test_get_metric_tuple_of_three_ints(): ie = IECore() + if ie.get_metric("CPU", "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to unsupported device metric") param = ie.get_metric("CPU", "RANGE_FOR_ASYNC_INFER_REQUESTS") assert isinstance(param, tuple), "Parameter value for 'RANGE_FOR_ASYNC_INFER_REQUESTS' " \ f"metric must be tuple but {type(param)} is returned" @@ -185,21 +194,25 @@ def test_read_network_from_onnx(): net = ie.read_network(model=test_net_onnx) assert isinstance(net, IENetwork) + def test_read_network_from_onnx_as_path(): ie = IECore() net = ie.read_network(model=Path(test_net_onnx)) assert isinstance(net, IENetwork) + def test_read_network_from_prototxt(): ie = IECore() net = ie.read_network(model=test_net_prototxt) assert isinstance(net, IENetwork) + def test_read_network_from_prototxt_as_path(): ie = IECore() net = ie.read_network(model=Path(test_net_prototxt)) assert isinstance(net, IENetwork) + def test_incorrect_xml(): ie = IECore() with pytest.raises(Exception) as e: diff --git a/inference-engine/ie_bridges/python/tests/test_IENetwork.py b/inference-engine/ie_bridges/python/tests/test_IENetwork.py index 60e17268ede4e3..1c3474e689134d 100644 --- a/inference-engine/ie_bridges/python/tests/test_IENetwork.py +++ b/inference-engine/ie_bridges/python/tests/test_IENetwork.py @@ -4,7 +4,6 @@ import os import pytest import warnings -import numpy as np from openvino.inference_engine import IECore, IENetwork, DataPtr, InputInfoPtr, PreProcessInfo from conftest import model_path @@ -183,9 +182,12 @@ def test_batch_size_after_reshape(): assert net.input_info['data'].input_data.shape == [8, 3, 32, 32] -def test_serialize(): - import ngraph as ng +def test_serialize(device): ie = IECore() + if device == "CPU": + if ie.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to ngraph") + import ngraph as ng net = ie.read_network(model=test_net_xml, weights=test_net_bin) net.serialize("./serialized_net.xml", "./serialized_net.bin") serialized_net = ie.read_network(model="./serialized_net.xml", weights="./serialized_net.bin") diff --git a/inference-engine/ie_bridges/python/tests/test_InferRequest.py b/inference-engine/ie_bridges/python/tests/test_InferRequest.py index b2783f5cad9c10..5a534703598fc3 100644 --- 
a/inference-engine/ie_bridges/python/tests/test_InferRequest.py +++ b/inference-engine/ie_bridges/python/tests/test_InferRequest.py @@ -376,6 +376,9 @@ def execute(self, input_data): def test_get_perf_counts(device): ie_core = ie.IECore() + if device == "CPU": + if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to ngraph") net = ie_core.read_network(test_net_xml, test_net_bin) ie_core.set_config({"PERF_COUNT": "YES"}, device) exec_net = ie_core.load_network(net, device) @@ -395,6 +398,8 @@ def test_get_perf_counts(device): "Dynamic batch fully supported only on CPU") def test_set_batch_size(device): ie_core = ie.IECore() + if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin due-to dynamic batch isn't supported") ie_core.set_config({"DYN_BATCH_ENABLED": "YES"}, device) net = ie_core.read_network(test_net_xml, test_net_bin) net.batch_size = 10 @@ -438,6 +443,9 @@ def test_set_negative_batch_size(device): def test_blob_setter(device): ie_core = ie.IECore() + if device == "CPU": + if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin") net = ie_core.read_network(test_net_xml, test_net_bin) exec_net_1 = ie_core.load_network(network=net, device_name=device, num_requests=1) diff --git a/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.requirements.txt b/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.requirements.txt index 9fbd32370e952c..487eac4ce12803 100644 --- a/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.requirements.txt +++ b/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.requirements.txt @@ -1,9 +1,9 @@ defusedxml>=0.5.0 scipy==1.5.4 jstyleson==0.0.2 -numpy==1.16.6 +numpy~=1.18.5 addict==2.2.1 -pandas==0.24.2 +pandas~=1.1.5 hyperopt==0.1.2 networkx==2.2 tqdm==4.31.1 @@ -11,17 +11,17 @@ texttable==1.6.3 py-cpuinfo!=5.0,!=6.0 PyYAML>=5.4.1 pillow>=8.1.0 -scikit-image -scikit-learn -yamlloader -shapely -nibabel -pydicom -sentencepiece -tokenizers -editdistance -parasail -fast-ctc-decode -rawpy -nltk -opencv-python +scikit-image>=0.17 +scikit-learn>=0.23 +yamlloader>=0.5 +shapely>=1.7 +nibabel>=3.1 +pydicom>=2.0 +sentencepiece>=0.1.91 +tokenizers>=0.8 +editdistance>=0.5 +parasail>=1.2 +fast-ctc-decode>=0.2 +rawpy>=0.15 +nltk>=3.5 +opencv-python>=4.4 diff --git a/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.setup.cfg b/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.setup.cfg index 9b9012f08bb6c3..77c423416baa0c 100644 --- a/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.setup.cfg +++ b/inference-engine/ie_bridges/python/wheel/meta/openvino-dev.setup.cfg @@ -1,25 +1,27 @@ [options] -py_modules = - mo - mo_tf - mo_caffe - mo_mxnet - mo_onnx - mo_kaldi +py_modules = + mo + mo_tf + mo_caffe + mo_mxnet + mo_onnx + mo_kaldi [options.package_data] - mo = *.txt + mo = *.txt + compression.configs.hardware = *.json [options.entry_points] console_scripts = + mo=mo.__main__:main pot=app.run:main accuracy_check=accuracy_checker.main:main convert_annotation=accuracy_checker.annotation_converters.convert:main [metadata] license_files = - readme* - *LICENSE* - *license* - *third-party-programs* - *EULA* + readme* + *LICENSE* + *license* + *third-party-programs* + *EULA* diff --git a/inference-engine/include/cldnn/cldnn_config.hpp b/inference-engine/include/cldnn/cldnn_config.hpp index 111eff4104f913..faf198517fc9d4 100644 --- 
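The requirements move from exact pins to PEP 440 compatible-release specifiers, so `numpy~=1.18.5` accepts any 1.18.x at or above 1.18.5 but rejects 1.19; a quick check of the semantics with the `packaging` module:

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=1.18.5")    # equivalent to ">=1.18.5, ==1.18.*"
print("1.18.5" in spec)            # True
print("1.18.9" in spec)            # True
print("1.19.0" in spec)            # False
```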
a/inference-engine/include/cldnn/cldnn_config.hpp +++ b/inference-engine/include/cldnn/cldnn_config.hpp @@ -72,6 +72,11 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS); */ DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS); +/** +* @brief This key sets the max number of host threads that can be used by GPU plugin on model loading. +* Default value is maximum number of threads available in the environment. +*/ +DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS); } // namespace CLDNNConfigParams } // namespace InferenceEngine diff --git a/inference-engine/include/cpp/ie_executable_network.hpp b/inference-engine/include/cpp/ie_executable_network.hpp index 9716d40bacf2d9..94b84f66b528ea 100644 --- a/inference-engine/include/cpp/ie_executable_network.hpp +++ b/inference-engine/include/cpp/ie_executable_network.hpp @@ -16,18 +16,26 @@ #include "cpp/ie_cnn_network.h" #include "cpp/ie_infer_request.hpp" -#include "cpp/ie_memory_state.hpp" -#include "ie_iexecutable_network.hpp" -#include "details/ie_so_loader.h" namespace InferenceEngine { +namespace details { +class SharedObjectLoader; +} + +class IExecutableNetworkInternal; +class IExecutableNetwork; /** * @brief This is an interface of an executable network */ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { - IExecutableNetwork::Ptr actual; - details::SharedObjectLoader::Ptr plg; + std::shared_ptr _impl; + std::shared_ptr _so; + + explicit ExecutableNetwork(const std::shared_ptr& impl, + const std::shared_ptr& so); + + friend class InferencePlugin; public: /** @@ -40,14 +48,6 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { */ ~ExecutableNetwork(); - /** - * @brief Constructs ExecutableNetwork from the initialized shared_pointer - * - * @param actual Initialized shared pointer - * @param plg Plugin to use - */ - explicit ExecutableNetwork(IExecutableNetwork::Ptr actual, details::SharedObjectLoader::Ptr plg = {}); - /** * @brief Gets the Executable network output Data node information. * @@ -74,10 +74,11 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { /** * @brief reset owned object to new pointer. * - * Eessential for cases when simultaneously loaded networks not expected. + * Essential for cases when simultaneously loaded networks not expected. * @param newActual actual pointed object */ - void reset(IExecutableNetwork::Ptr newActual); + INFERENCE_ENGINE_DEPRECATED("Will be removed") + void reset(std::shared_ptr newActual); /** * @brief Creates an inference request object used to infer the network. @@ -94,6 +95,7 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { * Wraps IExecutableNetwork::CreateInferRequest. * @return shared pointer on InferenceEngine::InferRequest object */ + INFERENCE_ENGINE_DEPRECATED("Use CreateInferRequest") InferRequest::Ptr CreateInferRequestPtr(); /** @@ -118,7 +120,8 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { * @brief cast operator is used when this wrapper initialized by LoadNetwork * @return A shared pointer to IExecutableNetwork interface. */ - operator IExecutableNetwork::Ptr&(); + INFERENCE_ENGINE_DEPRECATED("Will be removed") + operator std::shared_ptr(); /** * @copybrief IExecutableNetwork::GetExecGraphInfo @@ -151,7 +154,7 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { * The method is responsible to extract information * which affects executable network execution. The list of supported configuration values can be extracted via * ExecutableNetwork::GetMetric with the SUPPORTED_CONFIG_KEYS key, but some of these keys cannot be changed - * dymanically, e.g. 
DEVICE_ID cannot changed if an executable network has already been compiled for particular + * dynamically, e.g. DEVICE_ID cannot changed if an executable network has already been compiled for particular * device. * * @param name config key, can be found in ie_plugin_config.hpp @@ -178,9 +181,15 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { RemoteContext::Ptr GetContext() const; /** - * @brief A smart pointer to the ExecutableNetwork object + * @brief Checks if current ExecutableNetwork object is not initialized + * @return true if current ExecutableNetwork object is not initialized, false - otherwise + */ + bool operator!() const noexcept; + /** + * @brief Checks if current ExecutableNetwork object is initialized + * @return true if current ExecutableNetwork object is initialized, false - otherwise */ - using Ptr = std::shared_ptr; + explicit operator bool() const noexcept; }; } // namespace InferenceEngine diff --git a/inference-engine/include/details/ie_so_loader.h b/inference-engine/include/details/ie_so_loader.h index 6ba85360271ff1..aa1a82a2281a6d 100644 --- a/inference-engine/include/details/ie_so_loader.h +++ b/inference-engine/include/details/ie_so_loader.h @@ -46,7 +46,7 @@ class INFERENCE_ENGINE_API_CLASS(SharedObjectLoader) { /** * @brief A destructor */ - ~SharedObjectLoader() noexcept(false); + ~SharedObjectLoader(); /** * @brief Searches for a function symbol in the loaded module diff --git a/inference-engine/include/details/ie_so_pointer.hpp b/inference-engine/include/details/ie_so_pointer.hpp index 3342d03af74b8c..b2926ee44ecae7 100644 --- a/inference-engine/include/details/ie_so_pointer.hpp +++ b/inference-engine/include/details/ie_so_pointer.hpp @@ -123,7 +123,7 @@ IE_SUPPRESS_DEPRECATED_END } explicit operator bool() const noexcept { - return (nullptr != _so_loader) && (nullptr != _pointedObj); + return (nullptr != _pointedObj); } friend bool operator==(std::nullptr_t, const SOPointer& ptr) noexcept { @@ -145,7 +145,7 @@ IE_SUPPRESS_DEPRECATED_END return *this; } - operator std::shared_ptr() const noexcept { + operator const std::shared_ptr&() const noexcept { return _so_loader; } diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp index 8b91de3b066e85..7273f5a0332ae0 100644 --- a/inference-engine/include/gna/gna_config.hpp +++ b/inference-engine/include/gna/gna_config.hpp @@ -43,12 +43,11 @@ namespace GNAConfigParams { DECLARE_GNA_CONFIG_KEY(SCALE_FACTOR); /** -* @brief By default gna api work in Int16 precision, however this can be adjusted if necessary, +* @brief By default gna api works with Int16 weights precision, however this can be adjusted if necessary, * currently supported values are I16, I8 */ DECLARE_GNA_CONFIG_KEY(PRECISION); - /** * @brief if turned on, dump GNA firmware model into specified file */ diff --git a/inference-engine/include/ie_common.h b/inference-engine/include/ie_common.h index 263579d62dd6db..d14f26e70e692a 100644 --- a/inference-engine/include/ie_common.h +++ b/inference-engine/include/ie_common.h @@ -281,6 +281,11 @@ struct QueryNetworkResult { ResponseDesc resp; }; +/** + * @brief A collection that contains string as key, and const Data smart pointer as value + */ +using ConstOutputsDataMap = std::map; + namespace details { struct INFERENCE_ENGINE_DEPRECATED("Use InferRequest::Exception") INFERENCE_ENGINE_API_CLASS(InferenceEngineException) : public std::runtime_error { diff --git a/inference-engine/include/ie_iexecutable_network.hpp 
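The reworded comment above clarifies that GNA_CONFIG_KEY(PRECISION) selects the weights precision (I16 by default, I8 optionally). A hedged sketch of selecting I8 from Python; the literal key string "GNA_PRECISION" is assumed from the macro name, and the model paths are placeholders:

```python
from openvino.inference_engine import IECore

ie = IECore()
ie.set_config({"GNA_PRECISION": "I8"}, "GNA")                  # key string assumed from the macro name
net = ie.read_network(model="model.xml", weights="model.bin")  # placeholder paths
exec_net = ie.load_network(net, "GNA")
```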
b/inference-engine/include/ie_iexecutable_network.hpp index 04bbd2df052905..16c1e9d971e284 100644 --- a/inference-engine/include/ie_iexecutable_network.hpp +++ b/inference-engine/include/ie_iexecutable_network.hpp @@ -23,16 +23,11 @@ #include "ie_remote_context.hpp" namespace InferenceEngine { - -/** - * @brief A collection that contains string as key, and const Data smart pointer as value - */ -using ConstOutputsDataMap = std::map; - /** * @brief This is an interface of an executable network */ -class IExecutableNetwork : public std::enable_shared_from_this { +class INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::ExecutableNetwork instead") IExecutableNetwork + : public std::enable_shared_from_this { public: /** * @brief A smart pointer to the current IExecutableNetwork object diff --git a/inference-engine/samples/benchmark_app/README.md b/inference-engine/samples/benchmark_app/README.md index 084edf45a046f1..d3aa8b5e489134 100644 --- a/inference-engine/samples/benchmark_app/README.md +++ b/inference-engine/samples/benchmark_app/README.md @@ -74,49 +74,51 @@ InferenceEngine: benchmark_app [OPTION] Options: - -h, --help Print a usage message - -m "" Required. Path to an .xml/.onnx/.prototxt file with a trained model or to a .blob files with a trained compiled model. - -i "" Optional. Path to a folder with images and/or binaries or to specific image or binary file. - -d "" Optional. Specify a target device to infer on (the list of available devices is shown below). Default value is CPU. - Use "-d HETERO:" format to specify HETERO plugin. - Use "-d MULTI:" format to specify MULTI plugin. + -h, --help Print a usage message + -m "" Required. Path to an .xml/.onnx/.prototxt file with a trained model or to a .blob files with a trained compiled model. + -i "" Optional. Path to a folder with images and/or binaries or to specific image or binary file. + -d "" Optional. Specify a target device to infer on (the list of available devices is shown below). Default value is CPU. + Use "-d HETERO:" format to specify HETERO plugin. + Use "-d MULTI:" format to specify MULTI plugin. The application looks for a suitable plugin for the specified device. - -l "" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations. + -l "" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations. Or - -c "" Required for GPU custom kernels. Absolute path to an .xml file with the kernels description. - -api "" Optional. Enable Sync/Async API. Default value is "async". - -niter "" Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device. - -nireq "" Optional. Number of infer requests. Default value is determined automatically for a device. - -b "" Optional. Batch size value. If not specified, the batch size value is determined from Intermediate Representation. - -stream_output Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a multiline output. - -t Optional. Time, in seconds, to execute topology. - -progress Optional. Show progress bar (can affect performance measurement). Default values is "false". - -shape Optional. Set shape for input. For example, "input1[1,3,224,224],input2[1,4]" or "[1,3,224,224]" in case of one input size. - -layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size. + -c "" Required for GPU custom kernels. 
Absolute path to an .xml file with the kernels description. + -api "" Optional. Enable Sync/Async API. Default value is "async". + -niter "" Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device. + -nireq "" Optional. Number of infer requests. Default value is determined automatically for a device. + -b "" Optional. Batch size value. If not specified, the batch size value is determined from Intermediate Representation. + -stream_output Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a multiline output. + -t Optional. Time, in seconds, to execute topology. + -progress Optional. Show progress bar (can affect performance measurement). Default values is "false". + -shape Optional. Set shape for input. For example, "input1[1,3,224,224],input2[1,4]" or "[1,3,224,224]" in case of one input size. + -layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size. CPU-specific performance options: - -nstreams "" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices - (for HETERO and MULTI device cases use format :,: or just ). - Default value is determined automatically for a device. - Please note that although the automatic selection usually provides a reasonable performance, - it still may be non-optimal for some cases, especially for very small networks. - Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency - estimations the number of streams should be set to 1. - -nthreads "" Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases). - -enforcebf16 Optional. Enforcing of floating point operations execution in bfloat16 precision on platforms with native bfloat16 support. By default, this key sets "true" on platforms with native bfloat16 support and "false" for other platforms. Use "-enforcebf16=false" to disable this feature. - -pin "YES"/"NO"/"NUMA" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU threads pinning for CPU-involved inference. - -ip "U8"/"FP16"/"FP32" Optional. Specifies precision for all input layers of the network. - -op "U8"/"FP16"/"FP32" Optional. Specifies precision for all output layers of the network. - -iop Optional. Specifies precision for input and output layers by name. Example: -iop "input:FP16, output:FP16". Notice that quotes are required. Overwrites precision from ip and op options for specified layers. + -nstreams "" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices + (for HETERO and MULTI device cases use format :,: or just ). + Default value is determined automatically for a device. + Please note that although the automatic selection usually provides a reasonable performance, + it still may be non-optimal for some cases, especially for very small networks. + Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency + estimations the number of streams should be set to 1. + -nthreads "" Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases). + -enforcebf16="" Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. + 'true' - enable bfloat16 regardless of platform support + 'false' - disable bfloat16 regardless of platform support. 
+ -pin "YES"/"NO"/"NUMA" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU threads pinning for CPU-involved inference. + -ip "U8"/"FP16"/"FP32" Optional. Specifies precision for all input layers of the network. + -op "U8"/"FP16"/"FP32" Optional. Specifies precision for all output layers of the network. + -iop Optional. Specifies precision for input and output layers by name. Example: -iop "input:FP16, output:FP16". Notice that quotes are required. Overwrites precision from ip and op options for specified layers. Statistics dumping options: - -report_type "" Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request. - -report_folder Optional. Path to a folder where statistics report is stored. - -exec_graph_path Optional. Path to a file where to store executable graph information serialized. - -pc Optional. Report performance counters. - -dump_config Optional. Path to XML/YAML/JSON file to dump IE parameters, which were set by application. - -load_config Optional. Path to XML/YAML/JSON file to load custom IE parameters. Please note, command line parameters have higher priority then parameters from configuration file. + -report_type "" Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request. + -report_folder Optional. Path to a folder where statistics report is stored. + -exec_graph_path Optional. Path to a file where to store executable graph information serialized. + -pc Optional. Report performance counters. + -dump_config Optional. Path to XML/YAML/JSON file to dump IE parameters, which were set by application. + -load_config Optional. Path to XML/YAML/JSON file to load custom IE parameters. Please note, command line parameters have higher priority then parameters from configuration file. ``` Running the application with the empty list of options yields the usage message given above and an error message. diff --git a/inference-engine/samples/benchmark_app/benchmark_app.hpp b/inference-engine/samples/benchmark_app/benchmark_app.hpp index bfe75ccf7b6392..66f9d0b2224161 100644 --- a/inference-engine/samples/benchmark_app/benchmark_app.hpp +++ b/inference-engine/samples/benchmark_app/benchmark_app.hpp @@ -51,7 +51,9 @@ static const char infer_num_streams_message[] = "Optional. Number of streams to "while for the best-latency estimations the number of streams should be set to 1."; /// @brief message for enforcing of BF16 execution where it is possible -static const char enforce_bf16_message[] = "Optional. Enforcing of floating point operations execution in bfloat16 precision where it is acceptable."; +static const char enforce_bf16_message[] = "Optional. 
By default floating point operations execution in bfloat16 precision are enforced if supported by platform.\n" + " 'true' - enable bfloat16 regardless of platform support\n" + " 'false' - disable bfloat16 regardless of platform support"; /// @brief message for user library argument static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations."; @@ -249,7 +251,7 @@ static void showUsage() { std::cout << std::endl << " device-specific performance options:" << std::endl; std::cout << " -nstreams \"\" " << infer_num_streams_message << std::endl; std::cout << " -nthreads \"\" " << infer_num_threads_message << std::endl; - std::cout << " -enforcebf16 " << enforce_bf16_message << std::endl; + std::cout << " -enforcebf16= " << enforce_bf16_message << std::endl; std::cout << " -pin \"YES\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl; std::cout << std::endl << " Statistics dumping options:" << std::endl; std::cout << " -report_type \"\" " << report_type_message << std::endl; diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp index 4910d5949984ca..ed2153c2bd9ba0 100644 --- a/inference-engine/samples/benchmark_app/main.cpp +++ b/inference-engine/samples/benchmark_app/main.cpp @@ -378,18 +378,21 @@ int main(int argc, char *argv[]) { topology_name = cnnNetwork.getName(); slog::info << (FLAGS_b != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize << slog::endl; - // ----------------- 6. Configuring input ---------------------------------------------------------------------- + // ----------------- 6. Configuring inputs and outputs ---------------------------------------------------------------------- next_step(); - for (auto& item : inputInfo) { - if (app_inputs_info.at(item.first).isImage()) { - /** Set the precision of input data provided by the user, should be called before load of the network to the device **/ + processPrecision(cnnNetwork, FLAGS_ip, FLAGS_op, FLAGS_iop); + for (auto& item : cnnNetwork.getInputsInfo()) { + // if precision for input set by user, then set it to app_inputs + // if it an image, set U8 + if (!FLAGS_ip.empty() || FLAGS_iop.find(item.first) != std::string::npos) { + app_inputs_info.at(item.first).precision = item.second->getPrecision(); + } else if (app_inputs_info.at(item.first).isImage()) { app_inputs_info.at(item.first).precision = Precision::U8; item.second->setPrecision(app_inputs_info.at(item.first).precision); } } - processPrecision(cnnNetwork, FLAGS_ip, FLAGS_op, FLAGS_iop); printInputAndOutputsInfo(cnnNetwork); // ----------------- 7. 
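The reordered logic above gives an explicitly requested precision (-ip/-iop) priority over the image-input U8 default. A Python sketch of the same precedence, with placeholder model paths and a 4-D-shape heuristic standing in for `isImage()`:

```python
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network(model="model.xml", weights="model.bin")  # placeholder paths

user_precisions = {"data": "FP16"}            # e.g. parsed from an -ip/-iop style option
for name, info in net.input_info.items():
    if name in user_precisions:               # an explicit user choice wins
        info.precision = user_precisions[name]
    elif len(info.input_data.shape) == 4:     # image-like input (simplified heuristic)
        info.precision = "U8"
```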
Loading the model to the device -------------------------------------------------------- diff --git a/inference-engine/samples/build_samples.sh b/inference-engine/samples/build_samples.sh index 91354e3913d6eb..d584a11011985d 100755 --- a/inference-engine/samples/build_samples.sh +++ b/inference-engine/samples/build_samples.sh @@ -14,7 +14,7 @@ error() { } trap 'error ${LINENO}' ERR -SAMPLES_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +SAMPLES_PATH="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" printf "\nSetting environment variables for building samples...\n" diff --git a/inference-engine/samples/classification_sample_async/classification_sample_async.h b/inference-engine/samples/classification_sample_async/classification_sample_async.h index 6696ab85c53ca6..07e6895d8ea6c6 100644 --- a/inference-engine/samples/classification_sample_async/classification_sample_async.h +++ b/inference-engine/samples/classification_sample_async/classification_sample_async.h @@ -48,6 +48,7 @@ DEFINE_string(i, "", image_message); /// @brief Define parameter for set model file
/// It is a required parameter DEFINE_string(m, "", model_message); +DEFINE_string(m2, "", model_message); /// @brief device the target device to infer on
DEFINE_string(d, "CPU", target_device_message); diff --git a/inference-engine/samples/speech_sample/main.cpp b/inference-engine/samples/speech_sample/main.cpp index 11a90ad09700c3..133a8ad3a9955d 100644 --- a/inference-engine/samples/speech_sample/main.cpp +++ b/inference-engine/samples/speech_sample/main.cpp @@ -263,11 +263,6 @@ float StdDevError(score_error_t error) { - (error.sumError / error.numScores) * (error.sumError / error.numScores))); } -float StdDevRelError(score_error_t error) { - return (sqrt(error.sumSquaredRelError / error.numScores - - (error.sumRelError / error.numScores) * (error.sumRelError / error.numScores))); -} - #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64) #ifdef _WIN32 #include @@ -579,23 +574,24 @@ int main(int argc, char *argv[]) { // --------------------------- 1. Load inference engine ------------------------------------- slog::info << "Loading Inference Engine" << slog::endl; Core ie; + CNNNetwork network; + ExecutableNetwork executableNet; /** Printing device version **/ slog::info << "Device info: " << slog::endl; std::cout << ie.GetVersions(deviceStr) << std::endl; // ----------------------------------------------------------------------------------------------------- - // 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format + // --------------------------- 2. Read a model in OpenVINO Intermediate Representation (.xml and .bin files) or ONNX (.onnx file) format slog::info << "Loading network files" << slog::endl; - CNNNetwork network; if (!FLAGS_m.empty()) { /** Read network model **/ network = ie.ReadNetwork(FLAGS_m); CheckNumberOfInputs(network.getInputsInfo().size(), numInputArkFiles); // ------------------------------------------------------------------------------------------------- - // --------------------------- 3. Set batch size --------------------------------------------------- + // --------------------------- Set batch size --------------------------------------------------- /** Set batch size. Unlike in imaging, batching in time (rather than space) is done for speech recognition. **/ network.setBatchSize(batchSize); slog::info << "Batch size is " << std::to_string(network.getBatchSize()) @@ -604,7 +600,7 @@ int main(int argc, char *argv[]) { // ----------------------------------------------------------------------------------------------------- - // --------------------------- 4. Set parameters and scale factors ------------------------------------- + // --------------------------- Set parameters and scale factors ------------------------------------- /** Setting parameter for per layer metrics **/ std::map gnaPluginConfig; std::map genericPluginConfig; @@ -678,7 +674,7 @@ int main(int argc, char *argv[]) { gnaPluginConfig[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(FLAGS_pwl_me); // ----------------------------------------------------------------------------------------------------- - // --------------------------- 5. 
Write model to file -------------------------------------------------- + // --------------------------- Write model to file -------------------------------------------------- // Embedded GNA model dumping (for Intel(R) Speech Enabling Developer Kit) if (!FLAGS_we.empty()) { gnaPluginConfig[GNAConfigParams::KEY_GNA_FIRMWARE_MODEL_IMAGE] = FLAGS_we; @@ -686,14 +682,13 @@ int main(int argc, char *argv[]) { } // ----------------------------------------------------------------------------------------------------- - // --------------------------- 6. Loading model to the device ------------------------------------------ + // --------------------------- 3. Loading model to the device ------------------------------------------ if (useGna) { genericPluginConfig.insert(std::begin(gnaPluginConfig), std::end(gnaPluginConfig)); } auto t0 = Time::now(); std::vector outputs; - ExecutableNetwork executableNet; if (!FLAGS_oname.empty()) { std::vector output_names = ParseBlobName(FLAGS_oname); @@ -726,7 +721,7 @@ int main(int argc, char *argv[]) { ms loadTime = std::chrono::duration_cast(Time::now() - t0); slog::info << "Model loading time " << loadTime.count() << " ms" << slog::endl; - // --------------------------- 7. Exporting gna model using InferenceEngine AOT API--------------------- + // --------------------------- Exporting gna model using InferenceEngine AOT API--------------------- if (!FLAGS_wg.empty()) { slog::info << "Writing GNA Model to file " << FLAGS_wg << slog::endl; t0 = Time::now(); @@ -744,13 +739,17 @@ int main(int argc, char *argv[]) { return 0; } + + // --------------------------- 4. Create infer request -------------------------------------------------- std::vector inferRequests((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads); for (auto& inferRequest : inferRequests) { inferRequest = {executableNet.CreateInferRequest(), -1, batchSize}; } - // ----------------------------------------------------------------------------------------------------- + // --------------------------------------------------------------------------------------------------------- - // --------------------------- 8. Prepare input blobs -------------------------------------------------- + // --------------------------- 5. Configure input & output -------------------------------------------------- + + //--- Prepare input blobs ---------------------------------------------- /** Taking information about all topology inputs **/ ConstInputsDataMap cInputInfo = executableNet.GetInputsInfo(); CheckNumberOfInputs(cInputInfo.size(), numInputArkFiles); @@ -788,9 +787,9 @@ int main(int argc, char *argv[]) { item.second->setPrecision(inputPrecision); } - // ----------------------------------------------------------------------------------------------------- + // --------------------------------------------------------------------- - // --------------------------- 9. 
Prepare output blobs ------------------------------------------------- + //--- Prepare output blobs --------------------------------------------- ConstOutputsDataMap cOutputInfo(executableNet.GetOutputsInfo()); OutputsDataMap outputInfo; if (!FLAGS_m.empty()) { @@ -821,9 +820,10 @@ int main(int argc, char *argv[]) { Precision outputPrecision = Precision::FP32; // specify Precision::I32 to retrieve quantized outputs outData->setPrecision(outputPrecision); } + // --------------------------------------------------------------------- // ----------------------------------------------------------------------------------------------------- - // --------------------------- 10. Do inference -------------------------------------------------------- + // --------------------------- 6. Do inference -------------------------------------------------------- std::vector output_name_files; std::vector reference_name_files; size_t count_file = 1; @@ -854,6 +854,7 @@ int main(int argc, char *argv[]) { state.Reset(); } + /** Work with each utterance **/ for (uint32_t utteranceIndex = 0; utteranceIndex < numUtterances; ++utteranceIndex) { std::map utterancePerfMap; std::string uttName; @@ -867,6 +868,7 @@ int main(int argc, char *argv[]) { slog::info << "Number scores per frame : " << numScoresPerFrame << slog::endl; + /** Get information from ark file for current utterance **/ numFrameElementsInput.resize(numInputArkFiles); for (size_t i = 0; i < inputArkFiles.size(); i++) { std::vector ptrUtterance; @@ -905,6 +907,7 @@ int main(int argc, char *argv[]) { ptrScores.resize(numFrames * numScoresPerFrame * sizeof(float)); if (!FLAGS_r.empty()) { + /** Read ark file with reference scores **/ std::string refUtteranceName; GetKaldiArkInfo(reference_name_files[next_output].c_str(), utteranceIndex, &n, &numBytesReferenceScoreThisUtterance); ptrReferenceScores.resize(numBytesReferenceScoreThisUtterance); @@ -950,6 +953,7 @@ int main(int argc, char *argv[]) { } bool inferRequestFetched = false; + /** Start inference loop **/ for (auto &inferRequest : inferRequests) { if (frameIndex == numFrames) { numFramesThisBatch = 1; @@ -969,6 +973,7 @@ int main(int argc, char *argv[]) { ConstOutputsDataMap newOutputInfo; if (inferRequest.frameIndex >= 0) { if (!FLAGS_o.empty()) { + /* Prepare output data for save to file in future */ outputFrame = &ptrScores.front() + numScoresPerFrame * sizeof(float) * (inferRequest.frameIndex); @@ -993,6 +998,7 @@ int main(int argc, char *argv[]) { byteSize); } if (!FLAGS_r.empty()) { + /** Compare output data with reference scores **/ if (!outputs.empty()) { newOutputInfo[outputs[next_output]] = cOutputInfo[outputs[next_output]]; } else { @@ -1029,6 +1035,7 @@ int main(int argc, char *argv[]) { continue; } + /** Prepare input blobs**/ ptrInputBlobs.clear(); if (FLAGS_iname.empty()) { for (auto &input : cInputInfo) { @@ -1063,6 +1070,7 @@ int main(int argc, char *argv[]) { } int index = static_cast(frameIndex) - (FLAGS_cw_l + FLAGS_cw_r); + /** Start inference **/ inferRequest.inferRequest.StartAsync(); inferRequest.frameIndex = index < 0 ? -2 : index; inferRequest.numFramesThisBatch = numFramesThisBatch; @@ -1086,6 +1094,7 @@ int main(int argc, char *argv[]) { } inferRequestFetched |= true; } + /** Inference was finished for current frame **/ if (!inferRequestFetched) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); continue; @@ -1103,6 +1112,7 @@ int main(int argc, char *argv[]) { } if (!FLAGS_o.empty()) { + /* Save output data to file */ bool shouldAppend = (utteranceIndex == 0) ? 
false : true; SaveKaldiArkArray(output_name_files[next_output].c_str(), shouldAppend, uttName, &ptrScores.front(), numFramesArkFile, numScoresPerFrame); diff --git a/inference-engine/src/cldnn_engine/CMakeLists.txt b/inference-engine/src/cldnn_engine/CMakeLists.txt index c7ac932910bd07..1ba2bc9e98e277 100644 --- a/inference-engine/src/cldnn_engine/CMakeLists.txt +++ b/inference-engine/src/cldnn_engine/CMakeLists.txt @@ -40,6 +40,8 @@ target_include_directories(${TARGET_NAME} PRIVATE set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) + +set_ie_threading_interface_for(clDNN_lib) # Failed because of OpenCL # ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME}) diff --git a/inference-engine/src/cldnn_engine/cldnn_config.cpp b/inference-engine/src/cldnn_engine/cldnn_config.cpp index eb7359c2625b32..c25ef88d122a31 100644 --- a/inference-engine/src/cldnn_engine/cldnn_config.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_config.cpp @@ -11,6 +11,7 @@ #include "ie_api.h" #include "file_utils.h" #include "cldnn_itt.h" +#include #ifdef _WIN32 # include @@ -221,6 +222,20 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val; } + } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) { + int max_threads = std::max(1, static_cast(std::thread::hardware_concurrency())); + try { + int val_i = std::stoi(val); + if (val_i <= 0 || val_i > max_threads) { + n_threads = max_threads; + } else { + n_threads = val_i; + } + } catch (const std::exception&) { + IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val + << "\nSpecify the number of threads use for build as an integer." 
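Taken together with the new KEY_CLDNN_MAX_NUM_THREADS declaration earlier in this patch, the handler above clamps the requested thread count to [1, hardware_concurrency] and falls back to the maximum for out-of-range values. A sketch of the clamping rule and of setting the key; the literal key string is assumed from the DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS) naming convention:

```python
import os
from openvino.inference_engine import IECore

def resolve_gpu_build_threads(val: str) -> int:
    # Mirrors the clamping above: non-positive or too-large values fall back to
    # the machine's maximum concurrency; a non-integer value raises, as IE_THROW does.
    max_threads = max(1, os.cpu_count() or 1)
    n = int(val)
    return max_threads if n <= 0 or n > max_threads else n

ie = IECore()
ie.set_config({"CLDNN_MAX_NUM_THREADS": "4"}, "GPU")   # key string assumed; limits model-load threads
```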
+ << "\nOut of range value will be set as a default value, maximum concurrent threads."; + } } else { IE_THROW(NotFound) << "Unsupported property key by plugin: " << key; } @@ -306,5 +321,6 @@ void Config::adjustKeyMapValues() { key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams); key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id; key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = ""; + key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads); } } // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_config.h b/inference-engine/src/cldnn_engine/cldnn_config.h index fed2617df8ca5d..756f324cf99739 100644 --- a/inference-engine/src/cldnn_engine/cldnn_config.h +++ b/inference-engine/src/cldnn_engine/cldnn_config.h @@ -31,7 +31,8 @@ struct Config { graph_dumps_dir(""), sources_dumps_dir(""), device_id(""), - kernels_cache_dir("") { + kernels_cache_dir(""), + n_threads(std::max(static_cast(1), std::thread::hardware_concurrency())) { adjustKeyMapValues(); } @@ -56,6 +57,7 @@ struct Config { std::string sources_dumps_dir; std::string device_id; std::string kernels_cache_dir; + size_t n_threads; std::map key_config_map; }; diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 41aabc518e62ba..01ea25f87eead8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -32,6 +32,7 @@ #include #include #include "transformations/common_optimizations/convert_quantize_dequantize.hpp" +#include "transformations/common_optimizations/softmax_fusion.hpp" #include #include #include @@ -323,6 +324,11 @@ InferenceEngine::CNNNetwork clDNNEngine::CloneAndTransformNetwork(const Inferenc return false; }); + pass_config->set_callback( + [](const_node_ptr &node) -> bool { + return node->input_value(0).get_partial_shape().rank().get_length() > 5; + }); + // List of enabled/disabled transformations pass_config->disable(); pass_config->disable(); @@ -488,7 +494,8 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn context_config.tuningConfig.mode == current_config.tuningConfig.mode && context_config.tuningConfig.cache_file_path == current_config.tuningConfig.cache_file_path && context_config.kernels_cache_dir == current_config.kernels_cache_dir && - context_config.device_id == current_config.device_id; + context_config.device_id == current_config.device_id && + context_config.n_threads == current_config.n_threads; }; { diff --git a/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp b/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp index 54ec6f5eb798f6..95dc67da5d5798 100644 --- a/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp +++ b/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp @@ -193,10 +193,10 @@ REGISTER_FACTORY(v5, LogSoftmax); REGISTER_FACTORY(v5, LSTMSequence); //REGISTER_FACTORY(v5, NonMaxSuppression); Supported via v5 -> v5 internal conversion REGISTER_FACTORY(v5, Round); +REGISTER_FACTORY(v5, GatherND); // ----------------------------- Unsupported v5 ops ----------------------------- // // REGISTER_FACTORY(v5, BatchNormInference); -// REGISTER_FACTORY(v5, GatherND); // REGISTER_FACTORY(v5, GRUSequence); // REGISTER_FACTORY(v5, Loop); // REGISTER_FACTORY(v5, RNNSequence); diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp 
b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp index e73f9d7451cfe7..f03db1c4834e0d 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp @@ -267,7 +267,8 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr& op) { + p.ValidateInputs(op, {2}); + auto inputPrimitives = p.GetInputPrimitiveIDs(op); + std::string layerName = layer_type_name_ID(op); + + int32_t indices_rank = static_cast(op->get_input_shape(1).size()); + + auto batch_dims = op->get_batch_dims(); + + auto primitive = cldnn::gather_nd(layerName, + inputPrimitives[0], + inputPrimitives[1], + indices_rank, + batch_dims); + + p.AddPrimitive(primitive); + p.AddPrimitiveToProfiler(op); +} + +REGISTER_FACTORY_IMPL(v5, GatherND); + +} // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/ops/interpolate.cpp b/inference-engine/src/cldnn_engine/ops/interpolate.cpp index 4212459d5798df..f9241b8ca0fd75 100644 --- a/inference-engine/src/cldnn_engine/ops/interpolate.cpp +++ b/inference-engine/src/cldnn_engine/ops/interpolate.cpp @@ -124,14 +124,7 @@ void CreateInterpolateOp(Program& p, const std::shared_ptrv4 Interpolate converison - // This WA must be removed as soon as optimized kernel supports linear mode - auto input_shape_rank = op->get_input_shape(0).size(); auto mode = attrs.mode; - if (mode == ngraph::op::v4::Interpolate::InterpolateMode::linear && input_shape_rank < 5) { - mode = ngraph::op::v4::Interpolate::InterpolateMode::linear_onnx; - } - auto cldnnSampleType = GetResampleType(mode); auto shapeCalcMode = GetShapeCalculationMode(attrs.shape_calculation_mode); auto coordTransMode = GetCoordinateTransformationMode(attrs.coordinate_transformation_mode); diff --git a/inference-engine/src/cldnn_engine/ops/rnn.cpp b/inference-engine/src/cldnn_engine/ops/rnn.cpp index b7f3b3f842901d..5801ef91549d67 100644 --- a/inference-engine/src/cldnn_engine/ops/rnn.cpp +++ b/inference-engine/src/cldnn_engine/ops/rnn.cpp @@ -153,11 +153,19 @@ void CreateLSTMCellOp(Program& p, const std::shared_ptrget_friendly_name(), op); p.AddInnerPrimitiveToProfiler(lstm_elt_id, op->get_friendly_name(), op); + cldnn::tensor outSz = cldnn::tensor{ lstm_batch_size, lstm_hidden_size, 1, 1 }; + cldnn::primitive_id outputHiddenCropID = layerName + "_hc"; cldnn::primitive_id outputHiddenID = layerName + ".0"; - p.AddPrimitive(cldnn::crop(outputHiddenID, lstm_elt_id, hiddenSz, cldnn::tensor{0, 0, 0, 0})); + p.AddPrimitive(cldnn::crop(outputHiddenCropID, lstm_elt_id, hiddenSz, cldnn::tensor{0, 0, 0, 0})); + p.AddInnerPrimitiveToProfiler(outputHiddenCropID, op->get_friendly_name(), op); + p.AddPrimitive(cldnn::reshape(outputHiddenID, outputHiddenCropID, outSz)); p.AddInnerPrimitiveToProfiler(outputHiddenID, op->get_friendly_name(), op); + + cldnn::primitive_id outputCellCropID = layerName + "_cc"; cldnn::primitive_id outputCellID = layerName + ".1"; - p.AddPrimitive(cldnn::crop(outputCellID, lstm_elt_id, hiddenSz, cellCropSz)); + p.AddPrimitive(cldnn::crop(outputCellCropID, lstm_elt_id, hiddenSz, cellCropSz)); + p.AddInnerPrimitiveToProfiler(outputCellCropID, op->get_friendly_name(), op); + p.AddPrimitive(cldnn::reshape(outputCellID, outputHiddenCropID, outSz)); p.AddInnerPrimitiveToProfiler(outputCellID, op->get_friendly_name(), op); // output primitive IDs diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp index e3b0f417ff0f3c..6257d8da47d7c9 100644 --- 
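The new clDNN factory only forwards the indices rank and batch_dims to the gather_nd primitive; as a reminder of the operation itself, a minimal NumPy sketch of GatherND semantics for the batch_dims == 0 case:

```python
import numpy as np

def gather_nd(data: np.ndarray, indices: np.ndarray) -> np.ndarray:
    # batch_dims == 0 only: every row of `indices` addresses one slice of `data`.
    out_shape = indices.shape[:-1] + data.shape[indices.shape[-1]:]
    flat = indices.reshape(-1, indices.shape[-1])
    gathered = np.stack([data[tuple(idx)] for idx in flat])
    return gathered.reshape(out_shape)

data = np.arange(12).reshape(3, 4)
print(gather_nd(data, np.array([[0, 1], [2, 3]])))   # -> [ 1 11]
```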
a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp @@ -824,20 +824,38 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out); if (num_bytes_per_weight == 1) { - int8_t *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); - gna_compound_bias_t *ptr_bias = reinterpret_cast(component[i].op.affine.ptr_biases); + if (num_bytes_per_bias != 1) { + int8_t* ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); + gna_compound_bias_t* ptr_bias = reinterpret_cast(component[i].op.affine.ptr_biases); #ifdef DUMP_WB - for (uint32_t row = 0; row < num_weight_rows; row++) { - for (uint32_t col = 0; col < num_weight_columns; col++) { - if (logging_precision == kDnnFloat) { - float val = - static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (logging_precision == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier / weight_scale_factor; - out_wfile << std::setprecision(4) << val << " "; - } else { - out_wfile << int((int8_t) ptr_weight[row * num_weight_columns + col]) << " "; + out_wfile << std::setprecision(4) << val << " "; + } else { + out_wfile << int((int8_t)ptr_weight[row * num_weight_columns + col]) << " "; + } + out_wfile << "\n"; + } + } +#endif + } else { + int8_t* ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (logging_precision == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) / weight_scale_factor; + out_wfile << std::setprecision(4) << val << " "; + } else { + out_wfile << int((int8_t)ptr_weight[row * num_weight_columns + col]) << " "; + } + out_wfile << "\n"; } - out_wfile << "\n"; } } #endif @@ -873,18 +891,31 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ } if (compute_precision_ == kDnnInt) { if (num_bytes_per_weight == 1) { - gna_compound_bias_t - *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); + if (num_bytes_per_bias != 1) { + gna_compound_bias_t + * ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); #ifdef DUMP_WB - for (uint32_t row = 0; row < num_rows_out; row++) { - if (logging_precision == kDnnInt) { - out_bfile << std::setw(8) << ptr_biases[row].bias << ", "; - out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n"; - } else { - out_bfile << std::setw(8) << ptr_biases[row].bias / output_scale_factor << "\n"; + for (uint32_t row = 0; row < num_rows_out; row++) { + if (logging_precision == kDnnInt) { + out_bfile << std::setw(8) << ptr_biases[row].bias << ", "; + out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n"; + } else { + out_bfile << std::setw(8) << ptr_biases[row].bias / output_scale_factor << "\n"; + } } - } #endif + } else { + int8_t *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_rows_out; row++) { + if (logging_precision == kDnnInt) { + out_bfile << std::setw(8) << ptr_biases[row] << "\n"; + } else { + out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n"; + } + } +#endif + 
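The two weight-dump branches above differ only in whether a per-row compound-bias multiplier takes part in reconstructing the float weight; condensed into one line for illustration:

```python
def dequantized_weight(w_int8: int, weight_scale_factor: float, multiplier: int = 1) -> float:
    # 8-bit weights with compound biases carry a per-output-row multiplier;
    # with plain 8-bit biases (the new branch) the multiplier is effectively 1.
    return float(w_int8) * multiplier / weight_scale_factor
```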
} } else { int32_t *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); #ifdef DUMP_WB @@ -2102,9 +2133,12 @@ void GNAPluginNS::backend::AMIntelDNN::WriteInputAndOutputText() { } else { floatValue = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; } - } else { + } else if (component[i].num_bytes_per_output == 2) { auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; floatValue = static_cast(value); + } else { + auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out + j]; + floatValue = static_cast(value); } floatValue /= component[i].output_scale_factor; out_file << std::setw(8) << floatValue << "\n"; @@ -2142,10 +2176,14 @@ void GNAPluginNS::backend::AMIntelDNN::WriteInputAndOutputText() { } else { floatValue = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + j]; } - } else { + } else if (component[i].num_bytes_per_input == 2) { auto value = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in+ j]; floatValue = static_cast(value); + } else { + auto value = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + j]; + floatValue = static_cast(value); } + in_file << std::setw(8) << floatValue / input_scale_factor << "\n"; } } diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp index f293b7110cfc47..975397362839d9 100644 --- a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp +++ b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp @@ -13,6 +13,8 @@ constexpr uint32_t convMinFiltersNum = 4; constexpr uint32_t convMaxFiltersNum = 65532; constexpr uint32_t convFiltersNumDivider = 4; constexpr uint32_t convEachKernelByteAlignment = 16; +constexpr uint32_t noOfInputsDivisor = 8; +constexpr uint32_t noOfInputsLowPrecDivisor = 16; } } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/backend/make_pwl.cpp b/inference-engine/src/gna_plugin/backend/make_pwl.cpp index c63ab7a314526b..6190a89540fbec 100644 --- a/inference-engine/src/gna_plugin/backend/make_pwl.cpp +++ b/inference-engine/src/gna_plugin/backend/make_pwl.cpp @@ -18,6 +18,7 @@ void make_gna_pwl(const DnnActivation fun, const double u_bound, const double in_scale, const double out_scale, + const bool low_precision, std::vector &gna_pwl) { pwl_gna_slope_scale_t s; uint32_t pwl_size = static_cast(pwl.size()); @@ -230,7 +231,7 @@ void make_gna_pwl(const DnnActivation fun, gnalog() << "=========================== LeakyReLU Segments ======================\n"; int32_t x_lower = INT32_MIN; int32_t x_upper = INT32_MAX; - int16_t y_lower = INT16_MIN; + int16_t y_lower = low_precision ? 
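gna_limitations.hpp gains two divisor constants for the new low-precision input path. Assuming they act as alignment divisors for the number of input elements (not confirmed by this hunk alone), the implied padding would be:

```python
def align_inputs(num_inputs: int, low_precision: bool) -> int:
    # noOfInputsDivisor = 8 for 16-bit inputs, noOfInputsLowPrecDivisor = 16 for 8-bit inputs
    divisor = 16 if low_precision else 8
    return ((num_inputs + divisor - 1) // divisor) * divisor
```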
INT8_MIN : INT16_MIN; int16_t y_upper = INT16_MAX; if (fun.fqParams.set) { x_lower = FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * in_scale); diff --git a/inference-engine/src/gna_plugin/backend/make_pwl.hpp b/inference-engine/src/gna_plugin/backend/make_pwl.hpp index eef981034ed2ce..62d95210906d18 100644 --- a/inference-engine/src/gna_plugin/backend/make_pwl.hpp +++ b/inference-engine/src/gna_plugin/backend/make_pwl.hpp @@ -15,4 +15,5 @@ void make_gna_pwl(const DnnActivation fun, const double u_bound, const double in_scale, const double out_scale, + const bool low_precision, std::vector &gna_pwl); diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp index 1669fe050fc079..e55e36a5f1a657 100644 --- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp +++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp @@ -18,5 +18,6 @@ struct GNAFlags { bool sw_fp32 = false; bool fake_quantized = false; bool performance_counting = false; + bool input_low_precision = false; }; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp b/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp index 3fc2d49afb6df7..8095d9cf4ddd3e 100644 --- a/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp +++ b/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp @@ -18,7 +18,11 @@ size_t InputDesc::minBytesRequiredForStoreInput(CNNLayerPtr layer) { auto quantized = getInjectedData(layer); size_t precision_bytes; if (quantized) { - precision_bytes = 2; + if (quantized->lowPrecision) { + precision_bytes = 1; + } else { + precision_bytes = 2; + } } else { precision_bytes = 4; } diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp index ae0edf28013240..ac1c6bdf47a51a 100644 --- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp @@ -25,6 +25,7 @@ namespace frontend { /** * @brief description of quantisation precision * @tparam Ip - input precision + * @tparam Op - output precision * @tparam Wp - weights precision * @tparam Bp - biases precision * @tparam Np - network precision - can be auto generated in future @@ -82,6 +83,12 @@ struct QuantI8 : public QuantDescTmpl { + QuantI8_I8() { + _Np = InferenceEngine::Precision::MIXED; + } +}; // for support proper trait instantiation for quantization function callback struct FakeQuantI16 : public QuantI16 {}; @@ -155,6 +162,17 @@ class Quant { } }; +template<> +class Quant { +public: + template + void operator()(Args && ... args) const { + QuantizationCallback { + std::forward(args)... 
+ }.runQuantize(); + } +}; + template<> class Quant { public: @@ -393,13 +411,12 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc, << "cannot copy weights for layer :"<< conv->name << " of size" << intWeights->byteSize(); } - auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) { + auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) -> size_t { if (wl->_biases) { return wl->_biases->size(); } - // calculating biases len using outdata dims - auto & dims = wl->outData.front()->getDims(); - return dims[1]; + // calculating biases len using outdata dims: biases number should be equal to output channels number + return InferenceEngine::GetDataDimSize(wl->outData.front(), InferenceEngine::DataDimName::C); }; using BiasesPrecision = typename QuantDesc::BiasesPrecision; @@ -651,8 +668,8 @@ template class DataQuantizer : public DataQuantizerBase { public: explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} - bool operator()(InferenceEngine::WeightableLayer *wl) const { - quantizeWeightsBiasesConv(Desc::optional(), wl, Quant()); + bool operator()(InferenceEngine::ConvolutionLayer *cl) const { + quantizeWeightsBiasesConv(Desc::optional(), cl, Quant()); return true; } }; @@ -661,8 +678,8 @@ template class DataQuantizer : public DataQuantizerBase { public: explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} - bool operator()(InferenceEngine::ScaleShiftLayer *wl) const { - quantizeWeightsBiases(Desc::optional(), wl, Quant(), true); + bool operator()(InferenceEngine::ScaleShiftLayer *ssl) const { + quantizeWeightsBiases(Desc::optional(), ssl, Quant(), true); return true; } }; @@ -681,6 +698,7 @@ class LayersQuantizer : public frontend::DataQuantizerBase { using QuantI16 = frontend::QuantPair; using QuantI8 = frontend::QuantPair; +using QuantI8_I8 = frontend::QuantPair; using FakeQuantI16 = frontend::QuantPair; diff --git a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp index 46b000e35df2ba..1f3f125a029172 100644 --- a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp @@ -26,7 +26,7 @@ template class ModelQuantizer { public: InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, float scaleFactor) const { - return quantize(model, [](const InferenceEngine::CNNNetwork &, bool runBeforeCopy){}, std::vector({scaleFactor})); + return quantize(model, [](const InferenceEngine::CNNNetwork &, bool runBeforeCopy, bool lowPrecision){}, std::vector({scaleFactor})); } template @@ -35,7 +35,7 @@ class ModelQuantizer { } InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, std::vector scaleFactor) const { - return quantize(model, [](InferenceEngine::CNNNetwork &, bool runBeforeCopy){}, scaleFactor); + return quantize(model, [](InferenceEngine::CNNNetwork &, bool runBeforeCopy, bool lowPrecision){}, scaleFactor); } template @@ -45,14 +45,15 @@ class ModelQuantizer { transformLayer(newLayer, WeightsConverter()); return newLayer; }; + bool lowPrecision = (T::mandatory().getInputPrecision().size() == sizeof(uint8_t)); InferenceEngine::CNNNetwork copiedNet = InferenceEngine::CNNNetCopy(model); - cb(copiedNet, true); + cb(copiedNet, true, lowPrecision); copiedNet = InferenceEngine::CNNNetCopy(copiedNet, visitor); // allow client code to access copied topology, to avoid copies if user would like to chain quantisation with // 
another preprocessing - cb(copiedNet, false); + cb(copiedNet, false, lowPrecision); if (scaleFactor.empty()) { THROW_GNA_EXCEPTION << "Scale factor is empty"; @@ -62,6 +63,8 @@ class ModelQuantizer { auto sortedNewNet = InferenceEngine::details::CNNNetSortTopologically(copiedNet); gnalog() << "Sorted layers: " << std::endl; for (auto &&layer : sortedNewNet) { + auto quantData = InferenceEngine::getInjectedData(layer); + quantData->lowPrecision = lowPrecision; gnalog() << layer->name << std::endl; } /// filling scale factors for input layers, memory layers will have scaleFactor of 1.0 by default @@ -79,7 +82,8 @@ class ModelQuantizer { } bool isFakeQuantize = std::is_same() || std::is_same(); - propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), isFakeQuantize); + propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), T::optional().getWeightsPrecision().size(), + T::mandatory().getInputPrecision().size(), isFakeQuantize); // sorted order gives possibility for propagate quantisation along depended layers for (auto &&layer : sortedNewNet) { @@ -90,8 +94,9 @@ class ModelQuantizer { } private : - void propagateScaleFactor(std::vector & net, int weightsBytesSize, bool fakeQuantize) const { - ScaleFactorCalculator sf(net, weightsBytesSize, fakeQuantize); + void propagateScaleFactor(std::vector & net, int mandWeightsBytesSize, + int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) const { + ScaleFactorCalculator sf(net, mandWeightsBytesSize, optWeightsBytesSize, inputsBytesSize, fakeQuantize); while (!sf.allLayersProcessed()) { for (auto &&layer : sf.getStartLayers()) { diff --git a/inference-engine/src/gna_plugin/frontend/quantization.cpp b/inference-engine/src/gna_plugin/frontend/quantization.cpp index df060354f09edb..69dcc1ccb586d8 100644 --- a/inference-engine/src/gna_plugin/frontend/quantization.cpp +++ b/inference-engine/src/gna_plugin/frontend/quantization.cpp @@ -358,7 +358,6 @@ void QuantizationCallback::runQuantize() const { int8_t *ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; - value = ptr_float_weights[row * num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value; if (value > 127.0) { *ptr_weight_8 = 127; @@ -404,3 +403,57 @@ void QuantizationCallback::runQuantize() const { QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows); } } + +template<> +void QuantizationCallback::runQuantize() const { + uint32_t num_saturate = 0; + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 
0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value; + int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); + if (value > 127.0) { + *ptr_weight_8 = 127; + num_saturate++; + } else if (value < -128.0) { + *ptr_weight_8 = -128; + num_saturate++; + } else { + *ptr_weight_8 = (int8_t)value; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_8 = 0; + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_8 = 0; + } + } + + if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) { + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 127.0) { + ptr_int_biases[j] = 127; + num_saturate++; + } else if (value < -128.0) { + ptr_int_biases[j] = -128; + num_saturate++; + } else { + ptr_int_biases[j] = (int8_t)value; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + ptr_int_biases[j] = 0; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8_8()\n", num_saturate, num_rows * num_columns + num_rows); + } +} diff --git a/inference-engine/src/gna_plugin/frontend/quantization.h b/inference-engine/src/gna_plugin/frontend/quantization.h index 7817b66da297a5..4aaebebe8f6d68 100644 --- a/inference-engine/src/gna_plugin/frontend/quantization.h +++ b/inference-engine/src/gna_plugin/frontend/quantization.h @@ -13,6 +13,8 @@ #define MAX_OUT_MULTIPLIER 230 #define MAX_VAL_1B_WEIGHT 127 +#define MAX_VAL_1B_FEAT 64 +#define MAX_VAL_1B_BIAS 127 #define MAX_VAL_2B_WEIGHT 16384 #define MAX_VAL_2B_FEAT 16384 #define MAX_VAL_4B_BIAS 1073741824 @@ -45,6 +47,7 @@ struct QuantizationCallback { template class QuantizationCallback; template class QuantizationCallback; +template class QuantizationCallback; std::pair FindMinMaxValues(void* ptr_float_memory, size_t num_elements); float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements); diff --git a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp index 4de70f711e89db..918ac8ee3d3ec2 100644 --- a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp +++ b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp @@ -84,8 +84,8 @@ struct QuantizedLayerParams { // deprecate this Quantization _weights_quant; Quantization _bias_quant; - float _o_shift = 0.0f; - float _b_shift = 0.0f; + + bool lowPrecision = false; }; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp index 80280df403a0f7..a2bfaccc00f54a 100644 --- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp +++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp @@ -17,6 +17,7 @@ #include "gna_plugin_log.hpp" #include "gna_slope_scale.h" #include "runtime/pwl.h" +#include "gna_data_types.hpp" namespace GNAPluginNS { namespace frontend { @@ -181,14 +182,14 @@ template class ScaleFactorPerLayer { public: /** - * @brief calculates weights scale factor for fit 
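// --- Illustrative sketch (not part of the patch) --------------------------------
// Self-contained version of the symmetric int8 quantization performed by the new
// QuantizationCallback<int8_t, int8_t>::runQuantize() specialization above:
// scale, round half away from zero, then saturate to [-128, 127]. Row/column
// padding and the saturation counter are omitted; biases are handled the same
// way in the patch, only with the output scale factor instead of the weight one.
#include <cstddef>
#include <cstdint>

static int8_t quantizeToInt8(float value, float scaleFactor) {
    const float rounding = (value > 0) ? 0.5f : -0.5f;  // round half away from zero
    const float scaled = value * scaleFactor + rounding;
    if (scaled > 127.0f) return 127;                     // saturate to the int8 range
    if (scaled < -128.0f) return -128;
    return static_cast<int8_t>(scaled);
}

static void quantizeWeightsInt8(const float* src, int8_t* dst, size_t count, float scaleFactor) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = quantizeToInt8(src[i], scaleFactor);
    }
}
// ---------------------------------------------------------------------------------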
dynamic range into target bitsize, + * @brief calculates weights scale factor to fit dynamic range into target bitsize, * also calculates output scale factor for the given layer * @param cnnLayer * @param weightsSize * @param result * @return */ - bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(T cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { return false; } }; @@ -197,6 +198,7 @@ template<> class ScaleFactorPerLayer { private : const float activation_scale_factor = 2048.f; + const float low_prec_activation_scale_factor = 4.f; const float identity_scale_factor = 2049.0f; const float max_activation_scale_factor = 4096.0f; const float k = 5; @@ -206,12 +208,13 @@ class ScaleFactorPerLayer { protected : float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer, GNAPluginNS::LayerInfo const& layer, + int inputsSize, const bool fakeQuantize) { auto quantizedParams = InferenceEngine::getInjectedData(*cnnLayer); // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights // set the initial value - float result = activation_scale_factor; + float result = (inputsSize == 2 ? activation_scale_factor : low_prec_activation_scale_factor); if (layer.isIdentity()) { // #define accurate_identity_scale_factor #ifdef accurate_identity_scale_factor @@ -246,11 +249,13 @@ class ScaleFactorPerLayer { result = fabs(scale_extra) > fabs(scale_default) ? identity_scale_factor / 2 : identity_scale_factor; #endif - } else if (layer.isRelu() && - static_cast(activation_scale_factor * quantizedParams->_src_quant.GetScale()) - > std::numeric_limits::max()-1) { + } else if (layer.isRelu()) { // if activation is one from relu family, we need to apply heuristic to avoid activation output overflow - result = (activation_scale_factor * 0.5); + auto limit = (inputsSize == 1 ? std::numeric_limits::max() : std::numeric_limits::max()) - 1; + + if (static_cast(result * quantizedParams->_src_quant.GetScale()) > limit) { + result *= 0.5; + } } else if (layer.isPower()) { auto powerLayer = dynamic_cast(cnnLayer); if (!powerLayer) { @@ -380,7 +385,7 @@ class ScaleFactorPerLayer { (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) { auto prevLayerQuant = InferenceEngine::getInjectedData(*prevLayer); if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) && - (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) { + (prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) { result = prevLayerQuant->_src_quant.GetScale(); usePrevScaleFactor = true; } @@ -411,7 +416,7 @@ class ScaleFactorPerLayer { } public : - bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !cnnLayer ) { IE_THROW() << "Incorrect Convolutional Layer pointer \n"; } @@ -543,7 +548,13 @@ class ScaleFactorPerLayer { } } - auto levels = fakeQuantize ? MAX_VAL_2B_FEAT : std::numeric_limits::max(); + auto levels = 0; + if (fakeQuantize) { + levels = (inputsSize == 2) ? MAX_VAL_2B_FEAT : MAX_VAL_1B_FEAT; + } else { + levels = (inputsSize == 2) ? 
std::numeric_limits::max() : std::numeric_limits::max(); + } + auto abs_val = std::max(std::abs(max_val), std::abs(min_val)); auto scale_val = static_cast(levels) / abs_val; //TODO: use FQ formula for scale factor calculation @@ -591,7 +602,7 @@ class ScaleFactorPerLayer { if (!quant->_dst_quant.IsScaleSet() || fp32eq(quant->_dst_quant.GetScale(), 1.0f) || !fp32eq(quant->_src_quant.GetScale(), inputQuant->_dst_quant.GetScale())) { quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); - auto scale = getActivationScale(cnnLayer, layerInfo, fakeQuantize); + auto scale = getActivationScale(cnnLayer, layerInfo, inputsSize, fakeQuantize); quant->_dst_quant.SetScale(scale); } return true; @@ -612,10 +623,12 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !eltwiseLayer ) { THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n"; } + bool lowPrecision = (inputsSize == sizeof(int8_t)); + auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0); auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1); @@ -640,7 +653,7 @@ class ScaleFactorPerLayer { }) : in0; if (LayerInfo(in0).has32BOutput() || - (LayerInfo(in0).isNonFunctional() && (LayerInfo(eltwiseFunctionalPrev).has32BOutput()))) { + (LayerInfo(in0).isNonFunctional() && LayerInfo(eltwiseFunctionalPrev).has32BOutput())) { std::swap(in0, in1); std::swap(quantParams0, quantParams1); } @@ -653,47 +666,50 @@ class ScaleFactorPerLayer { // this path might result in significant data loss quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale()); auto weightsScale = quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale(); - auto prevLayerIn1 = CNNNetPrevLayer(in1); + // If a previous layer is a layer where freely weights scale factor can be selected, // try to find the scale factor that will allow to use integer as weights scale factor for eltwise // operation. // If the weights scale factor for eltwise sum/sub is not integer, it will cause accuracy degradation. 
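// --- Illustrative sketch (not part of the patch) --------------------------------
// Standalone version of the integer-weights-scale search implemented just below
// for weightable-identity inputs. Given the destination scale of input 0 and the
// source scale of input 1, it looks for an integer multiplier i such that
// (i * scaleIn1Src) / scaleIn0Dst is as close to an integer as possible, so the
// eltwise weights scale adds no extra rounding error. MAX_VAL_2B_FEAT (16384)
// matches quantization.h; the plugin compares the error with fp32eq(), a plain
// comparison is used here for brevity.
#include <cmath>
#include <cstdint>
#include <limits>

static float findBestEltwiseMultiplier(float scaleIn0Dst, float scaleIn1Src) {
    float bestMultiplier = 0.0f;
    float bestError = std::numeric_limits<float>::max();
    for (size_t i = 16384 /* MAX_VAL_2B_FEAT */; i > 0; --i) {
        const float scaleIn1Dst = i * scaleIn1Src;
        const float weightsScale = scaleIn1Dst / scaleIn0Dst;
        if (weightsScale < 1.0f || weightsScale > std::numeric_limits<int16_t>::max() - 1) {
            continue;  // resulting weights scale must stay in the usable int16 range
        }
        // distance to an integer (truncation is enough since weightsScale is positive)
        const float error = std::abs(weightsScale - static_cast<int64_t>(weightsScale));
        if (error < bestError) {
            bestError = error;
            bestMultiplier = static_cast<float>(i);
        }
        if (error == 0.0f) {
            break;  // exact integer multiplier found
        }
    }
    return bestMultiplier;
}
// The patch then sets the weights scale of input 1 to the best multiplier and its
// destination scale to bestMultiplier * scaleIn1Src, restarting scale propagation.
// ---------------------------------------------------------------------------------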
- if (fakeQuantize && LayerInfo(in1).isWeightableIdentity() && - (prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has16BOutput())) { - auto bestWeightsScale = 0.0f; - auto bestError = static_cast(std::numeric_limits::max()); - auto scaleIn0Dst = quantParams0->_dst_quant.GetScale(); - auto scaleIn1Src = quantParams1->_src_quant.GetScale(); - for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) { - auto scaleIn1Dst = i * scaleIn1Src; - auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst; - if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits::max() - 1) { - continue; - } + if (fakeQuantize) { + auto prevLayerIn1 = CNNNetPrevLayer(in1); + if (LayerInfo(in1).isWeightableIdentity() && + (prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has8BOr16BOutput())) { + auto bestWeightsScale = 0.0f; + auto bestError = static_cast(std::numeric_limits::max()); + auto scaleIn0Dst = quantParams0->_dst_quant.GetScale(); + auto scaleIn1Src = quantParams1->_src_quant.GetScale(); + for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) { + auto scaleIn1Dst = i * scaleIn1Src; + auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst; + if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits::max() - 1) { + continue; + } - auto error = std::abs(eltwiseWeightsScale - static_cast(eltwiseWeightsScale)); - if (error < bestError) { - bestError = error; - bestWeightsScale = i; - } + auto error = std::abs(eltwiseWeightsScale - static_cast(eltwiseWeightsScale)); + if (error < bestError) { + bestError = error; + bestWeightsScale = i; + } - if (fp32eq(error, 0.0f)) { - break; + if (fp32eq(error, 0.0f)) { + break; + } } - } - if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) { - quantParams1->_weights_quant.SetScale(bestWeightsScale); - quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale()); - result = ScaleFactorUpdateResult(in1.get()); - return true; + if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) { + quantParams1->_weights_quant.SetScale(bestWeightsScale); + quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale()); + result = ScaleFactorUpdateResult(in1.get()); + return true; + } } } quantData->_weights_quant.SetScale(weightsScale); quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale()); - // eltwise will always work in int16 - auto maxValue = std::numeric_limits::max() - 1; + // eltwise will work in int16 or int8 if low precision inputs are used + auto maxValue = lowPrecision ? 
(std::numeric_limits::max() - 1) : (std::numeric_limits::max() - 1); if (quantData->_weights_quant.GetScale() > maxValue + 1) { // rescaling it's activation input // iterating thru previous layers of eltwise @@ -709,7 +725,7 @@ class ScaleFactorPerLayer { // this case for input from port 0 if (info.isSplit() || info.isSlice()) { continue; - } else if (info.has16BOutput() && info.isActivation()) { + } else if (info.has8BOr16BOutput() && info.isActivation()) { auto newOutputScale = quantParams->_dst_quant.GetScale() / maxValue; if (newOutputScale > static_cast(std::numeric_limits::max()) / 2) { break; @@ -721,7 +737,7 @@ class ScaleFactorPerLayer { quantDataForActivation->_dst_quant.SetScale(newOutputScale); result = ScaleFactorUpdateResult(in.get()); return true; - } else if (info.has16BOutput()) { + } else if (info.has8BOr16BOutput()) { break; } @@ -767,7 +783,7 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !concatLayer ) { THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n"; } @@ -959,7 +975,7 @@ class ScaleFactorPerLayer { auto prevLayer2 = prevLayer != nullptr ? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional) : nullptr; if (fakeQuantize && prevLayer != nullptr && LayerInfo(prevLayer).isWeightableIdentity() && - (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) { + (prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) { auto weightsScales = generateScaleFactors(MIN_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL - MIN_SEARCH_WEIGHTS_VAL); @@ -999,18 +1015,17 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { private: - float const _scale_reduction_50 = 0.50; - float const _scale_reduction_45 = 0.45; - float const _scale_reduction_40 = 0.40; - float const _scale_reduction_35 = 0.35; - - uint16_t const _scale_change_req_threshold = 30; - uint16_t const _scale_change_threshold_100 = 100; - uint16_t const _scale_change_threshold_150 = 150; - uint16_t const _scale_change_threshold_200 = 200; + std::vector> thresholds { + // tuple values: scale factor threshold, scale factor reduction factor for I16 precision, for I8 precision + std::make_tuple(30, 0.50f, 0.50f), // entry check value + std::make_tuple(100, 0.50f, 0.50f), // if below this threshold, then use this factor + std::make_tuple(150, 0.45f, 0.45f), + std::make_tuple(200, 0.40f, 0.40f), + std::make_tuple(200, 0.35f, 0.35f) // max level -> if above, then use this factor + }; public: - bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !wl ) { THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n"; } else if (!wl->_weights) { @@ -1062,50 +1077,60 @@ class ScaleFactorPerLayer { } if (wl->_biases) { - quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as(), - MAX_VAL_4B_BIAS, - wl->_biases->size())); + // for now the only case of INT8 bias we support comes with INT8 inputs and weights as well + if (inputsSize == 1 && weightsSize == 1) { + 
quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as(), + MAX_VAL_1B_BIAS, + wl->_biases->size())); + } else { + quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as(), + MAX_VAL_4B_BIAS, + wl->_biases->size())); + } if (quant->_bias_quant.GetScale() != -1.0f) { - quant->_bias_quant.SetScale( - std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale())); + // for low precision we don't change bias scale factor based on source and weights scale factors + // in order not to loose too much precision + if (inputsSize != 1 || weightsSize != 1) { + quant->_bias_quant.SetScale( + std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale())); + } quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale()); } } - // TODO: findout why ??? - if (weightsSize == 1) { + // use the MAX_OUT_MULTIPLIER only for int8_t weigths with compound bias (for now handled here only with int16_t inputs) + // it gives the possibility to exetend the output dynamic range + if (weightsSize == 1 && inputsSize == 2) { quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * MAX_OUT_MULTIPLIER); } double weights_reducer = 1.0; auto conv = dynamic_cast(wl); if (conv) { - auto dims = conv->insData.front().lock()->getDims(); - - weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits::max(); + auto channels_num = GetDataDimSize(conv->insData.front().lock(), InferenceEngine::DataDimName::C); + weights_reducer = MAX_VAL_2B_FEAT * scaleRange * channels_num / std::numeric_limits::max(); weights_reducer = std::max(1.0, weights_reducer); } quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weights_reducer); } double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(); - if (weightsSize == 1 && - static_cast(tmp_dst_quant_scale * quant->_src_quant.GetScale()) > - static_cast(std::numeric_limits::max() - 1) * _scale_change_req_threshold) { - gnawarn() << "Output scale for " << wl->name - << " too large and are being reduced. Else saturations likely will happen \n"; - // reduce weight scale according experimental heuristic - if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / - static_cast(std::numeric_limits::max()) < _scale_change_threshold_100) { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_50); - } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / - static_cast(std::numeric_limits::max()) < _scale_change_threshold_150) { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_45); - } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / - static_cast(std::numeric_limits::max()) < _scale_change_threshold_200) { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_40); - } else { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_35); + if (weightsSize == 1) { + auto itt = thresholds.begin(); + auto limit = std::numeric_limits::max(); + + if (inputsSize == 1) { + limit = std::numeric_limits::max(); + } + + if (static_cast(tmp_dst_quant_scale * quant->_src_quant.GetScale()) > + static_cast(limit - 1) * std::get<0>(*itt)) { + gnawarn() << "Output scale for " << wl->name + << " too large and are being reduced. 
Else saturations likely will happen \n"; + // reduce weight scale according experimental heuristic + while ((itt + 1) != thresholds.end() && quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / + static_cast(limit) >= std::get<0>(*(++itt))) {} + quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * (inputsSize == 2 ? std::get<1>(*itt) : std::get<2>(*itt))); } } @@ -1149,17 +1174,10 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer : public ScaleFactorPerLayer { - public: - bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { - return ScaleFactorPerLayer::operator()(wl, 2, result, fakeQuantize); - } }; -/** - * GNA convolutions cannot be quantized in int8, remove when library starts support that - */ template<> -class ScaleFactorPerLayer : public ScaleFactorPerLayer { +class ScaleFactorPerLayer : public ScaleFactorPerLayer { }; @@ -1174,12 +1192,15 @@ class ScaleFactorCalculator { Cnt net; mutable Cnt::const_iterator idx; mutable bool needRestart = false; - int weightsBytesSize; + int mandWeightsBytesSize; + int optWeightsBytesSize; bool isFakeQuantize; + int inputsBytesSize; public: - ScaleFactorCalculator(Cnt &net, int weightsBytesSize, bool fakeQuantize) - : net(net), weightsBytesSize(weightsBytesSize), isFakeQuantize(fakeQuantize) { + ScaleFactorCalculator(Cnt &net, int mandWeightsBytesSize, int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) + : net(net), mandWeightsBytesSize(mandWeightsBytesSize), optWeightsBytesSize(optWeightsBytesSize), + inputsBytesSize(inputsBytesSize), isFakeQuantize(fakeQuantize) { idx = std::begin(this->net); } bool needToRestart() const { @@ -1195,7 +1216,13 @@ class ScaleFactorCalculator { bool operator()(T ptr) const { needRestart = false; frontend::ScaleFactorUpdateResult result; - if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, result, isFakeQuantize)) { + auto weightsBytesSize = mandWeightsBytesSize; + + if (LayerInfo(ptr).isConvolution() || LayerInfo(ptr).isScaleShift()) { + weightsBytesSize = optWeightsBytesSize; + } + + if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, inputsBytesSize, result, isFakeQuantize)) { return false; } if (result) { diff --git a/inference-engine/src/gna_plugin/gna_data_types.hpp b/inference-engine/src/gna_plugin/gna_data_types.hpp index 54a9981b3646ba..330a4c25eb411b 100644 --- a/inference-engine/src/gna_plugin/gna_data_types.hpp +++ b/inference-engine/src/gna_plugin/gna_data_types.hpp @@ -17,9 +17,6 @@ #include "memory/polymorph_allocator.hpp" #include "memory/gna_memory.hpp" -#define FROM_IR_DIM(mem, idx)\ -((mem->getTensorDesc().getDims().size() > (idx) - 1) ? mem->getTensorDesc().getDims()[mem->getTensorDesc().getDims().size() - (idx)] : 1) - struct TranspositionInfo { bool transpose; size_t num_transpose_rows; diff --git a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp index 9d14d647587165..95899982ca3164 100644 --- a/inference-engine/src/gna_plugin/gna_device.cpp +++ b/inference-engine/src/gna_plugin/gna_device.cpp @@ -235,7 +235,7 @@ void GNADeviceHelper::checkGna2Status(Gna2Status status, const Gna2Model& gnaMod ? 
errorReasons.at(reason) : "Unknown Error Reason"; ss << " Reason (" << std::to_string(reason) << "): " << errorReason << "\n"; - ss << " Value (0x" << std::hex << std::to_string(error.Value) << ")"; + ss << " Value (0x" << std::hex << error.Value << ")"; THROW_GNA_EXCEPTION << "\nUnsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ss.str() << diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index f4a37b930e4bf7..03f901eea27be1 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -35,6 +35,7 @@ #include "round_float_define.hpp" #include "gna_plugin_policy.hpp" #include "gna_groups.hpp" +#include "backend/gna_limitations.hpp" using namespace InferenceEngine; using namespace std; @@ -243,17 +244,15 @@ void GNAGraphCompiler::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) const auto outputs = layer->outData.front(); assertConvolutionLayoutProper(inputs); - const auto in_order = getFromIRDimsOrderNCHW(inputs->getLayout()); - const auto in_batch = static_cast(FROM_IR_DIM(inputs, in_order[0])); - const auto in_channels = static_cast(FROM_IR_DIM(inputs, in_order[1])); - auto in_height = static_cast(FROM_IR_DIM(inputs, in_order[2])); - auto in_width = static_cast(FROM_IR_DIM(inputs, in_order[3])); + const auto in_batch = GetDataDimSize(inputs, InferenceEngine::DataDimName::N); + const auto in_channels = GetDataDimSize(inputs, InferenceEngine::DataDimName::C); + auto in_height = GetDataDimSize(inputs, InferenceEngine::DataDimName::H); + auto in_width = GetDataDimSize(inputs, InferenceEngine::DataDimName::W); - const auto out_order = getFromIRDimsOrderNCHW(outputs->getLayout()); - const auto out_batch = static_cast(FROM_IR_DIM(outputs, out_order[0])); - const auto out_channels = static_cast(FROM_IR_DIM(outputs, out_order[1])); - auto out_height = static_cast(FROM_IR_DIM(outputs, out_order[2])); - auto out_width = static_cast(FROM_IR_DIM(outputs, out_order[3])); + const auto out_batch = GetDataDimSize(outputs, InferenceEngine::DataDimName::N); + const auto out_channels = GetDataDimSize(outputs, InferenceEngine::DataDimName::C); + auto out_height = GetDataDimSize(outputs, InferenceEngine::DataDimName::H); + auto out_width = GetDataDimSize(outputs, InferenceEngine::DataDimName::W); if (in_height > 1 && in_width == 1) { std::swap(in_height, in_width); @@ -300,25 +299,25 @@ void GNAGraphCompiler::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) // TODO: refine following condition if (((in_channels > 1) && (in_height > 1) && (in_width > 1)) || // 3D input - (convolution._kernel_x != 1 && convolution._kernel_y != 1 && convolution._kernel_y != in_channels) || // 2D kernel - (inputs->getLayout() != Layout::NHWC && in_height != 1)) { + (convolution._kernel_x != 1 && convolution._kernel_y != 1) || // 2D kernel + in_height != 1) { // TensorFlow default layout is NHWC // OpenVino Default layout is NCHW // GNA Convolution input is NHCW // When layer layout is in NHWC it means that is was created by PassManager #if GNA_LIB_VER == 2 return finalizeConvolution2DPrimitive(layer, in_batch, in_channels, in_height, in_width, - out_batch, out_channels, out_height, out_width); + out_batch, out_channels, out_height, out_width); #endif THROW_GNA_LAYER_EXCEPTION(layer) << "Convolution 2D is not supported on GNA 1.0 library"; } - finalizeConvolution1DPrimitive(layer, in_batch, in_channels, in_height, in_width, - out_batch, 
out_channels, out_height, out_width); + finalizeConvolution1DPrimitive(layer, in_batch, in_channels, in_width, + out_batch, out_channels, out_width); } void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerPtr layer, - uint32_t in_batch, uint32_t in_channels, uint32_t in_height, uint32_t in_width, - uint32_t out_batch, uint32_t out_channels, uint32_t out_height, uint32_t out_width) { + uint32_t in_batch, uint32_t in_channels, uint32_t in_width, + uint32_t out_batch, uint32_t out_channels, uint32_t out_width) { auto& convolution = dynamic_cast(*layer.get()); printConvolutionLayer(convolution); @@ -331,18 +330,15 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP THROW_GNA_LAYER_EXCEPTION(&convolution) << "Padding isn't supported by GNA"; } - std::size_t calculated_out_width = (in_width * in_height - convolution._kernel_x + 2 * convolution._padding_x) / convolution._stride_x + 1; - if (out_width * in_height != calculated_out_width) { + std::size_t calculated_out_width = (in_width - convolution._kernel_x + 2 * convolution._padding_x) / convolution._stride_x + 1; + if (out_width != calculated_out_width) { THROW_GNA_LAYER_EXCEPTION(&convolution) << "Invalid output configuration. " - << calculated_out_width << " != " << out_width * in_height; + << calculated_out_width << " != " << out_width; } - uint32_t total_conv_kernel_size = convolution._kernel_x * convolution._kernel_y * convolution._out_depth; - uint32_t single_conv_kernel_size = convolution._kernel_x * convolution._kernel_y; - if (convolution._kernel_y != in_channels) { // work around the strange special case where 1D kernel gets rewritten as 2D kernel - total_conv_kernel_size *= in_channels; - single_conv_kernel_size *= in_channels; - } + IE_ASSERT(convolution._kernel_y == 1); + uint32_t total_conv_kernel_size = convolution._kernel_x * convolution._out_depth * in_channels; + uint32_t single_conv_kernel_size = convolution._kernel_x * in_channels; auto actual_kernel_size = details::product(convolution._weights->getTensorDesc().getDims()); if (total_conv_kernel_size != actual_kernel_size) { THROW_GNA_LAYER_EXCEPTION(&convolution) << "Weights size does not equal kernel size " @@ -358,17 +354,17 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP } // have to pad input to let last kernel meets it's corresponding input - uint32_t num_inputs = in_width * in_height * in_channels; + uint32_t num_inputs = in_width * in_channels; uint32_t num_input_padding = ALIGN(num_inputs, 8) - num_inputs; // convert to 2D and set GNA input feature map size uint32_t num_feature_map_columns = in_channels * convolution._stride_x * convolution._stride_y; - if (in_height == 1 && convolution._stride_y != 1) { + if (convolution._stride_y != 1) { num_feature_map_columns = in_channels * convolution._stride_x; } else if (in_width == 1 && convolution._stride_x != 1) { num_feature_map_columns = in_channels * convolution._stride_y; } - uint32_t num_feature_map_rows = (in_channels * in_height * in_width) / num_feature_map_columns; + uint32_t num_feature_map_rows = (in_channels * in_width) / num_feature_map_columns; uint32_t num_filters = convolution._out_depth; uint32_t num_filter_coefficients = single_conv_kernel_size + num_conv_kernel_padding; @@ -383,7 +379,7 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP uint32_t additional_padding = 0; // if kernel padding to multiple of 8 will cause missed outputs, need to pad further - while (num_columns_out < 
out_batch * out_channels * out_height * out_width) { + while (num_columns_out < out_batch * out_channels * out_width) { num_input_padding = original_input_padding + additional_padding; num_feature_map_rows = original_num_feature_map_rows + (num_input_padding) / num_feature_map_columns; num_columns_in = num_inputs + num_input_padding; @@ -398,9 +394,9 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP gnalog() << LAYER_NAME(&convolution) << "Inputs padding is " << num_input_padding << "\n"; } - if (num_columns_out_unpadded != out_batch * out_channels * out_height * out_width) { + if (num_columns_out_unpadded != out_batch * out_channels * out_width) { THROW_GNA_LAYER_EXCEPTION(&convolution) << "Number of output columns does not equal output tensor size " - << num_columns_out_unpadded << " vs " << out_batch * out_channels * out_height * out_width; + << num_columns_out_unpadded << " vs " << out_batch * out_channels * out_width; } void* ptr_inputs = nullptr; @@ -778,17 +774,19 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { } ptr_pwl_segments.resize(num_segments); - PwlDesign16(activation_type, + PwlDesign(activation_type, &*ptr_pwl_segments.begin(), static_cast(ptr_pwl_segments.size()), input_pwl_scale_factor, - output_pwl_scale_factor); + output_pwl_scale_factor, + gnaFlags->input_low_precision); } else { - PwlDesignOpt16(activation_type, + PwlDesignOpt(activation_type, ptr_pwl_segments, input_pwl_scale_factor, output_pwl_scale_factor, - gnaFlags->pwlMaxErrorPercent); + gnaFlags->pwlMaxErrorPercent, + gnaFlags->input_low_precision); } } @@ -833,15 +831,13 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { auto inputs = layer->insData.begin()->lock(); auto outputs = *layer->outData.begin(); - const auto in_order = getFromIRDimsOrderNCHW(inputs->getLayout()); - uint32_t w_dim_in = FROM_IR_DIM(inputs, in_order[3]); - uint32_t h_dim_in = FROM_IR_DIM(inputs, in_order[2]); - const uint32_t c_dim_in = FROM_IR_DIM(inputs, in_order[1]); + uint32_t w_dim_in = GetDataDimSize(inputs, InferenceEngine::DataDimName::W); + uint32_t h_dim_in = GetDataDimSize(inputs, InferenceEngine::DataDimName::H); + const uint32_t c_dim_in = GetDataDimSize(inputs, InferenceEngine::DataDimName::C); - const auto out_order = getFromIRDimsOrderNCHW(outputs->getLayout()); - uint32_t w_dim_out = FROM_IR_DIM(outputs, out_order[3]); - uint32_t h_dim_out = FROM_IR_DIM(outputs, out_order[2]); - const uint32_t c_dim_out = FROM_IR_DIM(outputs, out_order[1]); + uint32_t w_dim_out = GetDataDimSize(outputs, InferenceEngine::DataDimName::W); + uint32_t h_dim_out = GetDataDimSize(outputs, InferenceEngine::DataDimName::H); + const uint32_t c_dim_out = GetDataDimSize(outputs, InferenceEngine::DataDimName::C); if (w_dim_in == 1) { // swap dimensions if needed to support swapped 1D case swap(h_dim_in, w_dim_in); @@ -1029,7 +1025,7 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { std::vector axis, dim, offset; for (int n = 0; n < cropLayer->axis.size(); n++) { - uint32_t input_dim = FROM_IR_DIM(inputs, inputs->getDims().size() - cropLayer->axis[n]); + uint32_t input_dim = GetDataDimSize(inputs, inputs->getDims().size() - cropLayer->axis[n]); // Exclude crop layer components that do nothing if (cropLayer->offset[n] == 0 && cropLayer->dim[n] == input_dim) { continue; @@ -1088,10 +1084,10 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { } // TODO: add unit tests for 4d crops blobs - uint32_t 
num_rows_in = FROM_IR_DIM(inputs, inputs->getDims().size() - axis.front()); + uint32_t num_rows_in = GetDataDimSize(inputs, inputs->getDims().size() - axis.front()); uint32_t num_columns_in = 1; - uint32_t num_rows_out = FROM_IR_DIM(outputs, inputs->getDims().size() - axis.front()); + uint32_t num_rows_out = GetDataDimSize(outputs, inputs->getDims().size() - axis.front()); uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; void* ptr_inputs = nullptr; @@ -1146,8 +1142,11 @@ void GNAGraphCompiler::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) { void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { auto& eltwise = dynamic_cast(*layer.get()); auto quantized = InferenceEngine::getInjectedData(layer); + uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ? + GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor; - // for eltwise should be one input of 4 bytes and one of 2 bytes - detecting that + // for eltwise sum/sub in 16-bit precision one input should be 4 bytes and one 2 bytes - detecting that below + // the names of variables are left for clarity although not always reflecting the real precision/size auto inputs2Bytes = layer->insData[0].lock(); auto inputs4Bytes = layer->insData[1].lock(); @@ -1158,19 +1157,32 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { case InferenceEngine::EltwiseLayer::Sum: case InferenceEngine::EltwiseLayer::Sub: { - if (inputs4Bytes->getPrecision().size() != 4) { - std::swap(inputs4Bytes, inputs2Bytes); - biasesLayerIdx = 0; + if (gnaFlags->input_low_precision == false) { + if (inputs4Bytes->getPrecision().size() != 4) { + std::swap(inputs4Bytes, inputs2Bytes); + biasesLayerIdx = 0; + } + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 4); + } else { + // for low precision both inputs should be 1 bytes in size + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 1); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 1); } - GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); - GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 4); break; } case InferenceEngine::EltwiseLayer::Prod: { - // for mul both inputs should be 2 bytes precision - GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); - GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 2); + if (gnaFlags->input_low_precision == false) { + // for mul both inputs should be 2 bytes precision + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 2); + } else { + // for mul both inputs should be 1 byte precision + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 1); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 1); + } + break; } default: @@ -1180,32 +1192,31 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { auto outputs = *layer->outData.begin(); - auto in_4b_order = getFromIRDimsOrderNCHW(inputs4Bytes->getLayout()); - auto in_4b_batch = FROM_IR_DIM(inputs4Bytes, in_4b_order[0]); - auto in_4b_channels = FROM_IR_DIM(inputs4Bytes, in_4b_order[1]); - auto in_4b_height = FROM_IR_DIM(inputs4Bytes, in_4b_order[2]); - auto in_4b_width = FROM_IR_DIM(inputs4Bytes, in_4b_order[3]); + auto in_4b_batch = GetDataDimSize(inputs4Bytes, InferenceEngine::DataDimName::N); + auto in_4b_channels = GetDataDimSize(inputs4Bytes, 
InferenceEngine::DataDimName::C); + auto in_4b_height = GetDataDimSize(inputs4Bytes, InferenceEngine::DataDimName::H); + auto in_4b_width = GetDataDimSize(inputs4Bytes, InferenceEngine::DataDimName::W); auto in_4b_total_size = in_4b_batch * in_4b_channels * in_4b_height * in_4b_width; - auto in_2b_order = getFromIRDimsOrderNCHW(inputs2Bytes->getLayout()); - auto in_2b_batch = FROM_IR_DIM(inputs2Bytes, in_2b_order[0]); - auto in_2b_channels = FROM_IR_DIM(inputs2Bytes, in_2b_order[1]); - auto in_2b_height = FROM_IR_DIM(inputs2Bytes, in_2b_order[2]); - auto in_2b_width = FROM_IR_DIM(inputs2Bytes, in_2b_order[3]); + auto in_2b_batch = GetDataDimSize(inputs2Bytes, InferenceEngine::DataDimName::N); + auto in_2b_channels = GetDataDimSize(inputs2Bytes, InferenceEngine::DataDimName::C); + auto in_2b_height = GetDataDimSize(inputs2Bytes, InferenceEngine::DataDimName::H); + auto in_2b_width = GetDataDimSize(inputs2Bytes, InferenceEngine::DataDimName::W); auto in_2b_total_size = in_2b_batch * in_2b_channels * in_2b_height * in_2b_width; - if ((in_2b_batch > 1) || (in_4b_batch > 1)) { - THROW_GNA_LAYER_EXCEPTION(layer) << " Inputs with batch size that not equals 1 is not supported"; + if (in_2b_batch != in_4b_batch) { + THROW_GNA_LAYER_EXCEPTION(layer) << " Inputs with different batch sizes are not supported"; } if (in_4b_total_size != in_2b_total_size) { THROW_GNA_LAYER_EXCEPTION(layer) << " Inputs size mismatch " << in_4b_total_size << " != " << in_2b_total_size; } - uint32_t num_rows_in = in_4b_channels * in_4b_height * in_4b_width; - uint32_t num_columns_in = in_4b_batch; + // If batch size > 1 the data is reshaped to one with batch size = 1 + uint32_t num_rows_in = in_4b_total_size; + uint32_t num_columns_in = 1; uint32_t num_rows_out = num_rows_in; - uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in; void* ptr_inputs = nullptr; void* ptr_outputs = nullptr; @@ -1220,8 +1231,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { inputs2Bytes->getPrecision().size(), outputs->getPrecision().size(), // TODO: only fp32 and Int16 tested - quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2, - quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4, + quantized == nullptr ? inputs2Bytes->getPrecision().size() : (!gnaFlags->input_low_precision ? 2 : 1), + quantized == nullptr ? inputs4Bytes->getPrecision().size() : (!gnaFlags->input_low_precision ? 4 : 1), quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(), quantized == nullptr ? 
1 : quantized->_dst_quant.GetScale(), ptr_inputs, @@ -1246,9 +1257,15 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { } else { auto scaledIdentity = -quantized->_weights_quant.GetScale(); - auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + if (gnaFlags->input_low_precision == false) { + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } else { + auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast(INT8_MAX))); + + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); break; @@ -1258,9 +1275,15 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { } else { auto scaledIdentity = quantized->_weights_quant.GetScale(); - auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + if (gnaFlags->input_low_precision == false) { + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } else { + auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast(INT8_MAX))); + + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); break; @@ -1269,7 +1292,11 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { if (quantized == nullptr) { gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); } else { - gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + if (gnaFlags->input_low_precision == false) { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } } connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx); break; @@ -1287,15 +1314,25 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool IE_ASSERT(!layer->outData.empty()); auto inputs = layer->insData.begin()->lock(); auto outputs = *layer->outData.begin(); - auto inputPrecision = quantized ? Precision(Precision::I16) : inputs->getPrecision(); + Precision inputPrecision; + uint32_t noOfInputsDivisor = GNALimitations::noOfInputsDivisor; + + if (!quantized) { + inputPrecision = inputs->getPrecision(); + } else if (gnaFlags->input_low_precision == false) { + inputPrecision = Precision(Precision::I16); + } else { + inputPrecision = Precision(Precision::I8); + noOfInputsDivisor = GNALimitations::noOfInputsLowPrecDivisor; + } auto input_data = HasTo2DReshapeData(layer) ? Get2DReshapedData(inputs, 8) : inputs; auto in_dims = input_data->getDims(); auto batch_size = (in_dims.size() == 1) ? 1 : in_dims.front(); uint32_t num_rows_in = InferenceEngine::details::product(in_dims) / batch_size; uint32_t num_columns_in = batch_size; - uint32_t num_rows_out = isDiag ? num_rows_in : FROM_IR_DIM(outputs, 1); - uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + uint32_t num_rows_out = isDiag ? 
num_rows_in : GetDataDimSize(outputs, 1); + uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in; uint32_t num_padding_out = isDiag ? num_padding : 0; void* ptr_inputs = nullptr; @@ -1481,8 +1518,8 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l auto outputs = *layer->outData.begin(); auto inputs = layer->insData.begin()->lock(); - uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); - uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_columns_in = GetDataDimSize(inputs, 2); + uint32_t num_rows_out = GetDataDimSize(outputs, 1); uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out; uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; @@ -1617,8 +1654,8 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) auto outputs = *layer->outData.begin(); auto inputs = layer->insData.begin()->lock(); - uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); - uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_columns_in = GetDataDimSize(inputs, 2); + uint32_t num_rows_out = GetDataDimSize(outputs, 1); uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out; uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; @@ -1718,16 +1755,16 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { auto orientation = kDnnInterleavedOrientation; if (inputs->getDims().size() == 4) { - uint32_t w_dim_in = FROM_IR_DIM(inputs, 1); - uint32_t h_dim_in = FROM_IR_DIM(inputs, 2); - uint32_t c_dim_in = FROM_IR_DIM(inputs, 3); - uint32_t b_dim_in = FROM_IR_DIM(inputs, 4); + uint32_t w_dim_in = GetDataDimSize(inputs, 1); + uint32_t h_dim_in = GetDataDimSize(inputs, 2); + uint32_t c_dim_in = GetDataDimSize(inputs, 3); + uint32_t b_dim_in = GetDataDimSize(inputs, 4); num_columns = (w_dim_in == 1) ? h_dim_in * c_dim_in * b_dim_in : w_dim_in * c_dim_in * b_dim_in; num_rows = (w_dim_in == 1) ? w_dim_in : h_dim_in; } else { - num_columns = FROM_IR_DIM(inputs, 2); - num_rows = FROM_IR_DIM(inputs, 1); + num_columns = GetDataDimSize(inputs, 2); + num_rows = GetDataDimSize(inputs, 1); } if (dnn->new_num_conv_columns) { @@ -1869,17 +1906,19 @@ case name:\ default: THROW_GNA_EXCEPTION << "Activation function type not yet supported " << activation_type; } - PwlDesign16(activation_type, + PwlDesign(activation_type, &*ptr_pwl_segments.begin(), static_cast(ptr_pwl_segments.size()), input_pwl_scale_factor, - output_pwl_scale_factor); + output_pwl_scale_factor, + gnaFlags->input_low_precision); } else { - PwlDesignOpt16(activation_type, + PwlDesignOpt(activation_type, ptr_pwl_segments, input_pwl_scale_factor, output_pwl_scale_factor, - gnaFlags->pwlMaxErrorPercent); + gnaFlags->pwlMaxErrorPercent, + gnaFlags->input_low_precision); } ptr_pwl_segments_target = reinterpret_cast(&ptr_pwl_segments_target); } @@ -2238,8 +2277,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, // if request for allocation less that realTensorInput - we need to extend request auto minInput = inputDesc->minBytesRequiredForStoreInput(prevLayer); if (num_data_bytes_in < minInput) { - gnalog() << "[INPUT] : requested bytes: " << num_data_bytes_in << ", extended to" << ALIGN(minInput, 8); - num_data_bytes_in = ALIGN(minInput, 8); + uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ? 
GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor; + gnalog() << "[INPUT] : requested bytes: " << num_data_bytes_in << ", extended to" << ALIGN(minInput, noOfInputsDivisor); + num_data_bytes_in = ALIGN(minInput, noOfInputsDivisor); } // real allocation pointer will be kept in ptr not in ptr_inputs_global @@ -2460,17 +2500,3 @@ GNAGraphCompiler::transposeMatrix(uint8_t* ptr_matrix, size_t element_size, uint } return temp_buffer; } - -std::vector GNAGraphCompiler::getFromIRDimsOrderNCHW(InferenceEngine::Layout layout) { - std::vector order; - switch (layout) { - case Layout::NHWC: - order = { 4, 1, 3, 2 }; - break; - case Layout::NCHW: - default: - order = { 4, 3, 2, 1 }; - break; - } - return order; -} diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.hpp b/inference-engine/src/gna_plugin/gna_graph_compiler.hpp index 10a3a85b1744a1..a7099d0d1f6d8c 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.hpp @@ -50,7 +50,6 @@ class GNAGraphCompiler { static void printPoolingLayer(const InferenceEngine::PoolingLayer& layer); static void assertConvolutionLayoutProper(const InferenceEngine::DataPtr&); std::vector static transposeMatrix(uint8_t* ptr_matrix, size_t element_size, uint32_t num_rows, uint32_t num_cols); - std::vector static getFromIRDimsOrderNCHW(InferenceEngine::Layout layout); public: GNAPluginNS::backend::DnnComponents dnnComponents; @@ -127,8 +126,8 @@ class GNAGraphCompiler { void CopyPrimitive(InferenceEngine::CNNLayerPtr); void finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerPtr, - uint32_t in_batch, uint32_t in_channels, uint32_t in_height, uint32_t in_width, - uint32_t out_batch, uint32_t out_channels, uint32_t out_height, uint32_t out_width); + uint32_t in_batch, uint32_t in_channels, uint32_t in_width, + uint32_t out_batch, uint32_t out_channels, uint32_t out_width); #if GNA_LIB_VER == 2 void finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerPtr, uint32_t in_batch, uint32_t in_channels, uint32_t in_height, uint32_t in_width, diff --git a/inference-engine/src/gna_plugin/gna_graph_patterns.hpp b/inference-engine/src/gna_plugin/gna_graph_patterns.hpp index ad76391dd07eef..eed44b0ef35324 100644 --- a/inference-engine/src/gna_plugin/gna_graph_patterns.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_patterns.hpp @@ -133,16 +133,21 @@ inline bool MustBeConvertedFromNCHWToNHWC(const std::vector FindTranspositionInfoFromPrevLayers(InferenceEngine::CNNLayerPtr layer) { std::function(InferenceEngine::CNNLayerPtr)> findTranspositionInfoRecursive = [&findTranspositionInfoRecursive](InferenceEngine::CNNLayerPtr layer) -> std::vector { + auto getTransposeInfoFromData = [](InferenceEngine::DataPtr data, bool transpose = true) { + auto rows = InferenceEngine::GetDataDimSize(data, InferenceEngine::DataDimName::C); + auto columns = InferenceEngine::GetDataDimSize(data, InferenceEngine::DataDimName::H) * + InferenceEngine::GetDataDimSize(data, InferenceEngine::DataDimName::W); + return std::vector{{transpose, rows, columns}}; + }; if (LayerInfo(layer).isConvolution() || LayerInfo(layer).isPooling()) { - auto out_dims = layer->outData[0]->getDims(); - return {{true, out_dims[1], out_dims[2] * out_dims[3]}}; + return getTransposeInfoFromData(layer->outData[0]); } /* If a fullyconnected or input layers are reached, it means that transposition isn't needed, but we should keep @@ -160,6 +165,46 @@ inline std::vector FindTranspositionInfoFromPrevLayers(Infere return 
findTranspositionInfoRecursive(input1); } + /* If it's a concat along not channel axis and its inputs are transposed the whole concat output must be transposed, + * otherwise every part corresponding to some input must be transposed separately */ + if (LayerInfo(layer).isConcat() && !layer->insData.empty()) { + auto concatLayer = LayerInfo(layer).as(); + IE_ASSERT(concatLayer != nullptr); + if (concatLayer->_axis > 1) { + for (const auto& input : layer->insData) { + auto in_dims = input.lock()->getDims(); + if (in_dims.size() <= 2) { + THROW_GNA_EXCEPTION << layer->name << " Invalid number of input dimensions " << in_dims.size() + << " for a concat with axis=" << concatLayer->_axis; + } + if (concatLayer->_axis == in_dims.size() - 1 && in_dims[in_dims.size() - 2] > 1) { + std::ostringstream in_dims_oss; + std::copy(in_dims.begin(), in_dims.end(), std::ostream_iterator(in_dims_oss, ",")); + THROW_GNA_EXCEPTION << layer->name << " Unsupported concatenation axis=" << concatLayer->_axis + << " for input dimensions: " << in_dims_oss.str(); + } + } + // Check if non-const inputs are transposed + bool transpose = false; + int nonConstInputIx = 0; + for (int i = 0; InferenceEngine::CNNNetHasPrevLayer(layer.get(), i); ++i) { + auto input = InferenceEngine::CNNNetPrevLayer(layer, i); + if (LayerInfo(input).isConst()) continue; + auto transpositionInfo = FindTranspositionInfoFromPrevLayers(input); + auto partToTranspose = std::find_if(std::begin(transpositionInfo), std::end(transpositionInfo), + [](const TranspositionInfo &infoPart) { return infoPart.transpose; }); + bool inputTranspose = (partToTranspose != std::end(transpositionInfo)); + if (nonConstInputIx == 0) { + transpose = inputTranspose; + } else if (inputTranspose != transpose) { + THROW_GNA_EXCEPTION << layer->name << " concat has inputs with different layouts"; + } + ++nonConstInputIx; + } + return getTransposeInfoFromData(layer->outData[0], transpose); + } + } + std::vector transpositionInfo; for (int idx = 0; idx < layer->insData.size(); ++idx) { if (!InferenceEngine::CNNNetHasPrevLayer(layer.get(), idx)) continue; @@ -169,8 +214,8 @@ inline std::vector FindTranspositionInfoFromPrevLayers(Infere auto in_dims = layer->insData[idx].lock()->getDims(); transpositionInfo.push_back({false, 1, InferenceEngine::details::product(std::begin(in_dims), std::end(in_dims))}); } else if (LayerInfo(layer).isConcat() && LayerInfo(inputLayer).isConst()) { - // If a concat input is a const we should keep its size to skip this part during transposition auto in_dims = layer->insData[idx].lock()->getDims(); + // We should keep its size to skip this part during transposition auto data_size = InferenceEngine::details::product(std::begin(in_dims), std::end(in_dims)); transpositionInfo.push_back({false, 1, data_size}); } else { @@ -184,16 +229,18 @@ inline std::vector FindTranspositionInfoFromPrevLayers(Infere } /** - * @brief returns rotation information for a layer based on the next convolution layer dimensions order - * @param layer layer from which rotation info search must be started - * @return bool value which identifies if rotation info is found and rotation information + * @brief returns transposition information for a layer based on the next convolution layer dimensions order + * @param layer layer from which transposition info search must be started + * @return bool value which identifies if transposition info is found and transposition information */ inline std::vector FindTranspositionInfoFromNextLayers(InferenceEngine::CNNLayerPtr layer) { 
std::function(InferenceEngine::CNNLayerPtr)> findTranspositionInfoRecursive = [&findTranspositionInfoRecursive](InferenceEngine::CNNLayerPtr layer) -> std::vector { if (LayerInfo(layer).isConvolution()) { - auto in_dims = layer->input()->getDims(); - return {{true, in_dims[1], in_dims[2] * in_dims[3]}}; + auto rows = InferenceEngine::GetDataDimSize(layer->input(), InferenceEngine::DataDimName::C); + auto columns = InferenceEngine::GetDataDimSize(layer->input(), InferenceEngine::DataDimName::H) * + InferenceEngine::GetDataDimSize(layer->input(), InferenceEngine::DataDimName::W); + return {{true, rows, columns}}; } /* If a fullyconnected or output layers are reached, it means that transposition isn't needed, but we should keep diff --git a/inference-engine/src/gna_plugin/gna_graph_tools.hpp b/inference-engine/src/gna_plugin/gna_graph_tools.hpp index 95546c41662742..e089d5269cf128 100644 --- a/inference-engine/src/gna_plugin/gna_graph_tools.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_tools.hpp @@ -779,4 +779,53 @@ inline void CNNNetworkReconnectLayer(CNNLayerPtr old_prev_layer, CNNLayerPtr new } } +/** + * @brief returns a size of a specified data dimension depending on its back offset + * @param data a pointer to the data + * @param backOffset back dimension offset + */ +inline uint32_t GetDataDimSize(InferenceEngine::DataPtr data, uint32_t backOffset) { + auto dims = data->getDims(); + return (dims.size() > backOffset - 1) ? dims[dims.size() - backOffset] : 1; +} + +enum class DataDimName { + N, C, H, W +}; + +/** + * @brief returns a size of a specified data dimension depending on the layout + * @param data a pointer to the data + * @param dimName dimension name + */ +inline uint32_t GetDataDimSize(InferenceEngine::DataPtr data, DataDimName dimName) { + uint32_t dimIxInNCHW = static_cast(dimName); + IE_ASSERT(dimIxInNCHW <= 3); + + std::vector backOffsets; + switch (data->getLayout()) { + case Layout::C: + // 1 will be returned for offsets > 1 + backOffsets = std::vector{1, 2, 3, 4}; + break; + case Layout::NC: + // 1 will be returned for offsets > 2 + backOffsets = std::vector{2, 1, 3, 4}; + break; + case Layout::HWC: + // 1 will be returned for offset 4 + case Layout::NHWC: + backOffsets = std::vector{4, 1, 3, 2}; + break; + case Layout::CHW: + // 1 will be returned for offset 4 + case Layout::NCHW: + backOffsets = std::vector{4, 3, 2, 1}; + break; + default: + THROW_GNA_EXCEPTION << data->getName() << " Unexpected layout " << data->getLayout(); + } + return GetDataDimSize(data, backOffsets[dimIxInNCHW]); +} + } // namespace InferenceEngine diff --git a/inference-engine/src/gna_plugin/gna_groups.hpp b/inference-engine/src/gna_plugin/gna_groups.hpp index 30faed4673123b..1844be79fc6461 100644 --- a/inference-engine/src/gna_plugin/gna_groups.hpp +++ b/inference-engine/src/gna_plugin/gna_groups.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "gna_graph_tools.hpp" #include "gna_plugin_log.hpp" #include "layers/gna_layer_info.hpp" @@ -54,6 +55,14 @@ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) { if (layer->name.rfind("SyntheticScaleShift", 0) == std::string::npos) return false; + // Don't reshape the first dnn layer since it breaks groups recognition + auto prevLayer = InferenceEngine::CNNNetPrevLayerSkipCertain(layer, 0, [](InferenceEngine::CNNLayerPtr ptr) { + return LayerInfo(ptr).isNonValuesChangable(); + }); + IE_ASSERT(prevLayer != nullptr); + if (LayerInfo(prevLayer).isInput()) + return false; + // Don't reshape diagonallayers with bias connection 
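// --- Illustrative sketch (not part of the patch) --------------------------------
// Simplified stand-in for the GetDataDimSize() helpers added to gna_graph_tools.hpp
// above: a dimension is addressed by its back offset from the end of the dims
// vector, and the offset for N/C/H/W depends on the layout. Layout names are
// plain strings here instead of InferenceEngine::Layout; missing dimensions
// resolve to 1, as in the patch.
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

enum class Dim { N = 0, C = 1, H = 2, W = 3 };

static uint32_t dimSizeByBackOffset(const std::vector<size_t>& dims, uint32_t backOffset) {
    return (dims.size() > backOffset - 1) ? static_cast<uint32_t>(dims[dims.size() - backOffset]) : 1;
}

static uint32_t dimSize(const std::vector<size_t>& dims, const std::string& layout, Dim dim) {
    // back offsets for {N, C, H, W} per layout, mirroring the switch in the patch
    static const std::map<std::string, std::vector<uint32_t>> offsets = {
        {"C",    {1, 2, 3, 4}},
        {"NC",   {2, 1, 3, 4}},
        {"HWC",  {4, 1, 3, 2}},
        {"NHWC", {4, 1, 3, 2}},
        {"CHW",  {4, 3, 2, 1}},
        {"NCHW", {4, 3, 2, 1}},
    };
    const auto it = offsets.find(layout);
    if (it == offsets.end()) throw std::runtime_error("Unexpected layout " + layout);
    return dimSizeByBackOffset(dims, it->second[static_cast<uint32_t>(dim)]);
}
// e.g. for dims {1, 8, 16, 32} in NHWC layout, dimSize(dims, "NHWC", Dim::C) returns 32.
// ---------------------------------------------------------------------------------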
return !GNAPluginNS::LayerInfo(getCreatorLayer(layer->insData.front().lock()).lock()).has32BOutput(); } diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp index 236d569cd94743..6cc23248a1495e 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.cpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.cpp @@ -413,10 +413,17 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea writeBits(layer.NumberOfOperands, os); for (uint32_t i = 0; i < layer.NumberOfOperands; i++) { - if (layer.Operands[i] == nullptr) + if (layer.Operands[i] == nullptr) { writeBits(Gna2Tensor{}, os); - else - writeBits(getTensorWithProperOffset(*layer.Operands[i]), os); + } else { + Gna2Tensor tensor = getTensorWithProperOffset(*layer.Operands[i]); + // we need to remove legacy (up to & including GNA HW 2.0) CNN enforement during export + // to avoid issues when importing and running the model on newer GNA HW with libGNA 2.1.x.y + if (i == OutOpIdx && layer.Type == Gna2OperationTypeConvolution) { + memset(tensor.Layout, 0, sizeof(tensor.Layout)); + } + writeBits(tensor, os); + } } writeBits(layer.NumberOfParameters, os); @@ -906,7 +913,7 @@ void GNAModelSerial::ImportTranspositionInfo(std::istream &is, void GNAModelSerial::ExportTranspositionInfo(std::ostream &os, const TranspositionInfoMap &transpositionInfoMap) const { for (const auto &transpositionInfo : transpositionInfoMap) { - auto nameSize = strlen(transpositionInfo.first.c_str()) + 1; + auto nameSize = strlen(transpositionInfo.first.c_str()); writeBits(static_cast(nameSize), os); writeNBytes(transpositionInfo.first.c_str(), nameSize, os); auto fragmentsNum = transpositionInfo.second.size(); diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index f94244889b2405..18b84ec3690b27 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #if GNA_LIB_VER == 2 #include @@ -106,7 +107,11 @@ void GNAPlugin::copyInputData(T *dst, for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_vector_elements; j++) { if (!std::is_same::value) { - dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor); + if (!gnaFlags->input_low_precision) { + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor); + } else { + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt8(src[i * num_vector_elements + j] * scaleFactor); + } } else { dst[j * num_group + i] = src[i * num_vector_elements + j]; } @@ -128,8 +133,14 @@ void GNAPlugin::copyInputData(T *dst, T *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride; const U *ptr_src_vec = reinterpret_cast(src) + i * num_vector_elements; std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); - for (uint32_t j=0; j < num_vector_elements; j++) { - ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor); + if (!gnaFlags->input_low_precision) { + for (uint32_t j = 0; j < num_vector_elements; j++) { + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor); + } + } else { + for (uint32_t j = 0; j < num_vector_elements; j++) { + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt8(ptr_src_vec[j] * scaleFactor); + } } } @@ -217,6 +228,10 @@ void GNAPlugin::ExportScores(void *ptr_dst, auto dst_ptr = dst + (i 
* num_vector_elements + j); switch (num_bytes_per_element_input) { + case 1: { + *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); + break; + } case 2 : { *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); break; @@ -283,21 +298,36 @@ void GNAPlugin::ImportFrames( // TODO : fix that as well if (input_precision == Precision::U8) { auto src = reinterpret_cast(ptr_src); - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } else if (input_precision.size() == 2) { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } else if (input_precision.size() == 4) { if (!gnadevice) { auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } else { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } } } else { @@ -306,24 +336,36 @@ void GNAPlugin::ImportFrames( if (!gnadevice) { auto dst = reinterpret_cast(ptr_dst); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } else { - auto dst = reinterpret_cast(ptr_dst); + auto dst = reinterpret_cast(ptr_dst); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } - } else if (input_precision.size()== 2) { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, 
scaleFactor); + } } else if (input_precision.size() == 4) { if (!gnadevice) { auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } else { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } } } @@ -635,6 +677,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { pass_config->disable(); pass_config->disable(); pass_config->disable(); + // Consider to enable after per-channel quantization on FakeQuantize layer is supported in GNAPlugin, see issue 52034 + pass_config->disable(); manager.run_passes(graph); convertedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, clonedNetwork); } @@ -660,8 +704,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // network optimisation phases int passIdx = 0; - auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy) { - auto passes = make_shared(PassManagerSettings{policy, runBeforeCopy}, network); + auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy, bool lowPrecision) { + auto passes = make_shared(PassManagerSettings{policy, runBeforeCopy, lowPrecision}, network); passes->registerPass(); passes->registerPass(); passes->registerPass(); @@ -713,8 +757,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { }; newNet = InferenceEngine::CNNNetCopy(network, visitor); // to run all passes need to have two calls to pass manager - run_passes(newNet, true); - run_passes(newNet, false); + run_passes(newNet, true, gnaFlags->input_low_precision); + run_passes(newNet, false, gnaFlags->input_low_precision); } else if (gnaFlags->fake_quantized) { switch (config.gnaPrecision) { case Precision::I16: @@ -735,8 +779,13 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors); break; case Precision::I8: - ModelQuantizer q8; - newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); + if (gnaFlags->input_low_precision == false) { + ModelQuantizer q8; + newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); + } else { + ModelQuantizer q8_8; + newNet = q8_8.quantize(network, run_passes, inputsDesc->inputScaleFactors); + } break; default: THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision; @@ -1161,7 +1210,7 @@ uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, Infer auto importedFrames = (is3D || is1D) ? 1 : dims[0]; auto targetGroups = is1D ? 1 : dims[0]; // TODO: no proper support for groups yet - auto importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : 2; + auto importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : (gnaFlags->input_low_precision ? 
1 : 2); auto importedBytes = importedElements * importedFrames * importedElementSizeBytes; if (inputsDesc->bytes_allocated_for_input[input.first] < importedBytes) { @@ -1427,7 +1476,7 @@ void GNAPlugin::SetName(const std::string & pluginName) noexcept { _pluginName = pluginName; } -InferenceEngine::ExecutableNetwork GNAPlugin::ImportNetwork(std::istream& networkModel) { +InferenceEngine::IExecutableNetworkInternal::Ptr GNAPlugin::ImportNetwork(std::istream& networkModel) { auto header = GNAModelSerial::ReadHeader(networkModel); InitGNADevice(); diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp index 611ba2c6912c00..3e54c224746336 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include "cpp_interfaces/impl/ie_variable_state_internal.hpp" #include "descriptions/gna_flags.hpp" #include "descriptions/gna_input_desc.hpp" @@ -104,9 +105,9 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin { void AddExtension(InferenceEngine::IExtensionPtr extension) override; void SetConfig(const std::map &config) override; - InferenceEngine::ExecutableNetwork LoadNetwork(const InferenceEngine::CNNNetwork &network, + InferenceEngine::IExecutableNetworkInternal::Ptr LoadNetwork(const InferenceEngine::CNNNetwork &network, const std::map &config_map) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::ExecutableNetwork LoadNetwork(const InferenceEngine::CNNNetwork &network, + InferenceEngine::IExecutableNetworkInternal::Ptr LoadNetwork(const InferenceEngine::CNNNetwork &network, const std::map &config_map, InferenceEngine::RemoteContext::Ptr context) override { THROW_GNA_EXCEPTION << "Not implemented"; } bool Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result); @@ -130,22 +131,22 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin { void Export(const std::string &fileName); void Export(std::ostream &networkModel); - InferenceEngine::ExecutableNetwork ImportNetwork(const std::string &modelFileName, + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(const std::string &modelFileName, const std::map &config) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& networkModel, const InferenceEngine::RemoteContext::Ptr& context, const std::map &config) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& networkModel, const std::map& config) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel); + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& networkModel); /** * utility to provide input and output blobs externally to be used by InferenceEngine request API clients diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp index f6e48fb04b2583..502c2cbe1b8f49 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.hpp @@ -49,7 +49,7 @@ struct Config { std::string GetParameter(const std::string& name) const; std::vector 
GetSupportedKeys() const; - // precision of GNA hardware model + // default precision of GNA hardware model (see QuantI16 quantizer struct) InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; std::string dumpXNNPath; diff --git a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp index 9a57dbc123d457..7203382ca52012 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp @@ -46,7 +46,7 @@ class GNAPluginInternal : public InferenceEngine::InferencePluginInternal { defaultConfig.UpdateFromMap(config); } - InferenceEngine::ExecutableNetwork ImportNetwork( + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork( const std::string &modelFileName, const std::map &config) override { Config updated_config(defaultConfig); @@ -54,20 +54,18 @@ class GNAPluginInternal : public InferenceEngine::InferencePluginInternal { auto plg = std::make_shared(updated_config.keyConfigMap); plgPtr = plg; - return make_executable_network(std::make_shared(modelFileName, plg)); + return std::make_shared(modelFileName, plg); } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& networkModel, const std::map& config) override { Config updated_config(defaultConfig); updated_config.UpdateFromMap(config); auto plg = std::make_shared(updated_config.keyConfigMap); plgPtr = plg; - return make_executable_network(std::make_shared(networkModel, plg)); + return std::make_shared(networkModel, plg); } - using InferenceEngine::InferencePluginInternal::ImportNetwork; - std::string GetName() const noexcept override { return GetCurrentPlugin()->GetName(); } diff --git a/inference-engine/src/gna_plugin/gna_plugin_query_api.cpp b/inference-engine/src/gna_plugin/gna_plugin_query_api.cpp index 9d7ffe9ac97516..79f0959ab8cf3f 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_query_api.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_query_api.cpp @@ -53,7 +53,8 @@ Parameter GNAPlugin::GetMetric(const std::string& name, const std::mapinsData.size() > 1; } - bool has16BOutput() const noexcept { + // The name of the function may be somewhat misleading + // Explanation: when in low precision mode the listed layers have 8-bit outputs + // and when in 16-bit input mode, they have 16-bit outputs + bool has8BOr16BOutput() const noexcept { IS_VALID(); - static InferenceEngine::details::caseless_set layersWith16BOutputs = {"memory", "input", "split", "slice", "concat", "copy", "const"}; - return layersWith16BOutputs.find(layer->type) != layersWith16BOutputs.end() || + static InferenceEngine::details::caseless_set layersWith8BOr16BOutputs = {"memory", "input", "split", "slice", "concat", "copy", "const"}; + return layersWith8BOr16BOutputs.find(layer->type) != layersWith8BOr16BOutputs.end() || isActivation() || (isCrop() && !isCropAffined()); } @@ -267,6 +270,9 @@ class LayerInfo { } return true; } + bool isNonValuesChangable() const { + return isNonFunctional() || isSplit() || isSlice() || isConcat(); + } bool isPooling() const noexcept { return isOfType("pooling"); } diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index 1c217dbba57c47..03750a079efa7e 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ 
b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -81,9 +81,13 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, auto diagLayer = std::make_shared(LayerParams({diagName, "ScaleShift", Precision::FP32})); IE_ASSERT(diagLayer != nullptr); - // TODO: diagonal size - size_t weightsSize = LayerInfo(prevLayer).has32BOutput() ? weightsSize = nextLayer->outData[0]->getDims().back() : - Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1]; + auto inputLayer = InferenceEngine::CNNNetPrevLayerSkipCertain(nextLayer, 0, [](InferenceEngine::CNNLayerPtr ptr) { + return LayerInfo(ptr).isNonValuesChangable(); + }); + IE_ASSERT(inputLayer != nullptr); + size_t weightsSize = (LayerInfo(prevLayer).has32BOutput() || LayerInfo(inputLayer).isInput()) ? + weightsSize = nextLayer->outData[0]->getDims().back() : + Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1]; std::vector weightsValues(weightsSize, fillValue); IE_ASSERT(diagLayer != nullptr); diagLayer->_weights = make_shared_blob( @@ -126,7 +130,7 @@ static CNNLayerPtr InsertCopyLayer(CNNLayerPtr prevLayer, CNNLayerPtr nextLayer, return copyWithQuant; } -static std::vector getCandidatesForIdentityInsertion(const CNNLayerPtr l) { +static std::vector getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr passmanager) { std::vector prevLayers; // skipping memory inputs and true inputs layers @@ -148,15 +152,24 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer if (eltwise != nullptr) { // eltwise layer has 2 inputs, so depends on situation identity should or should not be inserted - // for sum if we have 4-4 inputs we will handle that by inserting identity activation case (1) - // for sum if we have 4-2 - OK - // for sum if we have 2-2 inputs we need to insert diagonal - - // for mul if we have 2-2 - OK - // for mul if we have 2-4 - inputs we need to insert identity activation to make 2 bytes input - // for mul if we have 4-4 - there 2 options - // option 1 both inputs came from single outdata - we will insert 1 identity to just convert single input into 2 bytes - // option 2 each input came from it's own outdata - we need to insert 2 identities activations to convert both and feed weights and inputs + // for sum with 16-bit input precision + // if we have 4-4 inputs - we will handle that by inserting identity activation case (1) + // if we have 4-2 inputs - OK + // if we have 2-2 inputs - we need to insert diagonal + + // for sum with 8-bit input precision + // if we have 1-1 inputs - OK + // if we have 4-4 inputs - there are 2 options + // option 1 both inputs came from single outdata - we need to insert 1 identity activation to just convert single input into 1 byte + // option 2 each input came from its own outdata - we need to insert 2 identity activations to convert both and feed weights and inputs + + // for mul if we have 2-2 or 1-1 (low precision case) inputs - OK + // for mul if we have 2-4 or 1-4 (low precision case) inputs - we need to insert identity activation to make 2 bytes input + // or 1 byte input (low precision case) + // for mul if we have 4-4 inputs - there are 2 options + // option 1 both inputs came from single outdata - we need to insert 1 identity activation to just convert single input into 2 bytes + // or 1 byte (low precision case) + // option 2 each input came from its own outdata - we need to insert 2 identity activations to convert both and feed weights and inputs auto prev0 = PrevFunctionalLayer(l, 0); auto prev1 = 
PrevFunctionalLayer(l, 1); @@ -164,14 +177,32 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer switch (eltwise->_operation) { case EltwiseLayer::Sub: case EltwiseLayer::Sum: - if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) { - return prevLayers; + if (!passmanager->isLowPrecision()) { + if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) { + return prevLayers; + } + // TODO: whether there are possibility to select after what layer identity gets inserted + prevLayers.push_back(CNNNetPrevLayer(l, 0)); + } else { + if (LayerInfo(prev0).has8BOr16BOutput() && LayerInfo(prev1).has8BOr16BOutput()) { + return prevLayers; + } + + if (LayerInfo(prev0).has32BOutput()) { + prevLayers.push_back(CNNNetPrevLayer(l, 0)); + } + + // if layers of outdata are different + auto prevData0 = l->insData[0].lock(); + auto prevData1 = l->insData[1].lock(); + + if ((prev0 != prev1 || prevData0 != prevData1) && LayerInfo(prev1).has32BOutput()) { + prevLayers.push_back(CNNNetPrevLayer(l, 1)); + } } - // TODO: whether there are possibility to select after what layer identity gets inserted - prevLayers.push_back(CNNNetPrevLayer(l, 0)); break; case EltwiseLayer::Prod: { - if (LayerInfo(prev0).has16BOutput() && LayerInfo(prev1).has16BOutput()) { + if (LayerInfo(prev0).has8BOr16BOutput() && LayerInfo(prev1).has8BOr16BOutput()) { return prevLayers; } @@ -227,6 +258,8 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer } void InsertDiagonalLayerPass::run() { + bool lowPrecision = getPassManager()->isLowPrecision(); + for (auto & l : *pLayers) { if (l->insData.empty()) continue; auto prevLayer = CNNNetPrevLayerSkipCertain(l, 0, [](CNNLayerPtr ptr) { @@ -241,12 +274,16 @@ void InsertDiagonalLayerPass::run() { if (!eltwise) { continue; } - // in case of eltwise sum one of input would be 4 bytes one - 2 - // in case of eltwise mull one of input would be 2 bytes one - 2 + // in case of eltwise sum in 16-bit input precision one of input would be 4 bytes one - 2 + // in case of eltwise mul in 16-bit input precision one of input would be 2 bytes one - 2 + // in case of eltwise sum in low (8-bit) input precision both inputs are 1 byte + // in case of eltwise mul in low (8-bit) input precision both inputs are 1 byte // for e sum if we have 4-4 inputs we will handle that by inserting identity activation // for e sum if we have 4-2 - OK // for e sum if we have 2-2 inputs we need to insert diagonal -- handling here + // for e sum if we have 1-1 inputs in low precision mode - OK // for e mul if we have 2-2 - OK + // for e mul if we have 1-1 in low precision mode - OK // for e mul if we have 2-4 - inputs we need to insert identity to put 4 bytes input into weights // for e mul if we have 4-4 - inputs we need to insert 2 identities to put both 4 bytes input into weights @@ -256,7 +293,10 @@ void InsertDiagonalLayerPass::run() { auto prevLayer1 = CNNNetPrevLayerSkipCertain(l, 1, [](CNNLayerPtr ptr) { return LayerInfo(ptr).isNonFunctional(); }); - if (!LayerInfo(prevLayer).has16BOutput() || !LayerInfo(prevLayer1).has16BOutput()) + if (!LayerInfo(prevLayer).has8BOr16BOutput() || !LayerInfo(prevLayer1).has8BOr16BOutput()) + continue; + + if (lowPrecision && LayerInfo(prevLayer).has8BOr16BOutput() && LayerInfo(prevLayer1).has8BOr16BOutput()) continue; } auto prevDirectLayer = CNNNetPrevLayer(l, 0); @@ -677,16 +717,6 @@ void RemovePermutationsNHWCToNCHWPass::run() { } nhwc_layout_patterns.push_back({prev, next}); - - auto* convolution = 
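The long comment above spells out, per input precision mode, which operand byte-width combinations require an extra identity activation before an eltwise sum. Read as a rule, it can be summarized by a small hypothetical helper (the enum and function below are illustrative only, not part of gna_pass_manager):

    // Byte width of an eltwise operand as produced by the previous layer.
    enum class OperandWidth { Bytes1 = 1, Bytes2 = 2, Bytes4 = 4 };

    // True when an identity activation has to be inserted to narrow a 32-bit operand of a sum.
    // 16-bit input mode: only the 4-4 combination needs it (4-2 is accepted, 2-2 is handled by a diagonal layer).
    // 8-bit input mode: 1-1 is accepted, and every 4-byte operand must be converted down first.
    bool sumNeedsIdentity(OperandWidth a, OperandWidth b, bool lowPrecision) {
        if (!lowPrecision) {
            return a == OperandWidth::Bytes4 && b == OperandWidth::Bytes4;
        }
        return a == OperandWidth::Bytes4 || b == OperandWidth::Bytes4;
    }

Whether one or two identities are inserted then depends on whether both 32-bit operands come from the same outdata, which is what the prev0/prev1 and prevData0/prevData1 comparisons above decide.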
dynamic_cast(l.get()); - if (!convolution) { - THROW_GNA_EXCEPTION << "Invalid type of convolution layer"; - } - if (convolution->_kernel_y != 1) { - THROW_GNA_LAYER_EXCEPTION(l) << "this case is not implemented yet"; - } - auto in_channels = convolution->input()->getDims()[1]; - convolution->_kernel_y = in_channels; } for (const auto& layers : nhwc_layout_patterns) { @@ -746,7 +776,7 @@ void RemovePermutationsNHWCToNCHWPass::run() { void InsertIdentityLayerPass::run() { auto quantized = InferenceEngine::getInjectedData(pLayers->front()); for (auto & l : *pLayers) { - for (auto && prev : getCandidatesForIdentityInsertion(l)) { + for (auto && prev : getCandidatesForIdentityInsertion(l, getPassManager())) { // Do an upstream search until Functional layer is found auto original_prev_layer = prev; auto true_layer = l; @@ -821,7 +851,7 @@ void InsertIdentityLayerPass::run() { for (auto && nextLayer : getInputTo(nextData)) { if (nextLayer.second.get() == l.get()) continue; - if (getCandidatesForIdentityInsertion(nextLayer.second).empty()) { + if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty()) { notAll = true; } } @@ -1087,7 +1117,7 @@ void InsertConcatAligningFilterPass::run() { std::make_shared(LayerParams({filterName, "ConcatAlignFilter", Precision::FP32})); if (dims.size() != 2) { - THROW_GNA_EXCEPTION << "unsupported concat input a of dims.size()=" << dims.size() << ", layer=" << prevLayer->name; + THROW_GNA_EXCEPTION << "unsupported concat input of dims.size()=" << dims.size() << ", layer=" << prevLayer->name; } auto num_rows_in = dims[1]; @@ -2150,7 +2180,10 @@ void TransposeWeightsFromNCHWToNHWCPass::run() { transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second); } } - if (!transpositionInfo.empty()) { + if (foundPartToTranspose(transpositionInfo)) { + if (l->input()->getDims().front() > 1) { + THROW_GNA_EXCEPTION << l->name << " Weights transposition is not supported for a layer with batch size > 1"; + } auto weightable = dynamic_cast(l.get()); IE_ASSERT(weightable != nullptr); ConvertTensorFromNCHWToNHWC(weightable->precision.size(), 1, weightable->_weights->size(), @@ -2175,8 +2208,17 @@ void TransposeWeightsFromNCHWToNHWCPass::run() { auto weightsColumns = InferenceEngine::details::product(std::begin(in_dims) + 1, std::end(in_dims)); // Find a convolution in previous layers to rotate weights rows if (InferenceEngine::CNNNetHasPrevLayer(l.get())) { - auto transpositionInfo = FindTranspositionInfoFromPrevLayers(InferenceEngine::CNNNetPrevLayer(l)); - if (!transpositionInfo.empty()) { + std::vector transpositionInfo; + auto prevLayer = InferenceEngine::CNNNetPrevLayer(l); + transpositionInfo = FindTranspositionInfoFromPrevLayers(prevLayer); + if (foundPartToTranspose(transpositionInfo)) { + if (l->input()->getDims().front() > 1) { + THROW_GNA_EXCEPTION << l->name << " Weights transposition is not supported for a layer with batch size > 1"; + } + if (LayerInfo(prevLayer).isSplit()) { + // If we found a split it's not possible to rotate data + THROW_GNA_EXCEPTION << l->name << " won't be transposed due to a split before it"; + } size_t totalColumns = 0; for (auto && transpositionInfoPart : transpositionInfo) { totalColumns += transpositionInfoPart.num_transpose_rows * transpositionInfoPart.num_transpose_columns; @@ -2193,14 +2235,23 @@ void TransposeWeightsFromNCHWToNHWCPass::run() { } // Find a convolution in next layers to rotate weights columns if (!l->outData.empty() && !getInputTo(l->outData[0]).empty() && 
!l->outData.empty() && !getInputTo(l->outData[0]).empty()) { - auto transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second); - if (!transpositionInfo.empty()) { + std::vector transpositionInfo; + auto nextLayer = getInputTo(l->outData[0]).begin()->second; + transpositionInfo = FindTranspositionInfoFromNextLayers(nextLayer); + if (foundPartToTranspose(transpositionInfo)) { + if (l->outData[0]->getDims().front() > 1) { + THROW_GNA_EXCEPTION << l->name << " Weights transposition is not supported for a layer with batch size > 1"; + } + if (LayerInfo(nextLayer).isConcat()) { + // If we found a concat it's not possible to rotate data + THROW_GNA_EXCEPTION << l->name << " won't be transposed due to a concat after it"; + } size_t totalRows = 0; for (const auto& transpositionInfoPart : transpositionInfo) { totalRows += transpositionInfoPart.num_transpose_rows * transpositionInfoPart.num_transpose_columns; } if (weightsRows != totalRows) { - THROW_GNA_EXCEPTION << l->name << "weights rows from transposition info (" << totalRows + THROW_GNA_EXCEPTION << l->name << " weights rows from transposition info (" << totalRows << ") don't match output dimensions (" << weightsRows << ")"; } ConvertTensorFromNCHWToNHWC(precision, weightsRows, weightsColumns, weightable->_weights->cbuffer().as(), @@ -2227,14 +2278,55 @@ void TransposeWeightsFromNCHWToNHWCPass::run() { if (!foundPartToTranspose(transpositionInfo)) { transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second); } - if (!transpositionInfo.empty()) { + if (foundPartToTranspose(transpositionInfo)) { auto blob = secondInput->blobs["custom"]; ConvertTensorFromNCHWToNHWC(blob->getTensorDesc().getPrecision().size(), 1, blob->size(), blob->buffer().as(), true, transpositionInfo); - gnalog() << l->name << " data transposition info:\n"; + gnalog() << secondInput->name << " data transposition info:\n"; printTranspositionInfo(transpositionInfo); } } + + if (LayerInfo(l).isConcat()) { + auto concatLayer = LayerInfo(l).as(); + IE_ASSERT(concatLayer != nullptr); + // If concatenation is along channel axis constant input transposition isn't required + if (concatLayer->_axis <= 1) continue; + + std::vector constInputs; + bool transpose = false; + int nonConstInputIx = 0; + // Check if non-const inputs are transposed + for (int i = 0; InferenceEngine::CNNNetHasPrevLayer(l.get(), i); ++i) { + auto input = InferenceEngine::CNNNetPrevLayer(l, i); + if (LayerInfo(input).isConst()) { + constInputs.push_back(input); + continue; + } + auto transpositionInfo = FindTranspositionInfoFromPrevLayers(input); + bool transposeInput = foundPartToTranspose(transpositionInfo); + if (nonConstInputIx == 0) { + transpose = transposeInput; + } else if (transposeInput != transpose) { + THROW_GNA_EXCEPTION << "Concat layer " << l->name << " inputs have different layouts"; + } + ++nonConstInputIx; + } + if (!transpose) continue; + + // Transpose all constant inputs + for (auto && input : constInputs) { + auto rows = GetDataDimSize(input->outData[0], DataDimName::C); + auto columns = GetDataDimSize(input->outData[0], DataDimName::H) * GetDataDimSize(input->outData[0], DataDimName::W); + auto blob = input->blobs["custom"]; + // A constant should have the same number of channels since concatenation will be in height/weight dimension + TranspositionInfo concatTranspositionInfo{true, rows, columns}; + ConvertTensorFromNCHWToNHWC(blob->getTensorDesc().getPrecision().size(), 1, blob->size(), + blob->buffer().as(), 
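The weight and constant rewrites in this pass all reduce to the same flat-buffer permutation that ConvertTensorFromNCHWToNHWC performs: channel-major data is rewritten channel-last. A simplified, self-contained sketch of that permutation for a single row of C*H*W elements (illustrative only; the real helper also works per TranspositionInfo fragment, with num_transpose_rows and num_transpose_columns playing roughly the roles of C and H*W):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Reorder a flat buffer of C x H x W elements from NCHW (channel-major) to NHWC (channel-last).
    // elementSize is the size of a single element in bytes.
    void convertNCHWToNHWC(const uint8_t* src, uint8_t* dst,
                           size_t C, size_t H, size_t W, size_t elementSize) {
        const size_t spatial = H * W;
        for (size_t c = 0; c < C; ++c) {
            for (size_t hw = 0; hw < spatial; ++hw) {
                std::memcpy(dst + (hw * C + c) * elementSize,
                            src + (c * spatial + hw) * elementSize,
                            elementSize);
            }
        }
    }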
true, {concatTranspositionInfo}); + gnalog() << input->name << " data transposition info:\n"; + printTranspositionInfo({concatTranspositionInfo}); + } + } } } diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp index 2ee84584e9d648..8f0157ce478b2e 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp @@ -30,6 +30,7 @@ class IPassManager { virtual ~IPassManager() = default; virtual int &getIntVar(std::string name) = 0; virtual const Policy &getPolicy() const = 0; + virtual const bool& isLowPrecision() const = 0; virtual InferenceEngine::CNNNetwork &getNetwork() = 0; }; @@ -221,6 +222,7 @@ struct PassManagerSettings { Policy policy; /// @brief whether to run passes before copy bool runBeforeCopy; + bool lowPrecision; }; @@ -245,6 +247,9 @@ class PassManager : public IPassManager, public std::enable_shared_from_this 0) ? 0.5f : -0.5f; + float value = src + rounding_value; + if (value > 127.0) { + return 127; + } else if (value < -128.0) { + return -128; + } + return (int8_t)value; +} + void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst, const float *ptr_src, const uint32_t num_rows, diff --git a/inference-engine/src/gna_plugin/preprocessing.hpp b/inference-engine/src/gna_plugin/preprocessing.hpp index a09cfde2982ec6..aac61f2887b7a0 100644 --- a/inference-engine/src/gna_plugin/preprocessing.hpp +++ b/inference-engine/src/gna_plugin/preprocessing.hpp @@ -21,4 +21,5 @@ void ConvertToFloat(float *ptr_dst, const float scale_factor); int16_t ConvertFloatToInt16(float src); +int8_t ConvertFloatToInt8(float src); } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/round_float_define.hpp b/inference-engine/src/gna_plugin/round_float_define.hpp index 1bcbb2a4a29d9a..584d14ecc1ac3f 100644 --- a/inference-engine/src/gna_plugin/round_float_define.hpp +++ b/inference-engine/src/gna_plugin/round_float_define.hpp @@ -7,5 +7,6 @@ #include +#define FLOAT_TO_INT8(a) static_cast(((a) < 0)?((a) - 0.5f):((a) + 0.5f)) #define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5f):((a) + 0.5f)) #define FLOAT_TO_INT32(a) static_cast(((a) < 0)?((a)-0.5f):((a)+0.5f)) diff --git a/inference-engine/src/gna_plugin/runtime/gna_float_runtime_op.cpp b/inference-engine/src/gna_plugin/runtime/gna_float_runtime_op.cpp index f91ba6c9f31187..49c1727fd5d642 100644 --- a/inference-engine/src/gna_plugin/runtime/gna_float_runtime_op.cpp +++ b/inference-engine/src/gna_plugin/runtime/gna_float_runtime_op.cpp @@ -79,10 +79,12 @@ void FP::ApplyDiagonalTransform(intel_dnn_component_t *component) { C[i * ldc + j] = bias[i]; } } - for (uint32_t j = 0; j < n; j++) { - float *Bcol = B + j * component->num_rows_in; - float *Ccol = C + j * component->num_rows_out; - cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1); + std::vector Arow(n); + for (uint32_t i = 0; i < m; i++) { + float *Brow = B + i * n; + float *Crow = C + i * ldc; + std::fill(std::begin(Arow), std::end(Arow), A[i]); + cblas_ssbmv1(CblasRowMajor, CblasLower, n, 0, 1.0, Arow.data(), 1, Brow, 1, 1.0, Crow, 1); } } diff --git a/inference-engine/src/gna_plugin/runtime/pwl.cpp b/inference-engine/src/gna_plugin/runtime/pwl.cpp index 8d8528a0b113c0..3cd5238eba658e 100644 --- a/inference-engine/src/gna_plugin/runtime/pwl.cpp +++ b/inference-engine/src/gna_plugin/runtime/pwl.cpp @@ -496,11 +496,12 @@ std::vector pwl_search(const DnnActivation& activation_type, } 
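The ConvertFloatToInt8 helper added above in preprocessing.cpp, together with the FLOAT_TO_INT8 macro, applies the same convention as the existing 16-bit helper: round half away from zero, then saturate to the signed 8-bit range. As a standalone reference (a sketch equivalent, not the plugin's exact source):

    #include <cstdint>

    int8_t convertFloatToInt8(float src) {
        float value = src + ((src > 0) ? 0.5f : -0.5f);  // round half away from zero
        if (value > 127.0f) {
            return 127;                                  // saturate to INT8_MAX
        }
        if (value < -128.0f) {
            return -128;                                 // saturate to INT8_MIN
        }
        return static_cast<int8_t>(value);
    }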
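The PwlDesignOpt16/PwlDesign16 functions renamed below take an activation, its useful input domain and the in/out scale factors, and search for piecewise-linear segments that stay within a target error. The core idea can be illustrated with a toy uniform segmentation (the real pwl_search adapts breakpoints to the error target; all names here are illustrative):

    #include <cmath>
    #include <vector>

    struct PwlSegment {
        float xBase;   // left breakpoint of the segment
        float yBase;   // function value at xBase
        float slope;   // linear slope used until the next breakpoint
    };

    // Approximate f over [xMin, xMax] with n segments anchored at uniform breakpoints.
    std::vector<PwlSegment> uniformPwl(float (*f)(float), float xMin, float xMax, int n) {
        std::vector<PwlSegment> segments;
        const float step = (xMax - xMin) / n;
        for (int i = 0; i < n; ++i) {
            const float x0 = xMin + i * step;
            const float x1 = x0 + step;
            segments.push_back({x0, f(x0), (f(x1) - f(x0)) / step});
        }
        return segments;
    }

    // Evaluate the approximation: use the last segment whose base is not greater than x.
    float evalPwl(const std::vector<PwlSegment>& segments, float x) {
        const PwlSegment* active = &segments.front();
        for (const PwlSegment& s : segments) {
            if (x >= s.xBase) active = &s;
        }
        return active->yBase + active->slope * (x - active->xBase);
    }

For a sigmoid this would be sampled over roughly [-SIGMOID_DOMAIN, SIGMOID_DOMAIN]; in the real code the resulting breakpoints and slopes are additionally quantized with scale_in/scale_out, and the new low_precision flag is presumably what steers make_gna_pwl toward the narrower 8-bit output range.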
-void PwlDesignOpt16(const DnnActivation activation_type, +void PwlDesignOpt(const DnnActivation activation_type, std::vector &ptr_segment, const float scale_in, const float scale_out, - const float pwlMaxErrorPercent) { + const float pwlMaxErrorPercent, + const bool low_precision) { std::vector pwl; double err_pct = 0.0; auto minInputStats = 0.0f; @@ -515,7 +516,7 @@ void PwlDesignOpt16(const DnnActivation activation_type, auto minInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? -absMax : -SIGMOID_DOMAIN; auto maxInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? absMax : SIGMOID_DOMAIN; pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment); break; } case kActTanh: { @@ -523,7 +524,7 @@ void PwlDesignOpt16(const DnnActivation activation_type, auto minInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? -absMax : -TANH_DOMAIN; auto maxInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? absMax : TANH_DOMAIN; pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment); break; } case kActSoftSign: { @@ -531,55 +532,56 @@ void PwlDesignOpt16(const DnnActivation activation_type, auto minInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? -absMax : -SOFTSIGN_DOMAIN; auto maxInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? absMax : SOFTSIGN_DOMAIN; pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment); break; } case kActRelu: - make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment); break; case kActLeakyRelu: - make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment); break; case kActIdentity: case kActFakeQuantize: - make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment); break; case kActKaldiLstmClipping: - make_gna_pwl(activation_type, pwl, activation_type.args.clamp.low, activation_type.args.clamp.high, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, activation_type.args.clamp.low, activation_type.args.clamp.high, + scale_in, scale_out, low_precision, ptr_segment); break; case kActLog: { double x_min = (1 + ~XBASEMASK) / scale_in; double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? 
(INT32_MAX / scale_in) : LOG_DOMAIN; pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment); break; } case kActNegLog: { double x_min = (1 + ~XBASEMASK) / scale_in; double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN; pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment); break; } case kActNegHalfLog: { double x_min = (1 + ~XBASEMASK) / scale_in; double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN; pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment); break; } case kActExp: { double x_min = -log(scale_out); double x_max = x_min + log(INT16_MAX); pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment); break; } case kActSign: - make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment); break; case kActAbs: - make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment); break; case kActPow: { auto fp32eq = [](float p1, float p2) -> bool { @@ -600,7 +602,7 @@ void PwlDesignOpt16(const DnnActivation activation_type, pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, maxError, PWL_DESIGN_SAMPLES, err_pct); } - make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); + make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment); break; } default: @@ -608,11 +610,12 @@ void PwlDesignOpt16(const DnnActivation activation_type, } } -void PwlDesign16(const DnnActivation activation_type, +void PwlDesign(const DnnActivation activation_type, gna_pwl_segment_t *ptr_segment, const uint32_t num_segments, const float scale_in, - const float scale_out) { + const float scale_out, + const bool low_precision) { switch (activation_type) { case kActSigmoid: { @@ -767,12 +770,12 @@ void PwlDesign16(const DnnActivation activation_type, else gnalog() << "=========================== Identity Segments ===========================\n"; if (x_lower_limit < INT32_MIN) { - std::cerr << "Warning: saturation in PwlDesign16! " << x_lower_limit << " < INT32_MIN"<< std::endl; + std::cerr << "Warning: saturation in PwlDesign! 
" << x_lower_limit << " < INT32_MIN"<< std::endl; x_lower_limit = INT32_MIN; y_lower_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MIN) - 0.5); } if (x_upper_limit > INT32_MAX) { - std::cerr << "Warning: saturation in PwlDesign16! " << x_upper_limit << " > INT32_MAX"<< std::endl; + std::cerr << "Warning: saturation in PwlDesign! " << x_upper_limit << " > INT32_MAX"<< std::endl; x_upper_limit = INT32_MAX; y_upper_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MAX) + 0.5); } diff --git a/inference-engine/src/gna_plugin/runtime/pwl.h b/inference-engine/src/gna_plugin/runtime/pwl.h index b4ab2dc30d0a37..cf908f9e7652cb 100644 --- a/inference-engine/src/gna_plugin/runtime/pwl.h +++ b/inference-engine/src/gna_plugin/runtime/pwl.h @@ -95,13 +95,15 @@ void PwlApply32(intel_dnn_component_t *component, const uint32_t num_row_end, const uint32_t num_col_start, const uint32_t num_col_end); -void PwlDesign16(const DnnActivation activation_type, +void PwlDesign(const DnnActivation activation_type, gna_pwl_segment_t *ptr_segment, const uint32_t num_segments, const float scale_in, - const float scale_out); -void PwlDesignOpt16(const DnnActivation activation_type, + const float scale_out, + const bool low_precision); +void PwlDesignOpt(const DnnActivation activation_type, std::vector &ptr_segment, const float scale_in, const float scale_out, - const float pwlMaxErrorPercent); + const float pwlMaxErrorPercent, + const bool low_precision); diff --git a/inference-engine/src/hetero_plugin/hetero_plugin.cpp b/inference-engine/src/hetero_plugin/hetero_plugin.cpp index 1a2250442d197f..e09f03d7fa76bd 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin.cpp +++ b/inference-engine/src/hetero_plugin/hetero_plugin.cpp @@ -58,13 +58,13 @@ InferenceEngine::ExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(const return std::make_shared(network, mergeConfigs(_config, config), this); } -InferenceEngine::ExecutableNetwork Engine::ImportNetworkImpl(std::istream& heteroModel, const Configs& config) { +InferenceEngine::ExecutableNetworkInternal::Ptr Engine::ImportNetworkImpl(std::istream& heteroModel, const Configs& config) { if (GetCore() == nullptr) { IE_THROW() << "Please, work with HETERO device via InferencEngine::Core object"; } - return make_executable_network(std::make_shared(heteroModel, - mergeConfigs(_config, config), this)); + return std::make_shared(heteroModel, + mergeConfigs(_config, config), this); } Engine::Configs Engine::GetSupportedConfig(const Engine::Configs& config, const std::string & deviceName) const { diff --git a/inference-engine/src/hetero_plugin/hetero_plugin.hpp b/inference-engine/src/hetero_plugin/hetero_plugin.hpp index a6c3908d8e2f41..9b4198aac72af0 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin.hpp +++ b/inference-engine/src/hetero_plugin/hetero_plugin.hpp @@ -37,7 +37,7 @@ class Engine : public InferenceEngine::InferencePluginInternal { InferenceEngine::Parameter GetConfig(const std::string& name, const std::map & options) const override; - InferenceEngine::ExecutableNetwork ImportNetworkImpl(std::istream& heteroModel, const Configs& config) override; + InferenceEngine::ExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& heteroModel, const Configs& config) override; DeviceMetaInformationMap GetDevicePlugins(const std::string& targetFallback, const Configs & localConfig) const; diff --git a/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp 
b/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp index 7378fbb8a4fb4b..1e508d4ce5148a 100644 --- a/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp +++ b/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp @@ -4,117 +4,109 @@ #include "cpp/ie_executable_network.hpp" #include "ie_common.h" +#include "cpp_interfaces/interface/ie_iexecutable_network_internal.hpp" +#include "cpp_interfaces/exception2status.hpp" +#include "ie_iexecutable_network.hpp" +#include "cpp_interfaces/base/ie_executable_network_base.hpp" namespace InferenceEngine { -ExecutableNetwork::ExecutableNetwork(IExecutableNetwork::Ptr actual_, details::SharedObjectLoader::Ptr plg) - : actual(actual_), plg(plg) { - // plg can be null, but not the actual - if (actual == nullptr) { - IE_THROW() << "ExecutableNetwork wrapper was not initialized."; +#define CALL_STATEMENT(...) \ + if (_impl == nullptr) IE_THROW() << "ExecutableNetwork was not initialized."; \ + try { \ + __VA_ARGS__; \ + } CATCH_IE_EXCEPTIONS catch (const std::exception& ex) { \ + IE_THROW() << ex.what(); \ + } catch (...) { \ + IE_THROW(Unexpected); \ } + +ExecutableNetwork::ExecutableNetwork(const IExecutableNetworkInternal::Ptr& impl, + const std::shared_ptr& so) + : _impl(impl), _so(so) { + IE_ASSERT(_impl != nullptr); } ExecutableNetwork::~ExecutableNetwork() { - actual = {}; + _impl = {}; } ConstOutputsDataMap ExecutableNetwork::GetOutputsInfo() const { - ConstOutputsDataMap data; - CALL_STATUS_FNC(GetOutputsInfo, data); - return data; + CALL_STATEMENT(return _impl->GetOutputsInfo()); } ConstInputsDataMap ExecutableNetwork::GetInputsInfo() const { - ConstInputsDataMap info; - CALL_STATUS_FNC(GetInputsInfo, info); - return info; + CALL_STATEMENT(return _impl->GetInputsInfo()); } void ExecutableNetwork::reset(IExecutableNetwork::Ptr newActual) { - if (actual == nullptr) { - IE_THROW() << "ExecutableNetwork wrapper was not initialized."; - } - if (newActual == nullptr) { - IE_THROW() << "ExecutableNetwork wrapper used for reset was not initialized."; - } - this->actual.swap(newActual); + if (_impl == nullptr) IE_THROW() << "ExecutableNetwork was not initialized."; + if (newActual == nullptr) IE_THROW() << "ExecutableNetwork wrapper used for reset was not initialized."; + auto newBase = std::dynamic_pointer_cast(newActual); + IE_ASSERT(newBase != nullptr); + auto newImpl = newBase->GetImpl(); + IE_ASSERT(newImpl != nullptr); + this->_impl.swap(newImpl); } InferRequest ExecutableNetwork::CreateInferRequest() { - IInferRequest::Ptr req; - CALL_STATUS_FNC(CreateInferRequest, req); - if (req.get() == nullptr) IE_THROW() << "Internal error: pointer to infer request is null"; - return InferRequest(req, plg); + CALL_STATEMENT(return InferRequest{_impl->CreateInferRequest(), _so}); } InferRequest::Ptr ExecutableNetwork::CreateInferRequestPtr() { - IInferRequest::Ptr req; - CALL_STATUS_FNC(CreateInferRequest, req); - return std::make_shared(req, plg); + CALL_STATEMENT(return std::make_shared(_impl->CreateInferRequest(), _so)); } void ExecutableNetwork::Export(const std::string& modelFileName) { - CALL_STATUS_FNC(Export, modelFileName); + CALL_STATEMENT(return _impl->Export(modelFileName)); } void ExecutableNetwork::Export(std::ostream& networkModel) { - CALL_STATUS_FNC(Export, networkModel); + CALL_STATEMENT(return _impl->Export(networkModel)); } -ExecutableNetwork::operator IExecutableNetwork::Ptr&() { - return actual; +ExecutableNetwork::operator IExecutableNetwork::Ptr() { + return std::make_shared(_impl); } 
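The rewritten wrapper methods above all follow one shape: verify that _impl exists, forward the call, and keep _so (the plugin's shared-object handle) alive for as long as the wrapper and anything it created can still run plugin code. Reduced to a sketch with placeholder types (not the actual Inference Engine headers):

    #include <memory>
    #include <stdexcept>
    #include <utility>

    struct Impl { int metric() const { return 42; } };
    struct SharedLibrary {};  // stands in for details::SharedObjectLoader

    class Wrapper {
        std::shared_ptr<Impl> _impl;
        std::shared_ptr<SharedLibrary> _so;  // never dereferenced, only pins the plugin library
    public:
        Wrapper(std::shared_ptr<Impl> impl, std::shared_ptr<SharedLibrary> so)
            : _impl(std::move(impl)), _so(std::move(so)) {
            if (!_impl) throw std::runtime_error("Wrapper was not initialized.");
        }
        int metric() const {
            if (!_impl) throw std::runtime_error("Wrapper was not initialized.");
            return _impl->metric();  // forward to the implementation object
        }
        explicit operator bool() const noexcept { return static_cast<bool>(_impl); }
    };

Objects handed out by the wrapper receive the same _so pointer, which is why CreateInferRequest and QueryState above pass _so along to InferRequest and VariableState.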
CNNNetwork ExecutableNetwork::GetExecGraphInfo() { IE_SUPPRESS_DEPRECATED_START - ICNNNetwork::Ptr ptr = nullptr; - CALL_STATUS_FNC(GetExecGraphInfo, ptr); - return CNNNetwork(ptr); - IE_SUPPRESS_DEPRECATED_END + CALL_STATEMENT(return _impl->GetExecGraphInfo()); } - +IE_SUPPRESS_DEPRECATED_START std::vector ExecutableNetwork::QueryState() { - if (actual == nullptr) IE_THROW() << "ExecutableNetwork was not initialized."; - IVariableState::Ptr pState = nullptr; - auto res = OK; std::vector controller; - for (size_t idx = 0; res == OK; ++idx) { - ResponseDesc resp; - IE_SUPPRESS_DEPRECATED_START - res = actual->QueryState(pState, idx, &resp); - IE_SUPPRESS_DEPRECATED_END - if (res != OK && res != OUT_OF_BOUNDS) { - IE_THROW() << resp.msg; - } - if (res != OUT_OF_BOUNDS) { - controller.push_back(VariableState(pState, plg)); - } - } - + CALL_STATEMENT( + for (auto&& state : _impl->QueryState()) { + controller.emplace_back(std::make_shared(state), _so); + }); return controller; } +IE_SUPPRESS_DEPRECATED_END void ExecutableNetwork::SetConfig(const std::map& config) { - CALL_STATUS_FNC(SetConfig, config); + CALL_STATEMENT(_impl->SetConfig(config)); } Parameter ExecutableNetwork::GetConfig(const std::string& name) const { - Parameter configValue; - CALL_STATUS_FNC(GetConfig, name, configValue); - return configValue; + CALL_STATEMENT(return _impl->GetConfig(name)); } Parameter ExecutableNetwork::GetMetric(const std::string& name) const { - Parameter metricValue; - CALL_STATUS_FNC(GetMetric, name, metricValue); - return metricValue; + CALL_STATEMENT(return _impl->GetMetric(name)); } RemoteContext::Ptr ExecutableNetwork::GetContext() const { - RemoteContext::Ptr pContext; - CALL_STATUS_FNC(GetContext, pContext); - return pContext; + CALL_STATEMENT(return _impl->GetContext()); } + +bool ExecutableNetwork::operator!() const noexcept { + return !_impl; +} + +ExecutableNetwork::operator bool() const noexcept { + return !!_impl; +} + } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_cache_guard.cpp b/inference-engine/src/inference_engine/ie_cache_guard.cpp new file mode 100644 index 00000000000000..fa776d130384fa --- /dev/null +++ b/inference-engine/src/inference_engine/ie_cache_guard.cpp @@ -0,0 +1,64 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ie_cache_guard.hpp" +#include "ie_common.h" + +namespace InferenceEngine { + +CacheGuardEntry::CacheGuardEntry(CacheGuard& cacheGuard, const std::string& hash, + std::shared_ptr m, std::atomic_int& refCount): + m_cacheGuard(cacheGuard), m_hash(hash), m_mutex(m), m_refCount(refCount) { + // Don't lock mutex right here for exception-safe considerations + m_refCount++; +} + +CacheGuardEntry::~CacheGuardEntry() { + m_refCount--; + m_mutex->unlock(); + m_cacheGuard.checkForRemove(m_hash); +} + +void CacheGuardEntry::performLock() { + m_mutex->lock(); +} + +////////////////////////////////////////////////////// + +CacheGuard::~CacheGuard() { + IE_ASSERT(m_table.size() == 0); +} + +std::unique_ptr CacheGuard::getHashLock(const std::string& hash) { + std::unique_lock lock(m_tableMutex); + auto& data = m_table[hash]; + std::unique_ptr res; + try { + // TODO: use std::make_unique when migrated to C++14 + res = std::unique_ptr( + new CacheGuardEntry(*this, hash, data.m_mutexPtr, data.m_itemRefCounter)); + } catch (...) 
{ + // In case of exception, we shall remove hash entry if it is not used + if (data.m_itemRefCounter == 0) { + m_table.erase(hash); + } + throw; + } + lock.unlock(); // can unlock table lock here, as refCounter is positive and nobody can remove entry + res->performLock(); // in case of exception, 'res' will be destroyed and item will be cleaned up from table + return res; +} + +void CacheGuard::checkForRemove(const std::string& hash) { + std::lock_guard lock(m_tableMutex); + if (m_table.count(hash)) { + auto &data = m_table[hash]; + if (data.m_itemRefCounter == 0) { + // Nobody is using this and nobody is waiting for it - can be removed + m_table.erase(hash); + } + } +} + +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_cache_guard.hpp b/inference-engine/src/inference_engine/ie_cache_guard.hpp new file mode 100644 index 00000000000000..1fe1954d47978f --- /dev/null +++ b/inference-engine/src/inference_engine/ie_cache_guard.hpp @@ -0,0 +1,122 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +/** + * @brief This is a header file for the Inference Engine Cache Guard class C++ API + * + * @file ie_cache_guard.hpp + */ + +#include +#include +#include +#include +#include +#include + +namespace InferenceEngine { + +class CacheGuard; +/** + * @brief This class represents RAII guard class to protect multiple threads to modify the same cached network + * Use CacheGuard::getHashLock(hash) to acquire lock for specific cache entry identified by its 'hash' + * On destruction, lock will be released + * @see CacheGuard + */ +class CacheGuardEntry { +public: + /** + * @brief Internal constructor, will be called by @CacheGuard + * + * @param cacheGuard Reference link to parent's Cache Guard + * @param hash String representing hash of network + * @param m Shared pointer to mutex for internal locking + * @param refCount Reference counter. 
Will be decremented on CacheGuardEntry destruction + */ + CacheGuardEntry(CacheGuard& cacheGuard, const std::string& hash, + std::shared_ptr m, std::atomic_int& refCount); + CacheGuardEntry(const CacheGuardEntry&) = delete; + + /** + * @brief Destructor, will perform the following cleanup + * + * Decrement reference counter + * Unlock associated mutex + * Call CacheGuard::checkForRemove to check if appropriate table hash entry is not used anymore and can be deleted + */ + ~CacheGuardEntry(); + + /** + * @brief Performs real lock of associated mutex + * It is separated from construction due to exception safety considerations + * + * @note Will be called only by CacheGuard, it shall not be called from client's code + */ + void performLock(); + +private: + CacheGuard& m_cacheGuard; + std::string m_hash; + std::shared_ptr m_mutex; + std::atomic_int& m_refCount; +}; + +/** + * @brief This class holds a table of currently locked hashes + * Inference engine core will need to obtain a lock for a specific cache to get exclusive access to it + * It is needed to avoid race situations when multiple threads try to to write to the same cache simultaneously + * + * Usage example: + * auto hash = ; + * { + * auto lock = m_cacheGuard.getHashLock(hash); + * + * } + */ +class CacheGuard { +public: + CacheGuard() = default; + ~CacheGuard(); + + /** + * @brief Gets a lock for a specific cache entry identified by it's hash value + * Once returned, client has an exclusive access to cache entry for read/write/delete + * If any other thread holds a lock to same hash - this function will not return until it is unlocked + * + * @param hash String representing hash of network + * + * @return RAII pointer to CacheGuardEntry + */ + std::unique_ptr getHashLock(const std::string& hash); + + /** + * @brief Checks whether there is any clients holding the lock after CacheGuardEntry deletion + * It will be called on destruction of CacheGuardEntry and shall not be used directly by client's code + * If there is no more clients holding the lock, associated entry will be removed from table unlocked + * + * @param hash String representing hash of network + * + * @return RAII pointer to CacheGuardEntry + */ + void checkForRemove(const std::string& hash); + +private: + struct Item { + std::shared_ptr m_mutexPtr { std::make_shared() }; + // Reference counter for item usage + std::atomic_int m_itemRefCounter {0}; + + Item() = default; + Item(const Item& other): m_mutexPtr(other.m_mutexPtr), + m_itemRefCounter(other.m_itemRefCounter.load()) {} + Item(Item&& other): m_mutexPtr(std::move(other.m_mutexPtr)), + m_itemRefCounter(other.m_itemRefCounter.load()) {} + }; + std::mutex m_tableMutex; + std::unordered_map m_table; +}; + +} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/ie_core.cpp b/inference-engine/src/inference_engine/ie_core.cpp index 35cb82a3ddb15d..2122fda276e0bd 100644 --- a/inference-engine/src/inference_engine/ie_core.cpp +++ b/inference-engine/src/inference_engine/ie_core.cpp @@ -21,6 +21,7 @@ #include "ie_plugin_cpp.hpp" #include "ie_plugin_config.hpp" #include "ie_cache_manager.hpp" +#include "ie_cache_guard.hpp" #include "ie_itt.hpp" #include "file_utils.h" #include "ie_network_reader.hpp" @@ -197,6 +198,8 @@ class Core::Impl : public ICore { // Core settings (cache config, etc) CoreConfig coreConfig; + CacheGuard cacheGuard; + struct PluginDescriptor { FileUtils::FilePath libraryLocation; std::map defaultConfig; @@ -447,17 +450,18 @@ class Core::Impl : public ICore { } auto parsed = 
parseDeviceNameIntoConfig(context->getDeviceName(), config); auto plugin = GetCPPPluginByName(parsed._deviceName); - bool loadedFromCache = false; ExecutableNetwork res; - std::string hash; auto cacheManager = coreConfig.getCacheConfig()._cacheManager; if (cacheManager && DeviceSupportsImportExport(plugin)) { - hash = CalculateNetworkHash(network, parsed._deviceName, plugin, parsed._config); + auto hash = CalculateNetworkHash(network, parsed._deviceName, plugin, parsed._config); + bool loadedFromCache = false; + auto lock = cacheGuard.getHashLock(hash); res = LoadNetworkFromCache(cacheManager, hash, plugin, parsed._config, context, loadedFromCache); - } - - if (!loadedFromCache) { - res = LoadNetworkImpl(network, plugin, parsed._config, context, hash); + if (!loadedFromCache) { + res = LoadNetworkImpl(network, plugin, parsed._config, context, hash); + } + } else { + res = LoadNetworkImpl(network, plugin, parsed._config, context, {}); } return res; } @@ -472,17 +476,18 @@ class Core::Impl : public ICore { parsed._config.erase(CONFIG_KEY_INTERNAL(FORCE_DISABLE_CACHE)); } auto plugin = GetCPPPluginByName(parsed._deviceName); - bool loadedFromCache = false; ExecutableNetwork res; - std::string hash; auto cacheManager = coreConfig.getCacheConfig()._cacheManager; if (!forceDisableCache && cacheManager && DeviceSupportsImportExport(plugin)) { - hash = CalculateNetworkHash(network, parsed._deviceName, plugin, parsed._config); + auto hash = CalculateNetworkHash(network, parsed._deviceName, plugin, parsed._config); + bool loadedFromCache = false; + auto lock = cacheGuard.getHashLock(hash); res = LoadNetworkFromCache(cacheManager, hash, plugin, parsed._config, nullptr, loadedFromCache); - } - - if (!loadedFromCache) { - res = LoadNetworkImpl(network, plugin, parsed._config, nullptr, hash, {}, forceDisableCache); + if (!loadedFromCache) { + res = LoadNetworkImpl(network, plugin, parsed._config, nullptr, hash, {}, forceDisableCache); + } + } else { + res = LoadNetworkImpl(network, plugin, parsed._config, nullptr, {}, {}, forceDisableCache); } return res; } @@ -493,19 +498,21 @@ class Core::Impl : public ICore { OV_ITT_SCOPED_TASK(itt::domains::IE_LT, "Core::LoadNetwork::Path"); auto parsed = parseDeviceNameIntoConfig(deviceName, config); auto plugin = GetCPPPluginByName(parsed._deviceName); - bool loadedFromCache = false; ExecutableNetwork res; - std::string hash; auto cacheManager = coreConfig.getCacheConfig()._cacheManager; if (cacheManager && DeviceSupportsImportExport(plugin)) { - hash = CalculateFileHash(modelPath, parsed._deviceName, plugin, parsed._config); + bool loadedFromCache = false; + auto hash = CalculateFileHash(modelPath, parsed._deviceName, plugin, parsed._config); + auto lock = cacheGuard.getHashLock(hash); res = LoadNetworkFromCache(cacheManager, hash, plugin, parsed._config, nullptr, loadedFromCache, modelPath); - } - - if (!loadedFromCache) { + if (!loadedFromCache) { + auto cnnNetwork = ReadNetwork(modelPath, std::string()); + res = LoadNetworkImpl(cnnNetwork, plugin, parsed._config, nullptr, hash, modelPath); + } + } else { auto cnnNetwork = ReadNetwork(modelPath, std::string()); - res = LoadNetworkImpl(cnnNetwork, plugin, parsed._config, nullptr, hash, modelPath); + res = LoadNetworkImpl(cnnNetwork, plugin, parsed._config, nullptr, {}, modelPath); } return res; } diff --git a/inference-engine/src/inference_engine/ie_plugin_cpp.hpp b/inference-engine/src/inference_engine/ie_plugin_cpp.hpp index a78ec7b43e9f8e..48c0e05cef5901 100644 --- 
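Taken together, the CacheGuard class and the LoadNetwork rewiring above implement one flow: compute a hash only when caching applies, take the per-hash lock, try to import the cached blob, and otherwise compile (and export) while still holding the lock. A condensed sketch of that flow with stand-in types, using a single global mutex instead of the per-hash table:

    #include <mutex>
    #include <string>
    #include <unordered_map>

    // Minimal stand-ins for the real Core pieces (illustrative only).
    struct Net { bool fromCache = false; };
    static std::mutex g_cacheMutex;                          // the real code keeps one mutex per hash
    static std::unordered_map<std::string, Net> g_cache;

    static std::string computeHash() { return "model+device+config"; }
    static Net compile() { return Net{}; }

    Net loadWithCache(bool cacheEnabled) {
        if (!cacheEnabled) {
            return compile();                                // no hash computed, no lock taken
        }
        const std::string hash = computeHash();
        std::lock_guard<std::mutex> lock(g_cacheMutex);      // exclusive access while reading or filling the entry
        auto it = g_cache.find(hash);
        if (it != g_cache.end()) {                           // hit: reuse the cached network
            Net net = it->second;
            net.fromCache = true;
            return net;
        }
        Net net = compile();                                 // miss: compile under the lock...
        g_cache.emplace(hash, net);                          // ...and publish it before the lock is released
        return net;
    }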
a/inference-engine/src/inference_engine/ie_plugin_cpp.hpp +++ b/inference-engine/src/inference_engine/ie_plugin_cpp.hpp @@ -24,23 +24,6 @@ # pragma GCC diagnostic ignored "-Wreturn-type" #endif -#define CATCH_IE_EXCEPTION(ExceptionType) catch (const InferenceEngine::ExceptionType& e) {throw e;} - -#define CATCH_IE_EXCEPTIONS \ - CATCH_IE_EXCEPTION(GeneralError) \ - CATCH_IE_EXCEPTION(NotImplemented) \ - CATCH_IE_EXCEPTION(NetworkNotLoaded) \ - CATCH_IE_EXCEPTION(ParameterMismatch) \ - CATCH_IE_EXCEPTION(NotFound) \ - CATCH_IE_EXCEPTION(OutOfBounds) \ - CATCH_IE_EXCEPTION(Unexpected) \ - CATCH_IE_EXCEPTION(RequestBusy) \ - CATCH_IE_EXCEPTION(ResultNotReady) \ - CATCH_IE_EXCEPTION(NotAllocated) \ - CATCH_IE_EXCEPTION(InferNotStarted) \ - CATCH_IE_EXCEPTION(NetworkNotRead) \ - CATCH_IE_EXCEPTION(InferCancelled) - #define CALL_STATEMENT(...) \ if (!actual) IE_THROW() << "Wrapper used in the CALL_STATEMENT was not initialized."; \ try { \ diff --git a/inference-engine/src/inference_engine/os/lin/lin_shared_object_loader.cpp b/inference-engine/src/inference_engine/os/lin/lin_shared_object_loader.cpp index 4c3251a6f2e479..8143e0634e1b04 100644 --- a/inference-engine/src/inference_engine/os/lin/lin_shared_object_loader.cpp +++ b/inference-engine/src/inference_engine/os/lin/lin_shared_object_loader.cpp @@ -2,7 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 // + #include <dlfcn.h> +#include <iostream> #include "details/ie_so_loader.h" #include "file_utils.h" @@ -27,9 +29,9 @@ class SharedObjectLoader::Impl { } #endif // ENABLE_UNICODE_PATH_SUPPORT - ~Impl() noexcept(false) { + ~Impl() { if (0 != dlclose(shared_object)) { - IE_THROW() << "dlclose failed: " << dlerror(); + std::cerr << "dlclose failed: " << dlerror() << std::endl; } } @@ -60,7 +62,7 @@ SharedObjectLoader::SharedObjectLoader(const char * pluginName) { _impl.reset(new Impl(pluginName)); } -SharedObjectLoader::~SharedObjectLoader() noexcept(false) {} +SharedObjectLoader::~SharedObjectLoader() {} void* SharedObjectLoader::get_symbol(const char* symbolName) const { return _impl->get_symbol(symbolName); diff --git a/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp b/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp index 8fd09a06db0785..43f72b4a6fa0a3 100644 --- a/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp +++ b/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp @@ -254,8 +254,7 @@ class SharedObjectLoader::Impl { } }; -SharedObjectLoader::~SharedObjectLoader() noexcept(false) { -} +SharedObjectLoader::~SharedObjectLoader() {} SharedObjectLoader::SharedObjectLoader(const char * pluginName) { _impl = std::make_shared<Impl>(pluginName); diff --git a/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp b/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp index c40c76f69c320f..0b66531044a62b 100644 --- a/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp +++ b/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp @@ -1582,7 +1582,7 @@ InferenceEngine::details::CNNLayerCreator::CNNLayerCreator(const std::shared_ptr auto res = std::make_shared(attrs); res->params = params; res->params["no_trans"] = node->get_input_size() == 2 ? 
"1" : "0"; - // temporary workaround due to incorrect usage of group_size in the nGraph operation for the DeformablePSROIPooling + // v1::DeformablePRSOIPooling treats group_size attribute as pooled sizes res->params["pooled_height"] = params.at("group_size"); res->params["pooled_width"] = params.at("group_size"); return res; diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/common/ie_lpt_exception.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/common/ie_lpt_exception.hpp index aef713a02ccb22..1c4cd359f5114e 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/common/ie_lpt_exception.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/common/ie_lpt_exception.hpp @@ -23,6 +23,10 @@ class TRANSFORMATIONS_API Exception : std::exception { std::shared_ptr buffer; mutable std::string buffer_str; public: + Exception() { + buffer = std::make_shared(); + } + template Exception& operator<< (const T& x) { *buffer << x; diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/concat.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/concat.hpp index 8ed8dfde55c013..e381fd5d0a0401 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/concat.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/concat.hpp @@ -35,6 +35,7 @@ class TRANSFORMATIONS_API ConcatTransformation : public LayerTransformation { ngraph::pass::low_precision::Subgraph& subgraph, std::function layer, + std::shared_ptr child, const std::string originalLayerName, std::vector& dequantizationsToConcatenate)> getLayerDequantizationCallback) const; @@ -42,6 +43,15 @@ class TRANSFORMATIONS_API ConcatTransformation : public LayerTransformation { const TransformationContext& context, const std::vector>& quantizationOperations); + void fillDequantizationNodes( + const std::vector& layerDequantizations, + const std::shared_ptr layer, + NodeVector& convertNodes, + NodeVector& subtractNodes, + NodeVector& multiplyNodes) const; + + std::shared_ptr concatenateDeqNodes(NodeVector& nodes) const; + private: size_t getMinQuantizationLevels( const DataPrecision& dataPrecision, diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/concat_multi_channels.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/concat_multi_channels.hpp index 06515d0d72e6d5..48c0a0ef9eaa5f 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/concat_multi_channels.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/concat_multi_channels.hpp @@ -27,12 +27,9 @@ class TRANSFORMATIONS_API ConcatMultiChannelsTransformation : public ConcatTrans bool isPrecisionPreserved(std::shared_ptr layer) const noexcept override; private: + // Go through the parent elements of the layer and fill dequantization collection + // with Dq operations that should be inserted before the layer. 
void fillDequantization( - std::shared_ptr layer, - std::unordered_map& dequantizationByFakeQuantize, - std::vector& dequantizationsToConcatenate) const; - - void fillQuantization( const std::shared_ptr layer, const std::unordered_map& dequantizationByFakeQuantize, std::vector& dequantization) const; @@ -46,8 +43,6 @@ class TRANSFORMATIONS_API ConcatMultiChannelsTransformation : public ConcatTrans const FakeQuantizeDequantization& dequantization, const size_t sourceOutputIdx); - static FakeQuantizeDequantization broadcastDequantiationConstant(const FakeQuantizeDequantization& deq); - bool isMultiChannel(const std::vector>& concatLayers) const noexcept; }; diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp index f113f749c687fe..f9665f9a886230 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp @@ -50,6 +50,12 @@ class TRANSFORMATIONS_API NetworkHelper { template static std::shared_ptr setOutDataPrecision(std::shared_ptr operation, const element::Type& precision); + // applies constant folding of operation to constant and returns the specified output + static std::shared_ptr foldDequantizationConstant( + const std::shared_ptr& foldingConstant, + const std::shared_ptr& operation, + const size_t outIdx = 0); + static size_t getOutputChannelsCount(std::shared_ptr layer, bool isOnWeights = false); static std::vector> getParentsRecursivelyExceptTypes( diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/split.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/split.hpp index c7a41cd25c7b37..5a9fbc48ce7916 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/split.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/split.hpp @@ -24,15 +24,6 @@ class TRANSFORMATIONS_API SplitTransformation : public LayerTransformation { TransformationContext& context, std::vector> lastNodes, std::shared_ptr originalNode) const; -protected: - ngraph::Shape getConstSplitShape( - const std::vector& constSplitLengths, - const ngraph::Shape& constShape, const size_t axis, - const size_t idx) const; - virtual std::vector getConstSplitLengths( - const OutputVector& inputs, - const ngraph::Shape& constShape, - const size_t outputSize) const; }; } // namespace low_precision } // namespace pass diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/variadic_split.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/variadic_split.hpp index c9fdf76998af41..e7cab0c527c10e 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/variadic_split.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/variadic_split.hpp @@ -17,11 +17,6 @@ class TRANSFORMATIONS_API VariadicSplitTransformation : public SplitTransformati public: VariadicSplitTransformation(const Params& params); void registerMatcherIn(GraphRewrite& pass, TransformationContext& context) const override; -protected: - std::vector getConstSplitLengths( - const OutputVector& inputs, - const ngraph::Shape& constShape, - const size_t outputSize) const override; }; } // namespace low_precision } // namespace pass diff --git 
a/inference-engine/src/low_precision_transformations/src/add.cpp b/inference-engine/src/low_precision_transformations/src/add.cpp index c2d0dc50a0e170..85aef194893107 100644 --- a/inference-engine/src/low_precision_transformations/src/add.cpp +++ b/inference-engine/src/low_precision_transformations/src/add.cpp @@ -47,13 +47,13 @@ std::shared_ptr replaceToSubtract(const std::shared_ptr& return nullptr; } - auto constant = fold(add->get_input_node_shared_ptr(constBranchIndex)); + auto constant = fold(add->input_value(constBranchIndex)); auto constOutput = constant->output(0); const auto subtract = std::make_shared>( std::vector{element::f32, element::f32}, std::vector{ op->get_output_element_type(0) }, - ngraph::op::TemporaryReplaceOutputType(add->get_input_node_shared_ptr(dataBranchIndex), element::f32).get(), + ngraph::op::TemporaryReplaceOutputType(add->input_value(dataBranchIndex), element::f32).get(), ngraph::op::TemporaryReplaceOutputType(constOutput, element::f32).get(), add->get_autob()); @@ -73,13 +73,13 @@ std::shared_ptr fuseWithSubtract(const std::shared_ptr& } const auto newSubConst = fold( - add->get_input_node_shared_ptr(0)->get_input_node_shared_ptr(1), - add->get_input_node_shared_ptr(1)); + add->get_input_node_shared_ptr(0)->input_value(1), + add->input_value(1)); const auto newSubtract = std::make_shared>( std::vector{element::f32, element::f32}, std::vector{ op->get_output_element_type(0) }, - ngraph::op::TemporaryReplaceOutputType(add->get_input_node_shared_ptr(0)->get_input_node_shared_ptr(0), element::f32).get(), + ngraph::op::TemporaryReplaceOutputType(add->get_input_node_shared_ptr(0)->input_value(0), element::f32).get(), ngraph::op::TemporaryReplaceOutputType(newSubConst, element::f32).get()); NetworkHelper::copyInfo(add, newSubtract); @@ -178,7 +178,7 @@ bool AddTransformation::transform(TransformationContext& context, ngraph::patter } // graph update - std::vector> inputs{ {}, {} }; + OutputVector inputs{ {}, {} }; auto fullPathInput = dequantizationFullPath.convert == nullptr ? 
dequantizationFullPath.data : dequantizationFullPath.convert; inputs[emptyPathIndex] = dequantizationEmptyPath.data; diff --git a/inference-engine/src/low_precision_transformations/src/concat.cpp b/inference-engine/src/low_precision_transformations/src/concat.cpp index 8a57bafb7d4e2a..02de081ec03009 100644 --- a/inference-engine/src/low_precision_transformations/src/concat.cpp +++ b/inference-engine/src/low_precision_transformations/src/concat.cpp @@ -46,6 +46,10 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat // precisions can be different ngraph::Node& quantizationLayer = *subgraph.quantizationLayers[0]; std::shared_ptr fq = ngraph::as_type_ptr(quantizationLayer.shared_from_this()); + if (!NetworkHelper::isQuantizeSupported(fq)) { + return false; + } + DataPrecision dataPrecision = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false); if (dataPrecision.precision == ngraph::element::undefined) { return false; @@ -197,6 +201,7 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat auto dequantizationValuesCallback = [&]( std::shared_ptr layer, + std::shared_ptr child, const std::string originalLayerName, std::vector& dequantizationsToConcatenate) { dequantizationsToConcatenate.push_back(dequantization); @@ -230,15 +235,97 @@ bool ConcatTransformation::isPrecisionPreserved(std::shared_ptr) const noe bool ConcatTransformation::canBeTransformed(const TransformationContext& context, std::shared_ptr layer) const { std::shared_ptr concat = as_type_ptr(layer); - return concat && concat->get_axis() == 1ul; + if (concat == nullptr) { + return false; + } + + const auto axis = concat->get_axis(); + const size_t normalizedAxis = ngraph::normalize_axis(concat->get_friendly_name(), axis, concat->get_output_partial_shape(0).rank()); + return normalizedAxis == 1ul; } +void ConcatTransformation::fillDequantizationNodes( + const std::vector& layerDequantizations, + const std::shared_ptr layer, + NodeVector& convertNodes, + NodeVector& subtractNodes, + NodeVector& multiplyNodes) const { + if (layerDequantizations.size() > 1ul) { + auto broadcastElementWiseConst = []( + // FakeQuantize constant shape must be broadcastable to the shape on data. + std::shared_ptr operation, + const ngraph::Shape targetShape) -> std::shared_ptr { + auto targetShapeConst = std::make_shared( + element::i64, ngraph::Shape{ targetShape.size() }, + targetShape); + + auto broadcast = ngraph::pass::low_precision::fold( + operation, + targetShapeConst, + ngraph::op::AutoBroadcastType::NUMPY); + + return broadcast; + }; + + bool allDequantizationShiftAreZero = true; + bool allDequantizationMultiplyAreZero = true; + for (const auto& dequantization : layerDequantizations) { + if (dequantization.subtract != nullptr) { + allDequantizationShiftAreZero = false; + } + if (dequantization.multiply != nullptr) { + allDequantizationMultiplyAreZero = false; + } + } + + for (size_t i = 0; i < layerDequantizations.size(); ++i) { + const auto& dequantization = layerDequantizations[i]; + const ngraph::element::Type precision = deqPrecision; + ngraph::Shape targetShape(layer->get_input_shape(i).size(), 1ul); + targetShape[1] = layer->get_input_shape(i)[1]; + + if (dequantization.convert != nullptr) { + convertNodes.push_back(dequantization.convert); + } + + if (!allDequantizationShiftAreZero) { + subtractNodes.push_back(dequantization.subtract == nullptr ? 
+ std::make_shared(precision, targetShape, std::vector({ 0.f })) : + broadcastElementWiseConst(dequantization.subtractConstant, targetShape)); + } + + if (!allDequantizationMultiplyAreZero) { + multiplyNodes.push_back(dequantization.multiply == nullptr ? + std::make_shared(precision, targetShape, std::vector({ 1.0f })) : + broadcastElementWiseConst(dequantization.multiplyConstant, targetShape)); + } + } + } else { + // TODO: check constant shapes here - has to be scalar + if (layerDequantizations[0].convert != nullptr) { + convertNodes.push_back(layerDequantizations[0].convert); + } + + if (layerDequantizations[0].subtract != nullptr) { + subtractNodes.push_back(layerDequantizations[0].subtract->input_value(1).get_node_shared_ptr()); + } + + if (layerDequantizations[0].multiply != nullptr) { + multiplyNodes.push_back(layerDequantizations[0].multiply->input_value(1).get_node_shared_ptr()); + } + } +} + +std::shared_ptr ConcatTransformation::concatenateDeqNodes(NodeVector& nodes) const { + return nodes.size() == 1ul ? nodes[0] : fold(nodes, 1); +} void ConcatTransformation::addDequantizationLayers( TransformationContext& context, ngraph::pass::low_precision::Subgraph& subgraph, std::function layer, + std::shared_ptr child, const std::string originalLayerName, std::vector& dequantizationsToConcatenate)> getLayerDequantizationCallback) const { std::unordered_map outputs; @@ -265,95 +352,28 @@ void ConcatTransformation::addDequantizationLayers( ngraph::Node& child = *childInput.get_node(); if (subgraph.layers.find(child.get_friendly_name()) == subgraph.layers.end()) { + std::shared_ptr source = layer; + const std::shared_ptr destination = child.shared_from_this(); + if (layerDequantizations.size() == 0ul) { // fill layerDequantizations collection - getLayerDequantizationCallback(layer, layer->get_friendly_name(), layerDequantizations); + getLayerDequantizationCallback(source, destination, source->get_friendly_name(), layerDequantizations); } - std::shared_ptr source = layer->shared_from_this(); { - std::vector> convertNodes; - std::vector> subtractNodes; - std::vector> multiplyNodes; + NodeVector convertNodes; + NodeVector subtractNodes; + NodeVector multiplyNodes; // forming nodes for concatenation - if (layerDequantizations.size() > 1ul) { - auto broadcastElementWiseConst = []( - // FakeQuantize constant shape must be broadcastable to the shape on data. 
- std::shared_ptr operation, - const ngraph::Shape targetShape) -> std::shared_ptr { - auto targetShapeConst = std::make_shared( - element::i64, ngraph::Shape{ targetShape.size() }, - targetShape); - - auto broadcast = ngraph::pass::low_precision::fold( - operation, - targetShapeConst, - ngraph::op::AutoBroadcastType::NUMPY); - - return broadcast; - }; - - bool allDequantizationShiftAreZero = true; - bool allDequantizationMultiplyAreZero = true; - for (FakeQuantizeDequantization dequantization : layerDequantizations) { - if (dequantization.subtract != nullptr) { - allDequantizationShiftAreZero = false; - } - if (dequantization.multiply != nullptr) { - allDequantizationMultiplyAreZero = false; - } - } - - for (size_t i = 0; i < layerDequantizations.size(); ++i) { - const auto& dequantization = layerDequantizations[i]; - - if (dequantization.convert != nullptr) { - convertNodes.push_back(dequantization.convert); - } - - const ngraph::element::Type precision = deqPrecision; - ngraph::Shape targetShape(layer->get_input_shape(i).size(), 1ul); - targetShape[1] = layer->get_input_shape(i)[1]; - - if (!allDequantizationShiftAreZero) { - subtractNodes.push_back(dequantization.subtract == nullptr ? - std::make_shared(precision, targetShape, std::vector({ 0.f })) : - broadcastElementWiseConst( - as_type_ptr(dequantization.subtract->input_value(1).get_node_shared_ptr()), - targetShape)); - } - - if (!allDequantizationMultiplyAreZero) { - multiplyNodes.push_back(dequantization.multiply == nullptr ? - std::make_shared(precision, targetShape, std::vector({ 1.0f })) : - broadcastElementWiseConst( - as_type_ptr(dequantization.multiply->input_value(1).get_node_shared_ptr()), - targetShape)); - } - } - } else { - // TODO: check constant shapes here - has to be scalar - if (layerDequantizations[0].convert != nullptr) { - convertNodes.push_back(layerDequantizations[0].convert); - } - - if (layerDequantizations[0].subtract != nullptr) { - subtractNodes.push_back(layerDequantizations[0].subtract->input_value(1).get_node_shared_ptr()); - } - - if (layerDequantizations[0].multiply != nullptr) { - multiplyNodes.push_back(layerDequantizations[0].multiply->input_value(1).get_node_shared_ptr()); - } - } + fillDequantizationNodes(layerDequantizations, layer, convertNodes, subtractNodes, multiplyNodes); // TODO: the second place (first is FQ decomposition) where dequantization operations are inserted - const std::shared_ptr destination = child.shared_from_this(); - if (!convertNodes.empty()) { const size_t sourceOutputIdx = NetworkHelper::getChildInputIndex(source, destination); std::shared_ptr convert = convertNodes[0]->clone_with_new_inputs({ destination->get_input_source_output(sourceOutputIdx) }); + insert_new_node_between(source, destination, convert); ngraph::copy_runtime_info({ layer, convert }, convert); source = convert; @@ -364,9 +384,8 @@ void ConcatTransformation::addDequantizationLayers( const size_t sourceOutputIdx = NetworkHelper::getChildInputIndex(source, destination); std::shared_ptr subtract = std::make_shared( destination->get_input_source_output(sourceOutputIdx), - NetworkHelper::toScalarIfPossible(subtractNodes.size() == 1ul ? 
- subtractNodes[0] : - ngraph::pass::low_precision::fold(subtractNodes, 1))); + NetworkHelper::toScalarIfPossible(concatenateDeqNodes(subtractNodes))); + insert_new_node_between(source, destination, subtract); ngraph::copy_runtime_info({ layer, subtract }, subtract); source = subtract; @@ -377,10 +396,9 @@ void ConcatTransformation::addDequantizationLayers( std::shared_ptr multiply = std::make_shared>( DequantizationMultiply( destination->get_input_source_output(sourceOutputIdx), - NetworkHelper::toScalarIfPossible(multiplyNodes.size() == 1ul ? - multiplyNodes[0] : - ngraph::pass::low_precision::fold(multiplyNodes, 1))), + NetworkHelper::toScalarIfPossible(concatenateDeqNodes(multiplyNodes))), layerDequantizations[0].multiply->get_output_element_type(0)); + insert_new_node_between(source, destination, multiply); ngraph::copy_runtime_info({ layer, multiply }, multiply); source = multiply; diff --git a/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp b/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp index 2877016aa5e19a..62d958d22b4037 100644 --- a/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp +++ b/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp @@ -64,6 +64,10 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context { for (auto quantizationLayer : subgraph.quantizationLayers) { std::shared_ptr fq = ngraph::as_type_ptr(quantizationLayer->shared_from_this()); + if (!NetworkHelper::isQuantizeSupported(fq)) { + return false; + } + const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false); if (dataPrecision.precision == ngraph::element::undefined) { @@ -133,6 +137,7 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context auto dequantizationValuesCallback = [&]( std::shared_ptr layer, + std::shared_ptr child, const std::string originalLayerName, std::vector& dequantizationsToConcatenate) { if (layer->get_friendly_name() != originalLayerName) { @@ -153,6 +158,15 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context layer, dequantizations, dequantizationsToConcatenate); + + if (!is_type(layer)) { + // for intermediate layers we should get Dq operations to be inserted between layer and child + assert(dequantizationsToConcatenate.size() == 1ul); + const size_t sourceOutputIdx = NetworkHelper::getParentOutputIndex(layer, child); + if (layer->get_input_shape(0)[1] != layer->get_output_shape(sourceOutputIdx)[1]) { + dequantizationsToConcatenate[0] = getFoldedDequantization(layer, dequantizationsToConcatenate[0], sourceOutputIdx); + } + } }; addDequantizationLayers(context, subgraph, dequantizationValuesCallback); @@ -181,137 +195,66 @@ bool ConcatMultiChannelsTransformation::isPrecisionPreserved(std::shared_ptr layer, - std::unordered_map& dequantizationByFakeQuantize, - std::vector& dequantizationsToConcatenate) const { - std::shared_ptr currentFakeQuantize = ngraph::as_type_ptr(layer); - if (currentFakeQuantize) { - const auto it = dequantizationByFakeQuantize.find(currentFakeQuantize->get_friendly_name()); + const std::shared_ptr layer, + const std::unordered_map& dequantizationByFakeQuantize, + std::vector& dequantization) const { + const auto fillDqByFakeQuantize = [&](const std::shared_ptr& fq) { + const auto it = dequantizationByFakeQuantize.find(fq->get_friendly_name()); if (it == dequantizationByFakeQuantize.end()) { - 
THROW_IE_LPT_EXCEPTION(*currentFakeQuantize) << "dequantization scale values are not found"; + THROW_IE_LPT_EXCEPTION(*fq) << "dequantization scale values are not found"; } + const FakeQuantizeDequantization& fakeQuantizeDequantization = it->second; - dequantizationsToConcatenate.push_back(broadcastDequantiationConstant(fakeQuantizeDequantization)); - } else { - fillQuantization(layer, dequantizationByFakeQuantize, dequantizationsToConcatenate); - } -} + dequantization.push_back(fakeQuantizeDequantization); + }; -void ConcatMultiChannelsTransformation::fillQuantization( - const std::shared_ptr layer, - const std::unordered_map& dequantizationByFakeQuantize, - std::vector& dequantization) const { - for (size_t i = 0; i < layer->get_input_size(); ++i) { - std::shared_ptr parent = layer->get_input_node_shared_ptr(i); - - std::shared_ptr fakeQuantize = ngraph::as_type_ptr(parent); - if (fakeQuantize) { - const auto it = dequantizationByFakeQuantize.find(fakeQuantize->get_friendly_name()); - if (it == dequantizationByFakeQuantize.end()) { - THROW_IE_LPT_EXCEPTION(*fakeQuantize) << "dequantization scale values are not found"; + if (is_type(layer)) { + fillDqByFakeQuantize(layer); + } else { + for (size_t i = 0; i < layer->get_input_size(); ++i) { + std::shared_ptr parent = layer->get_input_node_shared_ptr(i); + if (as_type_ptr(parent)) { + continue; } - const FakeQuantizeDequantization& fakeQuantizeDequantization = it->second; - dequantization.push_back(broadcastDequantiationConstant(fakeQuantizeDequantization)); - } else { - std::shared_ptr concat = ngraph::as_type_ptr(parent); - if (concat) { - std::vector dequantizationToConcatenate; - fillQuantization(concat, dequantizationByFakeQuantize, dequantizationToConcatenate); - - // add concatenated dequantization operations to dequantization collection - dequantization.push_back(getConcatenatedDequantization(concat, dequantizationToConcatenate)); + const auto fakeQuantize = ngraph::as_type_ptr(parent); + if (fakeQuantize) { + fillDqByFakeQuantize(fakeQuantize); } else { - std::shared_ptr stridedSlice = ngraph::as_type_ptr(parent); - if (stridedSlice) { - std::vector dequantizationToPropagate; - fillQuantization(stridedSlice, dequantizationByFakeQuantize, dequantizationToPropagate); + const auto concat = ngraph::as_type_ptr(parent); + if (concat) { + std::vector dequantizationToConcatenate; + fillDequantization(concat, dequantizationByFakeQuantize, dequantizationToConcatenate); - const size_t sourceOutputIdx = NetworkHelper::getParentOutputIndex(parent, layer); - // add folded dequantization operations to dequantization colection - dequantization.push_back(getFoldedDequantization(stridedSlice, dequantizationToPropagate[0], sourceOutputIdx)); + // add concatenated dequantization operations to dequantization collection + dequantization.push_back(getConcatenatedDequantization(concat, dequantizationToConcatenate)); } else { - fillQuantization(parent, dequantizationByFakeQuantize, dequantization); + const size_t sourceOutputIdx = NetworkHelper::getParentOutputIndex(parent, layer); + if (parent->get_input_shape(0)[1] != parent->get_output_shape(sourceOutputIdx)[1]) { + std::vector dequantizationToPropagate; + fillDequantization(parent, dequantizationByFakeQuantize, dequantizationToPropagate); + + // add folded dequantization operations to dequantization colection + dequantization.push_back(getFoldedDequantization(parent, dequantizationToPropagate[0], sourceOutputIdx)); + } else { + fillDequantization(parent, dequantizationByFakeQuantize, dequantization); 
+ } } } } } } -// broadcast of dequantization constants by channels -FakeQuantizeDequantization ConcatMultiChannelsTransformation::broadcastDequantiationConstant(const FakeQuantizeDequantization& deq) { - ngraph::Shape targetShape(deq.data.get_shape().size(), 1ul); - targetShape[1] = deq.data.get_shape()[1]; - - FakeQuantizeDequantization result; - result.data = deq.data; - result.convert = deq.convert; - - const auto targetShapeConst = std::make_shared( - element::i64, ngraph::Shape{ targetShape.size() }, - targetShape); - - if (deq.subtract) { - auto broadcast = ngraph::pass::low_precision::fold( - deq.subtractConstant, - targetShapeConst, - ngraph::op::AutoBroadcastType::NUMPY); - - result.subtract = deq.subtract; - result.subtractConstant = as_type_ptr(broadcast); - } - - if (deq.multiply) { - auto broadcast = ngraph::pass::low_precision::fold( - deq.multiplyConstant, - targetShapeConst, - ngraph::op::AutoBroadcastType::NUMPY); - - result.multiply = deq.multiply; - result.multiplyConstant = as_type_ptr(broadcast); - } - - return result; -} - FakeQuantizeDequantization ConcatMultiChannelsTransformation::getConcatenatedDequantization( const std::shared_ptr concat, const std::vector& dequantization) const { - bool allDequantizationShiftAreZero = true; - bool allDequantizationMultiplyAreZero = true; - for (const auto& deq : dequantization) { - if (deq.subtract != nullptr) { - allDequantizationShiftAreZero = false; - } - if (deq.multiply != nullptr) { - allDequantizationMultiplyAreZero = false; - } - } - NodeVector convertNodes; - NodeVector subNodes; - NodeVector mulNodes; - //preparing to concatenate dequantization nodes - for (const auto& deq : dequantization) { - ngraph::Shape targetShape(deq.data.get_shape().size(), 1ul); - targetShape[1] = deq.data.get_shape()[1]; - - if (deq.convert != nullptr) { - convertNodes.push_back(deq.convert); - } - if (!allDequantizationShiftAreZero) { - subNodes.push_back(deq.subtract == nullptr ? - std::make_shared(deqPrecision, targetShape, std::vector({ 0.f })) : - deq.subtractConstant); - } - if (!allDequantizationMultiplyAreZero) { - mulNodes.push_back(deq.multiply == nullptr ? - std::make_shared(deqPrecision, targetShape, std::vector({ 1.0f })) : - deq.multiplyConstant); - } - } + NodeVector subtractNodes; + NodeVector multiplyNodes; + + // forming nodes for concatenation + fillDequantizationNodes(dequantization, concat, convertNodes, subtractNodes, multiplyNodes); std::shared_ptr parent = concat; std::shared_ptr convert; @@ -322,20 +265,16 @@ FakeQuantizeDequantization ConcatMultiChannelsTransformation::getConcatenatedDeq std::shared_ptr subtract; std::shared_ptr subConst; - if (!subNodes.empty()) { - subConst = as_type_ptr( - subNodes.size() == 1ul ? subNodes[0] : fold(subNodes, 1ul)); - + if (!subtractNodes.empty()) { + subConst = as_type_ptr(concatenateDeqNodes(subtractNodes)); subtract = std::make_shared(parent, subConst); parent = subtract; } std::shared_ptr multiply; std::shared_ptr mulConst; - if (!mulNodes.empty()) { - mulConst = as_type_ptr( - mulNodes.size() == 1ul ? 
mulNodes[0] : fold(mulNodes, 1ul)); - + if (!multiplyNodes.empty()) { + mulConst = as_type_ptr<opset1::Constant>(concatenateDeqNodes(multiplyNodes)); + multiply = std::make_shared(parent, mulConst); } @@ -348,24 +287,19 @@ FakeQuantizeDequantization ConcatMultiChannelsTransformation::getFoldedDequantiz const size_t sourceOutputIdx) { OutputVector inputs = operation->input_values(); OutputVector outputs(operation->get_output_size()); + Output<Node> data = operation->output(sourceOutputIdx); std::shared_ptr<Node> parent = operation; std::shared_ptr<opset1::Convert> convert; if (dequantization.convert) { - convert = as_type_ptr(dequantization.convert->clone_with_new_inputs({ parent })); + convert = as_type_ptr<opset1::Convert>(dequantization.convert->clone_with_new_inputs({ data })); parent = convert; } std::shared_ptr<opset1::Subtract> subtract; std::shared_ptr<opset1::Constant> subConst; if (dequantization.subtract) { - inputs[0] = dequantization.subtractConstant; - const auto op = operation->clone_with_new_inputs(inputs); - - // constant folding of subtract constant - op->constant_fold(outputs, inputs); - - subConst = as_type_ptr(outputs[sourceOutputIdx].get_node_shared_ptr()); + subConst = NetworkHelper::foldDequantizationConstant(dequantization.subtractConstant, operation, sourceOutputIdx); subtract = std::make_shared(parent, subConst); parent = subtract; } @@ -373,17 +307,11 @@ FakeQuantizeDequantization ConcatMultiChannelsTransformation::getFoldedDequantiz std::shared_ptr<opset1::Multiply> multiply; std::shared_ptr<opset1::Constant> mulConst; if (dequantization.multiply) { - inputs[0] = dequantization.multiplyConstant; - const auto op = operation->clone_with_new_inputs(inputs); - - // constant folding of multiply constant - op->constant_fold(outputs, inputs); - - mulConst = as_type_ptr(outputs[sourceOutputIdx].get_node_shared_ptr()); + mulConst = NetworkHelper::foldDequantizationConstant(dequantization.multiplyConstant, operation, sourceOutputIdx); + multiply = std::make_shared(parent, mulConst); } - return FakeQuantizeDequantization(operation->output(sourceOutputIdx), convert, subtract, nullptr, subConst, multiply, mulConst); + return FakeQuantizeDequantization(data, convert, subtract, nullptr, subConst, multiply, mulConst); } } // namespace low_precision diff --git a/inference-engine/src/low_precision_transformations/src/network_helper.cpp b/inference-engine/src/low_precision_transformations/src/network_helper.cpp index b9721da78692a6..258c65f91d2b3b 100644 --- a/inference-engine/src/low_precision_transformations/src/network_helper.cpp +++ b/inference-engine/src/low_precision_transformations/src/network_helper.cpp @@ -87,6 +87,31 @@ bool NetworkHelper::isConstantPath(const std::shared_ptr<Node>& op) { return true; } +std::shared_ptr<opset1::Constant> NetworkHelper::foldDequantizationConstant( + const std::shared_ptr<opset1::Constant>& foldingConstant, + const std::shared_ptr<Node>& operation, + const size_t outIdx) { + OutputVector inputs = operation->input_values(); + OutputVector outputs(operation->get_output_size()); + + if (shape_size(foldingConstant->get_shape()) == 1ul) { + return toScalar(foldingConstant); + } else { + inputs[0] = foldingConstant; + const auto op = operation->clone_with_new_inputs(inputs); + + // constant folding of the constant substituted as the first input + op->constant_fold(outputs, inputs); + + const auto result = as_type_ptr<opset1::Constant>(outputs[outIdx].get_node_shared_ptr()); + if (result == nullptr) { + THROW_IE_LPT_EXCEPTION(*operation) << "result of constant folding is not constant"; + } + + return result; + } +} + size_t NetworkHelper::getOutputChannelsCount(std::shared_ptr layer, bool isOnWeights) { if (layer->outputs().size() == 0) { THROW_TRANSFORMATION_EXCEPTION << "Layer " << 
layer->get_friendly_name() << " doesn't have output tensors"; @@ -189,7 +214,7 @@ std::shared_ptr NetworkHelper::swapMultiplyAndAdd(std::shared_ptrget_input_node_shared_ptr(multiplyInputBranch); + const auto x = multiply->get_input_source_output(multiplyInputBranch); auto a = multiply->get_input_node_shared_ptr(multiplyInputBranch == 0 ? 1 : 0); auto b = addAfterMultiply->get_input_node_shared_ptr(multiplyBranch == 0 ? 1 : 0); std::shared_ptr bDivA; @@ -228,14 +253,13 @@ std::shared_ptr NetworkHelper::swapMultiplyAndAdd(std::shared_ptr(bDivA, a->get_output_element_type(0)); } - std::vector> inputs{ {}, {} }; - + OutputVector inputs{ {}, {} }; inputs[0] = x; inputs[1] = bDivA; std::shared_ptr newAdd = std::make_shared>( std::vector{element::f32, element::f32}, - std::vector{ x->get_output_element_type(0) }, + std::vector{ x.get_element_type() }, ngraph::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(), ngraph::op::TemporaryReplaceOutputType(inputs[1], element::f32).get()); copyInfo(addAfterMultiply, newAdd); diff --git a/inference-engine/src/low_precision_transformations/src/split.cpp b/inference-engine/src/low_precision_transformations/src/split.cpp index a14867fc8e4a63..5f0dbaebef0c77 100644 --- a/inference-engine/src/low_precision_transformations/src/split.cpp +++ b/inference-engine/src/low_precision_transformations/src/split.cpp @@ -5,6 +5,7 @@ #include "low_precision/split.hpp" #include "ngraph/node.hpp" #include "low_precision/network_helper.hpp" +#include "low_precision/common/dequantization_op.hpp" namespace ngraph { namespace pass { @@ -22,81 +23,68 @@ bool SplitTransformation::transform(TransformationContext& context, ngraph::patt return false; } - const std::shared_ptr split = NetworkHelper::separateInStandaloneBranch(m.get_match_root()); - auto dequantization = NetworkHelper::getDequantization(split); + const auto split = NetworkHelper::separateInStandaloneBranch(m.get_match_root()); + const auto dequantization = NetworkHelper::getDequantization(split); - OutputVector inputs(split->get_input_size()); - for (size_t i = 0; i < split->get_input_size(); ++i) { - inputs[i] = split->get_input_node_shared_ptr(i); - } - - const size_t dequantizationIndex = NetworkHelper::getChildInputIndex(dequantization.multiply, split); - inputs[dequantizationIndex] = dequantization.data; + OutputVector inputs = split->input_values(); + inputs[0] = dequantization.data; - std::shared_ptr newSplit = split->clone_with_new_inputs(inputs); + const auto newSplit = split->clone_with_new_inputs(inputs); newSplit->set_friendly_name(split->get_friendly_name()); + ngraph::copy_runtime_info(split, newSplit); - const ngraph::Shape subConstShape = dequantization.subtract ? - dequantization.subtract->get_input_node_shared_ptr(1)->get_shape() : Shape{}; - std::vector subValues = dequantization.subtract ? 
as_type_ptr( - dequantization.subtract->get_input_node_shared_ptr(1))->cast_vector() : std::vector(); - - const ngraph::Shape mulConstShape = dequantization.multiply->get_input_node_shared_ptr(1)->get_shape(); - std::vector mulValues = as_type_ptr( - dequantization.multiply->get_input_node_shared_ptr(1))->cast_vector(); + const int64_t axis = as_type_ptr(split->get_input_node_shared_ptr(1))->cast_vector()[0]; + const size_t normalizedAxis = normalize_axis(split->get_friendly_name(), axis, split->get_input_partial_shape(0).rank()); + const size_t outputSize = newSplit->get_output_size(); - int64_t SplitedAxis = as_type_ptr(split->get_input_node_shared_ptr(1))->cast_vector()[0]; - size_t axis = SplitedAxis > 0 ? SplitedAxis : split->get_input_shape(0).size() + SplitedAxis; - size_t outputSize = newSplit->get_output_size(); + const auto splitConstant = [&](const std::shared_ptr operation) { + // if batch is absent in constant shape - add batch + const auto normalizedConstant = NetworkHelper::normalizeDequantizationShape(operation); + const auto constantShape = normalizedConstant->get_shape(); - const auto subSplitLengths = getConstSplitLengths(inputs, subConstShape, outputSize); - const auto mulSplitLengths = getConstSplitLengths(inputs, mulConstShape, outputSize); - - std::vector> lastNodes(outputSize); - ngraph::OutputVector replacement; - for (size_t i = 0; i < outputSize; ++i) { - Output previous = newSplit->output(i); + OutputVector results(outputSize); + if ((shape_size(constantShape) == 1ul) || (constantShape[normalizedAxis] == 1ul)) { + std::for_each(results.begin(), results.end(), [&](Output& elem) { elem = normalizedConstant->clone_with_new_inputs({}); }); + } else { + // prepare new inputs for constant folding + OutputVector inputs = newSplit->input_values(); + inputs[0] = normalizedConstant; + const auto foldSplit = newSplit->clone_with_new_inputs(inputs); - if (dequantization.convert != nullptr) { - const std::shared_ptr convert = - dequantization.convert->clone_with_new_inputs({ newSplit->output(i) }); - previous = convert; + // fold and fill results + foldSplit->constant_fold(results, inputs); } - if (dequantization.subtract != nullptr) { - std::shared_ptr subConst; - if (!subSplitLengths.empty()) { - const auto newSubConstShape = getConstSplitShape(subSplitLengths, subConstShape, axis, i); - - std::vector newSubValues( - subValues.begin() + subSplitLengths[i], - subValues.begin() + subSplitLengths[i + 1]); - - subConst = as_type_ptr(std::make_shared( - dequantization.subtract->get_input_element_type(1), - newSubConstShape, - newSubValues)); - } else { - subConst = as_type_ptr(dequantization.subtract->get_input_node_shared_ptr(1)->clone_with_new_inputs({})); - } - const std::shared_ptr subtract = std::make_shared(previous, subConst); - previous = subtract; + for (auto& result : results) { + result = NetworkHelper::toScalarIfPossible(result.get_node_shared_ptr()); } - std::shared_ptr mulConst; - if (!mulSplitLengths.empty()) { - const auto newMulConstShape = getConstSplitShape(mulSplitLengths, mulConstShape, axis, i); + return results; + }; - std::vector newMulValues( - mulValues.begin() + mulSplitLengths[i], - mulValues.begin() + mulSplitLengths[i + 1]); + // get splited dequantization constants + OutputVector splitedSub = dequantization.subtract ? 
splitConstant(dequantization.subtract) : OutputVector{}; + OutputVector splitedMul = splitConstant(dequantization.multiply); - mulConst = as_type_ptr(std::make_shared( - dequantization.multiply->get_input_element_type(1), newMulConstShape, newMulValues)); - } else { - mulConst = as_type_ptr(dequantization.multiply->get_input_node_shared_ptr(1)->clone_with_new_inputs({})); + NodeVector lastNodes; + OutputVector replacement; + for (size_t i = 0; i < outputSize; ++i) { + Output parent = newSplit->output(i); + + if (dequantization.convert) { + const auto convert = dequantization.convert->clone_with_new_inputs({ newSplit->output(i) }); + copy_runtime_info({ newSplit, convert }, convert); + parent = convert; } - const std::shared_ptr multiply = std::make_shared(previous, mulConst); + + if (dequantization.subtract) { + const auto subtract = std::make_shared(parent, splitedSub[i]); + copy_runtime_info({ newSplit, subtract }, subtract); + parent = subtract; + } + + const auto multiply = std::make_shared(parent, splitedMul[i]); + copy_runtime_info({ newSplit, multiply }, multiply); lastNodes.push_back(multiply); replacement.push_back(multiply); @@ -107,33 +95,6 @@ bool SplitTransformation::transform(TransformationContext& context, ngraph::patt return true; } -std::vector SplitTransformation::getConstSplitLengths( - const OutputVector& inputs, - const ngraph::Shape& constShape, - const size_t outputSize) const { - int64_t axis = as_type_ptr(inputs[1].get_node_shared_ptr())->cast_vector()[0]; - size_t splitedAxis = axis > 0 ? axis : inputs[0].get_shape().size() + axis; - - if ((!constShape.empty()) && (constShape[splitedAxis] != 1)) { - std::vector result(outputSize + 1); - result[0] = 0; - for (size_t i = 1; i < result.size(); ++i) { - result[i] = result[i - 1] + constShape[splitedAxis] / outputSize; - } - return result; - } else { - return std::vector(); - } -} - -ngraph::Shape SplitTransformation::getConstSplitShape( - const std::vector& constSplitLengths, - const ngraph::Shape& constShape, const size_t axis, - const size_t idx) const { - Shape result(constShape); - result[axis] = constSplitLengths[idx + 1] - constSplitLengths[idx]; - return result; -} void SplitTransformation::updateOutputs( TransformationContext& context, diff --git a/inference-engine/src/low_precision_transformations/src/strided_slice.cpp b/inference-engine/src/low_precision_transformations/src/strided_slice.cpp index e7b7a796566900..a269e392302ce4 100644 --- a/inference-engine/src/low_precision_transformations/src/strided_slice.cpp +++ b/inference-engine/src/low_precision_transformations/src/strided_slice.cpp @@ -23,7 +23,7 @@ std::shared_ptr stridedSliceDeqConstant( //} const auto stridedSliceShape = strSlice->get_input_shape(0); - const auto constantShape = constant->get_shape(); + auto constantShape = constant->get_shape(); if (stridedSliceShape.size() != constantShape.size()) { ngraph::Shape newConstantShape; if (ngraph::shape_size(constantShape) == 1) { @@ -37,6 +37,7 @@ std::shared_ptr stridedSliceDeqConstant( newConstantShape.insert(newConstantShape.begin(), stridedSliceShape[0]); } } + constantShape = newConstantShape; const auto newConstant = fold( constant, @@ -45,13 +46,24 @@ std::shared_ptr stridedSliceDeqConstant( } const auto stridedSlice = as_type_ptr(strSlice); + + auto beginMask = stridedSlice->get_begin_mask(); + auto endMask = stridedSlice->get_end_mask(); + for (size_t i = 0; i < constantShape.size(); ++i) { + // don't slice constant if current dimension is 1 + if (constantShape[i] == 1ul) { + beginMask[i] = 
1ul; + endMask[i] = 1ul; + } + } + const auto result = fold( constant, stridedSlice->get_input_node_shared_ptr(1), stridedSlice->get_input_node_shared_ptr(2), stridedSlice->get_input_node_shared_ptr(3), - stridedSlice->get_begin_mask(), - stridedSlice->get_end_mask(), + beginMask, + endMask, stridedSlice->get_new_axis_mask(), stridedSlice->get_shrink_axis_mask(), stridedSlice->get_ellipsis_mask()); diff --git a/inference-engine/src/low_precision_transformations/src/subgraph.cpp b/inference-engine/src/low_precision_transformations/src/subgraph.cpp index c7f1caf56ddc36..7638fcb0714d48 100644 --- a/inference-engine/src/low_precision_transformations/src/subgraph.cpp +++ b/inference-engine/src/low_precision_transformations/src/subgraph.cpp @@ -22,16 +22,15 @@ namespace ngraph { namespace pass { namespace low_precision { -bool isQuantizationPerChannel(const std::shared_ptr& node) { - if (node->outputs().size() > 1ul) { - return false; - } - - //WA to support StridedSlice in ConcatTransformation - if (ngraph::is_type(node)) { +bool operationIsSupportedInConcat(const std::shared_ptr& node) { + // list of operations, which change channels, but supported in ConcatTransformation + if (ngraph::is_type(node) || + ngraph::is_type(node) || + ngraph::is_type(node)) { return true; } + // operations, which change channels, usually don't support in ConcatTransformation const auto inputs = node->input_values(); for (const auto& input : inputs) { if (ngraph::is_type(input.get_node())) { @@ -82,7 +81,7 @@ bool Subgraph::fillSubgraphForQuantization( if (fakeQuantizeChild != nullptr) { // } else { - if (layerTransformationsManager->isPrecisionPreserved(child) && isQuantizationPerChannel(child)) { + if (layerTransformationsManager->isPrecisionPreserved(child) && operationIsSupportedInConcat(child)) { if (!fillSubgraphForIntermediate(child, handledLayers)) { return false; } @@ -104,7 +103,7 @@ bool Subgraph::atLeastOneIsIntermediate(const std::shared_ptr& nod return true; } - if (!layerTransformationsManager->isPrecisionPreserved(child) || !isQuantizationPerChannel(child)) { + if (!layerTransformationsManager->isPrecisionPreserved(child) || !operationIsSupportedInConcat(child)) { // child branch is out of subgraph continue; } @@ -144,10 +143,6 @@ bool Subgraph::fill(const std::shared_ptr& layer, std::unordered_s return false; } } else { - // WA: issue #46906 - if (parent->get_output_size() != 1ul) { - return false; - } const FakeQuantizeDequantization dequantization = NetworkHelper::getDequantization(parent, 0, true); const std::shared_ptr fakeQuantizeParent = dequantization.empty() ? 
ngraph::as_type_ptr(parent) : @@ -161,7 +156,7 @@ bool Subgraph::fill(const std::shared_ptr& layer, std::unordered_s if (constant != nullptr) { // } else { - if (layerTransformationsManager->isPrecisionPreserved(parent) && isQuantizationPerChannel(parent)) { + if (layerTransformationsManager->isPrecisionPreserved(parent) && operationIsSupportedInConcat(parent)) { if (!fillSubgraphForIntermediate(parent, handledLayers)) { return false; } @@ -197,7 +192,7 @@ bool Subgraph::fill(const std::shared_ptr& layer, std::unordered_s const std::shared_ptr fakeQuantizeChild = ngraph::as_type_ptr(child); if (fakeQuantizeChild != nullptr) { // - } else if (layerTransformationsManager->isPrecisionPreserved(child) && isQuantizationPerChannel(child)) { + } else if (layerTransformationsManager->isPrecisionPreserved(child) && operationIsSupportedInConcat(child)) { if (!fillSubgraphForIntermediate(child, handledLayers)) { return false; } @@ -221,6 +216,13 @@ bool Subgraph::empty() const { } bool Subgraph::fillSubgraphForConcat(const std::shared_ptr& concat, std::unordered_set& handledLayers) { + const auto axis = concat->get_axis(); + const size_t normalizedAxis = ngraph::normalize_axis(concat->get_friendly_name(), axis, concat->get_output_partial_shape(0).rank()); + // supported only per-channel concat + if (normalizedAxis != 1ul) { + return false; + } + concatLayers.push_back(concat); handledLayers.insert(concat->get_friendly_name()); layers.emplace(concat->get_friendly_name(), concat); diff --git a/inference-engine/src/low_precision_transformations/src/transformer.cpp b/inference-engine/src/low_precision_transformations/src/transformer.cpp index 205cd77e930376..d66263bdf07a31 100644 --- a/inference-engine/src/low_precision_transformations/src/transformer.cpp +++ b/inference-engine/src/low_precision_transformations/src/transformer.cpp @@ -229,9 +229,11 @@ LowPrecisionTransformations LowPrecisionTransformer::getAllTransformations(const add(params). add(params). add(params). + add(params). add(params). add(params). add(params). + add(params). addCleanup(params). addCleanup(params). diff --git a/inference-engine/src/low_precision_transformations/src/variadic_split.cpp b/inference-engine/src/low_precision_transformations/src/variadic_split.cpp index ccc8e72634d0bd..685219f27730d0 100644 --- a/inference-engine/src/low_precision_transformations/src/variadic_split.cpp +++ b/inference-engine/src/low_precision_transformations/src/variadic_split.cpp @@ -20,26 +20,6 @@ void VariadicSplitTransformation::registerMatcherIn(GraphRewrite& pass, Transfor make_op_label() })); } -std::vector VariadicSplitTransformation::getConstSplitLengths( - const OutputVector& inputs, - const ngraph::Shape& constShape, - const size_t outputSize) const { - std::vector lengths = as_type_ptr(inputs[2].get_node_shared_ptr())->cast_vector(); - - int64_t axis = as_type_ptr(inputs[1].get_node_shared_ptr())->cast_vector()[0]; - size_t splitedAxis = axis > 0 ? 
axis : inputs[0].get_shape().size() + axis; - - if ((!constShape.empty()) && (constShape[splitedAxis] != 1)) { - std::vector result(outputSize + 1); - result[0] = 0; - for (size_t i = 1; i < result.size(); ++i) { - result[i] = result[i - 1] + lengths[i - 1]; - } - return result; - } else { - return std::vector(); - } -} } // namespace low_precision } // namespace pass } // namespace ngraph diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp index 181d0525f2db25..0fcac236356f90 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp @@ -421,7 +421,6 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::CNNNetwork &ne type != Split && type != Concatenation && type != Eltwise && - type != Crop && type != BatchNormalization && type != Copy) { check_result = false; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp index dde977745e9570..73c353960a1cd4 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp @@ -846,7 +846,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { bool isSupportedParams = layer->_group == 1 && is1x1Convolution(layer) && // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions everyone_is(1, layer->_stride[X_AXIS], layer->_stride[Y_AXIS]) && - one_of(layer->outData[0].get()->getPrecision(), Precision::FP32, Precision::U8) && + everyone_is(Precision::FP32, layer->insData[0].lock()->getPrecision(), layer->outData[0].get()->getPrecision()) && node->getChildEdgeAt(0)->getDims().ndims() == 4; if (!isSupportedParams) return false; @@ -862,10 +862,11 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { if (parentLayer == nullptr) IE_THROW() << "Cannot get convolution layer " << parentNode->getName(); - if (parentLayer->outData[0].get()->getPrecision() != childLayer->outData[0].get()->getPrecision()) + if (!everyone_is(Precision::FP32, parentLayer->outData[0].get()->getPrecision(), childLayer->insData[0].lock()->getPrecision(), + childLayer->outData[0].get()->getPrecision())) return false; - if (parentLayer->precision != childLayer->precision) + if (!everyone_is(Precision::FP32, parentLayer->precision, childLayer->precision)) return false; auto parentOutputPrecision = !parentNode->fusedWith.empty() @@ -876,7 +877,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { ? 
childNode->fusedWith[childNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision() : childNode->getCnnLayer()->outData[0].get()->getPrecision(); - if (parentOutputPrecision != childOutputPrecision) + if (!everyone_is(Precision::FP32, parentOutputPrecision, childOutputPrecision)) return false; auto* childConvolutionNode = dynamic_cast(childNode.get()); @@ -886,6 +887,9 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { if (!childConvolutionNode->inputZeroPoints.empty() || !childConvolutionNode->weightsZeroPoints.empty()) return false; + bool withBias = (childLayer->_biases != nullptr && childLayer->_biases->size() != 0) || + childConvolutionNode->getBaseIntputsNumber() == 3; + auto allPads = getPaddings(*childLayer); bool isSupportedParams = childLayer->_out_depth == childLayer->_group && @@ -895,13 +899,36 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { everyone_is(1, allPads.end[X_AXIS], allPads.end[Y_AXIS]) && everyone_is(1, childLayer->_dilation[X_AXIS], childLayer->_dilation[Y_AXIS]) && childLayer->_stride[X_AXIS] == childLayer->_stride[Y_AXIS] && - false && // TODO [oneDNN]: disabled while not ported - one_of(childLayer->_stride[X_AXIS], 1 /*, 2*/) && // TODO [oneDNN]: stride 2 should also be supported + withBias && + one_of(childLayer->_stride[X_AXIS], 1, 2) && childNode->getChildEdgeAt(0)->getDims().ndims() == 4; return isSupportedParams; }; + auto isFusingWorthwhile = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) { + auto layer = std::dynamic_pointer_cast(childNode->getCnnLayer()); + if (layer == nullptr) + IE_THROW() << "Cannot get convolution layer " << childNode->getName(); + + auto inDims = childNode->inDims[0]; + auto outDims = childNode->outDims[0]; + int elemSize = layer->precision.size(); + + int L3_cache_size = utils::get_cache_size(3, false); + int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize; + int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize; + + auto parentConvolutionNode = std::dynamic_pointer_cast(parentNode); + if (parentConvolutionNode == nullptr) + IE_THROW() << "Cannot get convolution node " << parentNode->getName(); + + if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common)) + return false; + + return (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2); + }; + for (int i = 0; i < graphNodes.size(); i++) { if (!isConvolutionNode(graphNodes[i])) continue; @@ -911,6 +938,8 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) { auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild(); if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue; + if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue; + parentConvNode->fuseWith(childConvNode); for (auto node : childConvNode->getFusedWith()) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index a888d38a2ebc45..ebef14038577d8 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #include #include #include "mkldnn_extension_utils.h" @@ -93,7 +93,6 @@ static const InferenceEngine::details::caseless_unordered_map { "Eltwise", Eltwise }, { "Mod", Eltwise }, { 
"Power", Eltwise }, - { "Crop", Crop }, { "Reshape", Reshape }, { "Tile", Tile }, { "SimplerNMS", SimplerNMS }, @@ -103,6 +102,7 @@ static const InferenceEngine::details::caseless_unordered_map { "Flatten", Flatten }, { "Pad", Pad }, { "Permute", Permute }, + { "StridedSlice", StridedSlice }, { "Copy", Copy }, { "LSTMCell", RNNCell }, { "GRUCell", RNNCell }, diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index 83c45610cde678..4ed5daee429f20 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -48,7 +48,6 @@ enum Type { Concatenation, Eltwise, Gemm, - Crop, Reshape, Tile, SimplerNMS, @@ -58,6 +57,7 @@ enum Type { Flatten, Pad, Permute, + StridedSlice, Copy, MemoryOutput, MemoryInput, @@ -122,8 +122,6 @@ static std::string NameFromType(Type type) { return "Concatenation"; case Depthwise: return "Depthwise"; - case Crop: - return "Crop"; case Reshape: return "Reshape"; case Tile: @@ -142,6 +140,8 @@ static std::string NameFromType(Type type) { return "Pad"; case Permute: return "Permute"; + case StridedSlice: + return "StridedSlice"; case Copy: return "Copy"; case MemoryOutput: diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index c4b70fd3b596c6..5dd4fa84f1dc48 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include "transformations/common_optimizations/convert_quantize_dequantize.hpp" #include +#include #include #include #include @@ -260,6 +262,11 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) { return MKLDNNMVNNode::checkAxesSuitability(node); }); + pass_config->set_callback( + [](const_node_ptr &node) -> bool { + return node->input_value(0).get_partial_shape().rank().get_length() > 5; + }); + // List of enabled/disabled transformations pass_config->disable(); pass_config->disable(); @@ -327,6 +334,7 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) { legacyManager.register_pass(); auto legacyPassConfig = legacyManager.get_pass_config(); + legacyPassConfig->disable(); legacyPassConfig->set_callback([](const_node_ptr &node) -> bool { return !MKLDNNQuantizeNode::isNeedToDecompose(node); diff --git a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp index a40e5b57ab8710..12604eef081122 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp @@ -41,7 +41,6 @@ MKLDNN_EXTENSION_NODE(ReverseSequenceImpl, ReverseSequence); MKLDNN_EXTENSION_NODE(DetectionOutputImpl, DetectionOutput); MKLDNN_EXTENSION_NODE(ArgMaxImpl, ArgMax); MKLDNN_EXTENSION_NODE(UnsqueezeImpl, Unsqueeze); -MKLDNN_EXTENSION_NODE(StridedSliceImpl, StridedSlice); MKLDNN_EXTENSION_NODE(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); MKLDNN_EXTENSION_NODE(RegionYoloImpl, RegionYolo); MKLDNN_EXTENSION_NODE(LogSoftmaxImpl, LogSoftmax); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp index 54218d9aca5f65..d6106b6816587c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp @@ -420,31 +420,31 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe PostOpsIntBlobMemory[blob_idx + 1]->FillZero(); PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::format_tag::x, biases->buffer(), dwBiasesDims.size() * MKLDNNExtensionUtils::sizeOfDataType(biasPrc)); - // rewrite onto append_dw_k3s2p1 -// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], -// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], -// mkldnn::memory::convert_to_c(dw_conv_in_dt), -// (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(), -// (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData()); + // todo: rewrite onto append_dw_k3s2p1 + ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], + dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], + mkldnn::memory::convert_to_c(dw_conv_in_dt), + static_cast(PostOpsIntBlobMemory[blob_idx]->GetData()), + static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())); blob_idx += 2; } else { - // rewrite onto append_dw_k3s2p1 -// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], -// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], -// mkldnn::memory::convert_to_c(dw_conv_in_dt), -// static_cast(getParentEdgeAt( -// baseInputsNumber + 0)->getMemory().GetData()), -// static_cast(getParentEdgeAt( -// baseInputsNumber + 1)->getMemory().GetData())); + // todo: rewrite onto append_dw_k3s2p1 + ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], + dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], + mkldnn::memory::convert_to_c(dw_conv_in_dt), + static_cast(getParentEdgeAt( + baseInputsNumber + 0)->getMemory().GetData()), + static_cast(getParentEdgeAt( + baseInputsNumber + 1)->getMemory().GetData())); } } else { - // rewrite onto append_dw_k3s2p1 -// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], -// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], -// mkldnn::memory::convert_to_c(dw_conv_in_dt), -// nullptr, -// nullptr); + // todo: rewrite onto append_dw_k3s2p1 + ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS], + dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS], + mkldnn::memory::convert_to_c(dw_conv_in_dt), + nullptr, + nullptr); } if (convolutionNode->wScale != nullptr) { @@ -482,14 +482,11 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe oShiftDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32)); ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift, - (const float *)PostOpsIntBlobMemory[blob_idx]->GetData(), - (const float *)PostOpsIntBlobMemory[blob_idx + 1]->GetData()); + static_cast(PostOpsIntBlobMemory[blob_idx]->GetData()), + static_cast(PostOpsIntBlobMemory[blob_idx + 1]->GetData())); blob_idx += 2; } - - IE_THROW() << "append_dw_conv is not ported"; - continue; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp deleted file mode 100644 index 166461b4b0115d..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "mkldnn_crop_node.h" -#include -#include -#include -#include -#include -#include 
"ie_parallel.hpp" -#include "common/cpu_memcpy.h" -#include "utils/general_utils.h" - -using namespace mkldnn; -using namespace MKLDNNPlugin; -using namespace InferenceEngine; - -MKLDNNCropNode::MKLDNNCropNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : - MKLDNNNode(layer, eng, cache) {} - -void MKLDNNCropNode::getSupportedDescriptors() { - CropLayer* cropLayer = dynamic_cast(getCnnLayer().get()); - - if (cropLayer == nullptr) - IE_THROW() << "Cannot convert crop layer."; - - channelAxis = 1; - if (getParentEdges().size() != 1 && getParentEdges().size() != 2) { - IE_THROW() << "Incorrect number of input edges for layer " << getName(); - } - - MKLDNNDims childDims = getChildEdgeAt(0)->getDims(); - - offsets.resize(static_cast(childDims.ndims())); // plus one dim for batch - dims.resize(static_cast(childDims.ndims())); // plus one dim for batch - for (int i = 0; i < childDims.ndims(); i++) - dims[i] = childDims[i]; - - for (int i = 0; i < cropLayer->axis.size(); i++) { - offsets[cropLayer->axis[i]] = cropLayer->offset[i]; - } - - if (cropLayer->axis.size() == dims.size()) { - for (size_t i = 0; i < cropLayer->axis.size(); i++) { - if (cropLayer->axis[i] == 1) { - channelAxis = static_cast(i); - break; - } - } - } - - if (!getChildEdges().size()) - IE_THROW() << "Incorrect number of output edges for layer " << getName(); -} - -void MKLDNNCropNode::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) - return; - - InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision(); - auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - precision = getCnnLayer()->outData[0]->getPrecision(); - auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision); - if (inputDataType != outputDataType) { - outputDataType = inputDataType; // Crop doesn't convert precisions, only moves data - } - - auto& inDims = getParentEdgeAt(0)->getDims(); - if (inDims.ndims() != 2 && inDims.ndims() != 4 && inDims.ndims() != 5) { - IE_THROW() << "Crop supports only 2d, 4d and 5d blobs."; - } - - memory::format_tag fmt = memory::format_tag::undef; - switch (inDims.ndims()) { - case 2: fmt = memory::format_tag::nc; break; - case 4: fmt = memory::format_tag::nchw; break; - case 5: fmt = memory::format_tag::ncdhw; break; - } - - InferenceEngine::LayerConfig config; - config.dynBatchSupport = true; - config.inConfs.resize(getParentEdges().size()); - config.outConfs.resize(1); - for (size_t i = 0; i < getParentEdges().size(); i++) { - config.inConfs[i].inPlace = -1; - config.inConfs[i].constant = i != 0; - config.inConfs[i].desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, fmt); - } - config.outConfs[0].inPlace = -1; - config.outConfs[0].constant = false; - config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt); - - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt); - - if ((inDims.ndims() == 4 || inDims.ndims() == 5) && channelAxis >= 0 && dims[channelAxis] % 8 == 0) { - fmt = inDims.ndims() == 5 ? 
memory::format_tag::nCdhw8c : memory::format_tag::nChw8c; - config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmt); - config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt); - if (dims[channelAxis] % 16 == 0) { - fmt = inDims.ndims() == 5 ? memory::format_tag::nCdhw16c : memory::format_tag::nChw16c; - config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmt); - config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt); - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt); - } - } -} - -void MKLDNNCropNode::createPrimitive() { - auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); - if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) - IE_THROW() << "Destination memory didn't allocate."; - if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) - IE_THROW() << "Input memory didn't allocate."; - if (getSelectedPrimitiveDescriptor() == nullptr) - IE_THROW() << "Preferable primitive descriptor is not set."; -} - -void MKLDNNCropNode::execute(mkldnn::stream strm) { - auto& parentMem = getParentEdgeAt(0)->getMemory(); - - int m_block_size = 1; - if (!parentMem.GetDesc().isPlainFormat()) { - const auto &desc = parentMem.GetDescriptor().data; - const auto &blk = desc.format_desc.blocking; - IE_ASSERT(desc.format_kind == dnnl_blocked && - blk.inner_nblks == 1 && - blk.inner_idxs[0] == 1); - m_block_size = blk.inner_blks[0]; - } - const int m_inner_dim = dims[dims.size() - 1] * m_block_size; - - const auto &dst_mem = getChildEdgeAt(0)->getMemory(); - - const int dst_ndims = dst_mem.GetDesc().getDims().ndims(); - - // TODO: Rewrite it in general case. For every tensor - // and rank, without using letter N,C,D,H,W - const int OFFSET_N = (dst_ndims > 0) ? offsets[0] : 0; - const int OFFSET_C = (dst_ndims > 1) ? offsets[1] : 0; - const int OFFSET_D = (dst_ndims > 4) ? offsets[offsets.size() - 3] : 0; - const int OFFSET_H = (dst_ndims > 2) ? offsets[offsets.size() - 2] : 0; - const int OFFSET_W = (dst_ndims > 3) ? offsets[offsets.size() - 1] : 0; - - // TODO: Check applicability of dyn_batch_lim in early steps. - // crop of batch dimension doesn't support dyn batch. - const int ON = (dst_ndims > 0) ? std::min(batchToProcess(), getChildEdgeAt(0)->getDims()[0]) : 1; - const int OC = (dst_ndims > 1) ? dims[1] : 1; - const int OD = (dst_ndims > 4) ? dims[dims.size() - 3] : 1; - const int OH = (dst_ndims > 2) ? dims[dims.size() - 2] : 1; - const int OW = (dst_ndims > 3) ? dims[dims.size() - 1] : 1; - - memory::dims src_dims = parentMem.GetDims(); - int src_ndims = static_cast(src_dims.size()); - - const int IC = (src_ndims > 1) ? rnd_up(src_dims[1], m_block_size) : 1; - const int ID = (src_ndims > 4) ? src_dims[src_dims.size() - 3] : 1; - const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1; - const int IW = (src_ndims > 3) ? 
src_dims[src_dims.size() - 1] : 1; - - const size_t itemSize = parentMem.GetDesc().GetElementSize(); - - const auto *src_data = reinterpret_cast(parentMem.GetPtr()); - auto *dst_data = reinterpret_cast(getChildEdgeAt(0)->getMemory().GetPtr()); - - if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) { - parallel_for(ON, [&](int n) { - cpu_memcpy(dst_data + itemSize * n * OC, src_data + itemSize *((n+OFFSET_N)*IC + OFFSET_C), OC * itemSize); - }); - } else { - parallel_for2d(ON, (OC / m_block_size), [&](int n, int c) { - for (int d = 0; d < OD; ++d) { - int dst_ind = (n*OC + c*m_block_size)*OD*OH*OW + d*m_block_size*OH*OW; - - int src_ind = ((n+OFFSET_N)*IC + (c*m_block_size+OFFSET_C))*ID*IH*IW + - ((d+OFFSET_D)*IH*IW + OFFSET_H*IW + OFFSET_W)*m_block_size; - - for (int h = 0; h < OH; ++h) { - cpu_memcpy(dst_data + itemSize * dst_ind, src_data + itemSize * src_ind, m_inner_dim * itemSize); - - src_ind += IW * m_block_size; - dst_ind += OW * m_block_size; - } - } - }); - } -} - -bool MKLDNNCropNode::created() const { - return getType() == Crop; -} -REG_MKLDNN_PRIM_FOR(MKLDNNCropNode, Crop); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h deleted file mode 100644 index 3c02b98180d40a..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include -#include -#include - -namespace MKLDNNPlugin { - -class MKLDNNCropNode : public MKLDNNNode { -public: - MKLDNNCropNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); - ~MKLDNNCropNode() override = default; - - void getSupportedDescriptors() override; - void initSupportedPrimitiveDescriptors() override; - void createPrimitive() override; - void execute(mkldnn::stream strm) override; - bool created() const override; - bool canBeInPlace() const override { - return false; - } - -private: - int channelAxis = 1; - std::vector offsets; - std::vector dims; -}; - -} // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp index b5f1e51bd2c4ad..3f7b02b9a4c3d3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp @@ -781,7 +781,13 @@ MKLDNNEltwiseNode::initializers = { alpha = 0.0f; beta = 0.0f; opType = Gelu; - algorithm = mkldnn::algorithm::eltwise_gelu; + std::string approximationMode = activationLayer->GetParamAsString("approximation_mode", "erf"); + if (approximationMode == "erf") + algorithm = mkldnn::algorithm::eltwise_gelu_erf; + else if (approximationMode == "tanh") + algorithm = mkldnn::algorithm::eltwise_gelu_tanh; + else + IE_THROW() << "Gelu layer with name " << activationLayer->name << " doesn't support approximation mode " << approximationMode; }}, {"elu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) { alpha = activationLayer->GetParamAsFloat("alpha", 1.0f); @@ -1743,7 +1749,8 @@ void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) { case mkldnn::algorithm::eltwise_soft_relu: case mkldnn::algorithm::eltwise_logistic: case mkldnn::algorithm::eltwise_exp: - case mkldnn::algorithm::eltwise_gelu: + case 
mkldnn::algorithm::eltwise_gelu_erf: + case mkldnn::algorithm::eltwise_gelu_tanh: case mkldnn::algorithm::eltwise_clip: case mkldnn::algorithm::eltwise_swish: case mkldnn::algorithm::eltwise_hswish: diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pad_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pad_node.cpp index 3c9d9e141e55ae..27554f51237996 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pad_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_pad_node.cpp @@ -239,10 +239,12 @@ static inline size_t parallel_init(size_t start, size_t nDims, const SizeVector& } static inline void parallel_step(size_t nDims, const SizeVector& dims, SizeVector& indexes) { - for (int j = nDims - 1; j >= 0; j--) { - indexes[j] = (indexes[j] + 1) % dims[j]; - if (indexes[j] != 0) - return; + for (int j = nDims - 1; j >= 0; --j) { + ++indexes[j]; + if (indexes[j] < dims[j]) + break; + else + indexes[j] = 0; } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp new file mode 100644 index 00000000000000..37cfc950b9cb8d --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp @@ -0,0 +1,609 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_strided_slice_node.h" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "caseless.hpp" +#include "common/dnnl_thread.hpp" +#include "common/cpu_memcpy.h" +#include "common/tensor_desc_creator.h" +#include "utils/general_utils.h" + +#include +#include +#include +#include "caseless.hpp" + + +#define THROW_ERROR IE_THROW() << "StridedSlice layer with name '" << getName() << "' " + +using namespace mkldnn; +using namespace MKLDNNPlugin; +using namespace InferenceEngine; +using namespace InferenceEngine::details; + +static inline size_t parallel_init(size_t start, size_t nDims, const SizeVector& dims, SizeVector& indexes) { + for (int j = nDims - 1; j >= 0; j--) { + indexes[j] = start % dims[j]; + start = start / dims[j]; + } + return start; +} + +MKLDNNStridedSliceNode::MKLDNNStridedSliceNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : + MKLDNNNode(layer, eng, cache) {} + +void MKLDNNStridedSliceNode::getSupportedDescriptors() { + auto stridedSliceLayer = getCnnLayer(); + + if (stridedSliceLayer == nullptr) + THROW_ERROR << "cannot convert from CNN layer"; + + auto inData = stridedSliceLayer->insData[DATA_ID].lock(); + auto beginData = stridedSliceLayer->insData[BEGIN_ID].lock(); + auto endData = stridedSliceLayer->insData[END_ID].lock(); + if (!inData || !beginData || !endData) + THROW_ERROR << "has nullable input data"; + + params.parametersAreConstant = CaselessEq()(getParentEdgesAtPort(BEGIN_ID)[0]->getParent()->getCnnLayer()->type, "const") && + CaselessEq()(getParentEdgesAtPort(END_ID)[0]->getParent()->getCnnLayer()->type, "const"); + + const SizeVector srcDims = inData->getTensorDesc().getDims(); + const SizeVector dstDims = stridedSliceLayer->outData[0]->getTensorDesc().getDims(); + const size_t nSrcDims = srcDims.size(); + + if (getParentEdges().size() != 3 && getParentEdges().size() != 4) + THROW_ERROR << "has incorrect number of input edges"; + if (!getChildEdges().size()) + THROW_ERROR << "has incorrect number of output edges"; + + beginDims = beginData->getTensorDesc().getDims(); + if (beginDims.size() != 1) + THROW_ERROR << " should have 
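Note: the eltwise change above dispatches Gelu to either eltwise_gelu_erf or eltwise_gelu_tanh depending on the layer's approximation_mode attribute. For reference, the two standard formulas are sketched below; this is the mathematical definition, not the oneDNN kernels themselves.

#include <cmath>

// Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
float gelu_erf(float x) {
    return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
}

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
float gelu_tanh(float x) {
    const float k = 0.7978845608028654f;  // sqrt(2 / pi)
    return 0.5f * x * (1.0f + std::tanh(k * (x + 0.044715f * x * x * x)));
}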
begin vector with 1 dimension"; + + endDims = endData->getTensorDesc().getDims(); + if (endDims.size() != 1) + THROW_ERROR << "should have end vector with 1 dimension"; + if (beginDims[0] != endDims[0]) + THROW_ERROR << "should have begin vector with size equal to end vector size"; + + if (stridedSliceLayer->insData.size() > STRIDE_ID) { + auto strideData = stridedSliceLayer->insData[STRIDE_ID].lock(); + if (!strideData) + THROW_ERROR << "has nullable input data"; + if (!CaselessEq()(getParentEdgesAtPort(STRIDE_ID)[0]->getParent()->getCnnLayer()->type, "const")) + params.parametersAreConstant = false; + + strideDims = strideData->getTensorDesc().getDims(); + if (strideDims.size() > 1) + THROW_ERROR << "should have stride vector with 1 dimension"; + if (beginDims[0] != strideDims[0]) + THROW_ERROR << "should have stride vector with size equal to begin vector size"; + } + + auto createMask = [&](const char* maskName, std::vector& mask, const int bit = 0) { + mask = stridedSliceLayer->GetParamAsInts(maskName); + if (strcmp(maskName, "ellipsis_mask") != 0 || mask.size() == 0) { + for (size_t i = mask.size(); i < dstDims.size(); ++i) mask.push_back(bit); + } + }; + + createMask("begin_mask", beginMask, 1); + createMask("end_mask", endMask, 1); + createMask("new_axis_mask", newAxisMask); + createMask("shrink_axis_mask", shrinkAxisMask); + createMask("ellipsis_mask", ellipsisMask); + + int ellipsisMaskCounter = 0; + params.ellipsisPos1 = -1; + for (size_t i = 0; i < ellipsisMask.size(); i++) { + ellipsisMaskCounter += ellipsisMask[i]; + params.ellipsisPos1 = ellipsisMask[i] == 1 && params.ellipsisPos1 == -1 ? i : params.ellipsisPos1; + } + if (ellipsisMaskCounter > 1) + THROW_ERROR << "has incorrect 'Ellipsis_mask'. Only one non-zero bit is allowed"; + + int newAxis = std::accumulate(newAxisMask.begin(), newAxisMask.end(), 0); + int shrinkAxis = std::accumulate(shrinkAxisMask.begin(), shrinkAxisMask.end(), 0); + params.equalDims = newAxis == 0 && shrinkAxis == 0; + + if (params.parametersAreConstant) { + auto fillingInParameters = [&](std::vector ¶meter, const size_t type, const size_t size, const int value) { + auto parentLayer = getParentEdgesAtPort(type)[0]->getParent()->getCnnLayer(); + auto blob = parentLayer->blobs["custom"]; + if (blob->getTensorDesc().getPrecision() != Precision::I32) + THROW_ERROR << "supports only parameters input with precision I32"; + const int *ptr = blob->cbuffer().as() + blob->getTensorDesc().getBlockingDesc().getOffsetPadding(); + parameter.assign(ptr, ptr + size); + + if (ellipsisMaskCounter == 0 && size < dstDims.size()) { + for (size_t i = size; i < dstDims.size(); i++) parameter.push_back(value); + } + }; + + if (beginDims.size()) + fillingInParameters(begin, BEGIN_ID, beginDims[0], 0); + if (endDims.size()) + fillingInParameters(end, END_ID, endDims[0], 0); + if (strideDims.size()) + fillingInParameters(stride, STRIDE_ID, strideDims[0], 1); + + if (nSrcDims > 3 && params.equalDims && ellipsisMaskCounter == 1) + addHiddenDims(nSrcDims); + } +} + +void MKLDNNStridedSliceNode::addHiddenDims(const size_t nSrcDims) { + // all masks and input parameters are for planar layouts. 
So if we use blocked or per channel layout and + // there is ellipsis should to add default values in hidden dimensions to know real order of mask or parameter values + size_t afterDims = ellipsisMask.size() - params.ellipsisPos1 - 1; + size_t ellipsisPos2 = nSrcDims - afterDims - 1; + + auto addHiddenDims = [&](std::vector& data, const int bit = 0) { + std::vector temp; + for (size_t i = 0; i < params.ellipsisPos1; i++) + temp.push_back(data[i]); + for (size_t i = params.ellipsisPos1; i < ellipsisPos2 + 1; i++) + temp.push_back(bit); + for (size_t i = 1; i < nSrcDims - ellipsisPos2; i++) + temp.push_back(data[i + params.ellipsisPos1]); + data = temp; + }; + + addHiddenDims(begin); + addHiddenDims(end); + addHiddenDims(stride, 1); + addHiddenDims(beginMask); + addHiddenDims(endMask); + addHiddenDims(ellipsisMask); + addHiddenDims(newAxisMask); + addHiddenDims(shrinkAxisMask); +} + +void MKLDNNStridedSliceNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + const bool hasStrides = getParentEdges().size() > 3; + InferenceEngine::Precision dataPrecision = getCnnLayer()->insData[DATA_ID].lock()->getPrecision(); + InferenceEngine::Precision beginPrecision = getCnnLayer()->insData[BEGIN_ID].lock()->getPrecision(); + auto beginDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(beginPrecision); + InferenceEngine::Precision endPrecision = getCnnLayer()->insData[END_ID].lock()->getPrecision(); + auto endDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(endPrecision); + InferenceEngine::Precision stridePrecision; + if (hasStrides) + stridePrecision = getCnnLayer()->insData[STRIDE_ID].lock()->getPrecision(); + + auto srcDims = getParentEdgeAt(DATA_ID)->getDims(); + auto dstDims = getChildEdgeAt(0)->getDims(); + size_t nDims = srcDims.ndims(); + + InferenceEngine::LayerConfig config; + config.dynBatchSupport = false; + config.inConfs.resize(getParentEdges().size()); + config.inConfs[DATA_ID].inPlace = -1; + config.inConfs[BEGIN_ID].inPlace = -1; + config.inConfs[END_ID].inPlace = -1; + config.inConfs[DATA_ID].constant = false; + config.inConfs[BEGIN_ID].constant = true; + config.inConfs[END_ID].constant = true; + if (hasStrides) { + config.inConfs[STRIDE_ID].inPlace = -1; + config.inConfs[STRIDE_ID].constant = true; + } + config.outConfs.resize(1); + + std::vector supportedTypes; + if (nDims > 2 && params.equalDims) { + auto canUseBlocked = [=](const size_t blockSize) { + return srcDims[1] % blockSize == 0 && abs(stride[1]) == 1 && (begin[1] > srcDims[1] || begin[1] % blockSize == 0); + }; + + supportedTypes.push_back(TensorDescCreatorTypes::nspc); + if (canUseBlocked(8lu)) + supportedTypes.push_back(TensorDescCreatorTypes::nCsp8c); + if (canUseBlocked(16lu)) + supportedTypes.push_back(TensorDescCreatorTypes::nCsp16c); + } + supportedTypes.push_back(TensorDescCreatorTypes::ncsp); + auto creators = TensorDescCreator::getCommonCreators(); + auto range = TensorDescCreator::makeFilteredRange(creators, nDims, supportedTypes); + + for (auto itr = range.first; itr != range.second; ++itr) { + config.inConfs[0].desc = itr->second->createDesc(dataPrecision, getParentEdgeAt(DATA_ID)->getDims().ToSizeVector()); + config.inConfs[BEGIN_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(BEGIN_ID)->getDims(), beginDataType, mkldnn::memory::format_tag::x); + config.inConfs[END_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(END_ID)->getDims(), endDataType, mkldnn::memory::format_tag::x); + if (hasStrides) + config.inConfs[STRIDE_ID].desc = 
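Note: addHiddenDims() above pads every per-axis parameter vector so that a single ellipsis stands in for the source axes it hides, which keeps later per-axis processing aligned with the real input rank. A simplified, self-contained sketch with one worked case (the rank-5 shape and the parameter values are illustrative, not taken from a real model): begin = [1, <ellipsis>, 2] against a rank-5 input expands to [1, 0, 0, 0, 2].

#include <cstdio>
#include <vector>

// Simplified version of addHiddenDims(): expand a per-axis parameter vector so the single
// ellipsis covers the "hidden" source axes it stands for. 'fill' is the default value for
// those axes (0 for begin/end/masks, 1 for stride).
std::vector<int> expandEllipsis(const std::vector<int>& v, size_t ellipsisPos, size_t srcRank, int fill) {
    const size_t after = v.size() - ellipsisPos - 1;   // axes listed after the ellipsis
    const size_t ellipsisEnd = srcRank - after - 1;    // last source axis hidden by it
    std::vector<int> out(v.begin(), v.begin() + ellipsisPos);
    for (size_t i = ellipsisPos; i <= ellipsisEnd; ++i) out.push_back(fill);
    for (size_t i = 1; i < srcRank - ellipsisEnd; ++i) out.push_back(v[i + ellipsisPos]);
    return out;
}

int main() {
    // begin = [1, <ellipsis>, 2] on a rank-5 input -> prints "1 0 0 0 2"
    for (int b : expandEllipsis({1, 0, 2}, 1, 5, 0)) printf("%d ", b);
    printf("\n");
}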
MKLDNNMemoryDesc(getParentEdgeAt(STRIDE_ID)->getDims(), + MKLDNNExtensionUtils::IEPrecisionToDataType(stridePrecision), + mkldnn::memory::format_tag::x); + + config.outConfs[0].desc = itr->second->createDesc(dataPrecision, getChildEdgeAt(DATA_ID)->getDims().ToSizeVector()); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, MKLDNNMemoryDesc(config.outConfs.front().desc).getFormat()); + } +} + +void MKLDNNStridedSliceNode::createPrimitive() { + auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); + auto& srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); + if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) + THROW_ERROR << "has not allocated destination memory."; + if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) + THROW_ERROR << "has not allocated input memory."; + if (getSelectedPrimitiveDescriptor() == nullptr) + THROW_ERROR << "has unidentified preferable primitive descriptor."; + + auto srcBlockingDesc = getParentEdgeAt(DATA_ID)->getDesc().getBlockingDesc(); + auto dstBlockingDesc = getChildEdgeAt(0)->getDesc().getBlockingDesc(); + auto srcOrder = srcBlockingDesc.getOrder(); + params.srcDims = srcBlockingDesc.getBlockDims(); + params.dstDims = dstBlockingDesc.getBlockDims(); + params.dataSize = getSelectedPrimitiveDescriptor()->getConfig().inConfs[DATA_ID].desc.getPrecision().size(); + + if (params.parametersAreConstant) { + size_t realNDims = params.dstDims.size(); + if (!getParentEdgeAt(DATA_ID)->getMemory().GetDesc().isPlainFormat()) + orderParametersByLayouts(); + + SizeVector newSrcDims, newDstDims; + dimsNormalization(newSrcDims, newDstDims); + dimsGluing(realNDims, newSrcDims, newDstDims); + + if (params.dstDims.size() == 1 || params.nDimsForWork != 1) + indicesCalculation(); + } +} + +void MKLDNNStridedSliceNode::orderParametersByLayouts() { + const bool isPerChannelLayout = getParentEdgeAt(DATA_ID)->getMemory().GetDesc().isTailCFormat(); + const bool isBlockedLayout = getParentEdgeAt(DATA_ID)->getMemory().GetDesc().isBlockedCFormat(); + auto srcOrder = getParentEdgeAt(DATA_ID)->getDesc().getBlockingDesc().getOrder(); + + if (isBlockedLayout) { + const size_t blk = params.srcDims.back(); + begin[1] = begin[1] / blk; + end[1] = ceil(end[1] / static_cast(blk)); + begin.push_back(0); + end.push_back(0); + stride.push_back(1); + beginMask.push_back(0); + endMask.push_back(0); + ellipsisMask.push_back(0); + newAxisMask.push_back(0); + shrinkAxisMask.push_back(0); + } else if (isPerChannelLayout) { + auto sortByOrder = [&](std::vector& data) { + std::vector temp(srcOrder.size()); + for (size_t i = 0; i < srcOrder.size(); i++) + temp[i] = data[srcOrder[i]]; + data = temp; + }; + + sortByOrder(begin); + sortByOrder(end); + sortByOrder(stride); + sortByOrder(beginMask); + sortByOrder(endMask); + sortByOrder(ellipsisMask); + sortByOrder(newAxisMask); + sortByOrder(shrinkAxisMask); + } +} + +void MKLDNNStridedSliceNode::dimsNormalization(SizeVector& newSrcDims, SizeVector& newDstDims) { + // creating new src and dst dimensions and parameters of the same size using masks + // + // example 1: before srcDims = [5, 6, 8, 3, 2], begin = [1, 0], end = [4, 0], stride = [1, 1] + // beginMask = [0, 1], endMask = [0, 1], ellipsisMask = [1, 0], newAxisMas = [0, 0], shrinkAxisMask = [0, 0] + // after srcDims = [5, 6, 8, 3, 2], begin = [1, 0, 0, 0, 0], end = [4, 5, 7, 2, 1], stride = [1, 1, 1, 1, 1], dstDims = [4, 6, 8, 3, 2] + // + // example 2: before srcDims = [5, 6, 8, 3, 2], begin = [0, 3, 0, 0, 0], end = [0, 3, 0, 0, 0], stride = [1, 1, 1, 1, 1] + // beginMask = [1, 0, 1, 1, 
1], endMask = [1, 0, 1, 1, 1], ellipsisMask = [0, 0, 0, 0, 0], newAxisMask = [0, 0, 0, 0, 0], + // shrinkAxisMask = [0, 1, 0, 0, 0] + // after srcDims = [5, 6, 8, 3, 2], begin = [0, 3, 0, 0, 0], end = [4, 3, 7, 2, 1], stride = [1, 1, 1, 1, 1], dstDims = [5, 1, 8, 3, 2] + // + // example 3: before srcDims = [5, 8, 3, 2], begin = [0, 0, 0, 0], end = [0, 0, 0, 0], stride = [1, 1, 1, 1] + // beginMask = [1, 0, 1, 1, 1], endMask = [1, 0, 1, 1, 1], ellipsisMask = [0, 0, 0, 0, 0], newAxisMask = [0, 1, 0, 0, 0], + // shrinkAxisMask = [0, 0, 0, 0, 0] + // after srcDims = [5, 1, 8, 3, 2], begin = [0, 0, 0, 0, 0], end = [4, 0, 7, 2, 1], stride = [1, 1, 1, 1, 1], dstDims = [5, 1, 8, 3, 2] + + auto clipping = [](int& idx, const int min, const int max) { + idx = (idx > min) ? idx : min; + idx = (idx < max) ? idx : (max - 1); + }; + + auto correcting = [](int& dim, const size_t shift) { + dim = dim >= 0 ? dim : shift + dim; + }; + + std::vector beginTemp; + std::vector endTemp; + std::vector strideTemp; + size_t srcIdx = 0; + for (size_t axis = 0; axis < begin.size(); ++axis) { + if (ellipsisMask[axis] == 1) { + int nNewAxisAfterEllipses = 0; + int nSrcAxisBeforeEllipses = 0; + for (size_t i = 0; i < axis; ++i) { + if (newAxisMask[i] != 1) + nSrcAxisBeforeEllipses++; + } + for (size_t i = axis + 1; i < begin.size(); ++i) { + if (newAxisMask[i] == 1) + nNewAxisAfterEllipses++; + } + + size_t nSrcAxisAfterEllipses = (begin.size() - axis - nNewAxisAfterEllipses - 1); + size_t nHiddenDims = params.srcDims.size() - nSrcAxisAfterEllipses - nSrcAxisBeforeEllipses; + for (size_t i = 0; i < nHiddenDims; ++i) { + newSrcDims.push_back(params.srcDims[srcIdx]); + newDstDims.push_back(params.srcDims[srcIdx]); + beginTemp.push_back(0); + endTemp.push_back(params.srcDims[srcIdx] - 1); + strideTemp.push_back(1); + + srcIdx++; + } + } else { + if (newAxisMask[axis] == 1) { + beginTemp.push_back(0); + endTemp.push_back(0); + strideTemp.push_back(1); + newSrcDims.push_back(1); + newDstDims.push_back(1); + } else if (shrinkAxisMask[axis] == 1) { + int b = beginMask[axis] == 1 ? begin[axis] : 0; + correcting(b, params.srcDims[srcIdx]); + clipping(b, 0, params.srcDims[srcIdx]); + beginTemp.push_back(b); + endTemp.push_back(b); + strideTemp.push_back(1); + newSrcDims.push_back(params.srcDims[srcIdx]); + newDstDims.push_back(1); + + srcIdx++; + } else { + int b = beginMask[axis] == 1 ? begin[axis] : (stride[axis] > 0 ? 0 : -1); + correcting(b, params.srcDims[srcIdx]); + clipping(b, 0, params.srcDims[srcIdx]); + + int e = endMask[axis] == 1 ? (stride[axis] > 0 ? end[axis] - 1 : end[axis] + 1) : + (stride[axis] > 0 ? 
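Note: dimsNormalization() relies on the two small helpers defined above: correcting turns a negative (Python-style) index into an absolute position, and clipping bounds the result to the valid range [0, dim). A minimal sketch with sample values (the concrete numbers are just an illustration):

#include <cassert>

// Turn a possibly negative index into an absolute one (e.g. -1 on a size-5 axis is 4).
int correct(int idx, int dim) { return idx >= 0 ? idx : dim + idx; }

// Clamp an index into [0, dim).
int clip(int idx, int dim) { return idx < 0 ? 0 : (idx >= dim ? dim - 1 : idx); }

int main() {
    assert(correct(-1, 5) == 4);
    assert(clip(correct(-7, 5), 5) == 0);  // a far-out-of-range begin collapses to 0
    assert(clip(correct(9, 5), 5) == 4);   // an oversized end snaps to the last element
}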
-1 : 0); + correcting(e, params.srcDims[srcIdx]); + clipping(e, 0, params.srcDims[srcIdx]); + + beginTemp.push_back(b); + endTemp.push_back(e); + strideTemp.push_back(stride[axis]); + newSrcDims.push_back(params.srcDims[srcIdx]); + newDstDims.push_back(ceil(static_cast(abs(e - b) + 1) / static_cast(abs(strideTemp.back())))); + + srcIdx++; + } + } + } + + begin = beginTemp; + end = endTemp; + stride = strideTemp; + + params.dstDims = newDstDims; + params.srcDims = newSrcDims; + params.dstStrides.resize(newDstDims.size()); + params.srcStrides.resize(newSrcDims.size()); + params.dstStrides[params.dstStrides.size() - 1] = params.srcStrides[params.srcStrides.size() - 1] = 1; + for (int i = newDstDims.size() - 2; i >= 0; --i) { + params.dstStrides[i] = params.dstStrides[i + 1] * params.dstDims[i + 1]; + params.srcStrides[i] = params.srcStrides[i + 1] * params.srcDims[i + 1]; + } +} + +void MKLDNNStridedSliceNode::dimsGluing(const size_t realNDims, const SizeVector& newSrcDims, const SizeVector& newDstDims) { + // gluing of dimensions if there aren't begin, end and stride != 1 on this axis + // example: before gluing srcDims = [5, 6, 8, 3, 2], begin = [1, 0, 0, 0, 0], stride = [1, 1, 2, 1, 1], dstDims = [4, 6, 4, 3, 2] + // after gluing srcDims = [30, 8, 6], begin = [6, 0, 0], stride = [1, 2, 1], dstDims = [24, 4, 6] + + std::pair secondDim = { 0, begin.size() }; + SizeVector indexes(1, 0); + for (int idx = 0; idx < begin.size(); idx++) { + if (begin[idx] != 0 || end[idx] != params.srcDims[idx] - 1 || stride[idx] != 1) { + indexes.push_back(std::max(idx - 1, 0)); + indexes.push_back(stride[idx] == 1 ? idx : idx + 1); + + if (idx != 0 && secondDim.first == 0) + secondDim.first = idx; + else if (idx != 0 && secondDim.second == begin.size()) + secondDim.second = idx; + } + } + + if (indexes.back() < 2) { + indexes[indexes.size() - 1] = 1; + secondDim.first = 1; + } + + const size_t nGluingLastDims = params.dstStrides[std::max(static_cast(indexes.back() - 1), 0)]; + const bool vLastDim = indexes.back() < begin.size(); + indexes[indexes.size() - 1] = vLastDim ? 
indexes.back() : begin.size() - 1; + indexes.push_back(begin.size() - 1); + + for (int idx = indexes.size() - 1; idx >= 0; idx -= 2) { + if (indexes[idx - 1] < indexes[idx]) { + for (size_t jdx = indexes[idx]; jdx > indexes[idx - 1]; --jdx) { + params.dstDims[indexes[idx - 1]] *= params.dstDims[jdx]; + params.srcDims[indexes[idx - 1]] *= params.srcDims[jdx]; + params.dstStrides[indexes[idx - 1]] /= params.dstDims[jdx]; + params.srcStrides[indexes[idx - 1]] /= params.srcDims[jdx]; + + begin[indexes[idx - 1]] *= params.dstDims[jdx]; + } + const size_t beginShift = indexes[idx - 1] + 1; + const size_t endShift = indexes[idx] + 1; + + params.dstDims.erase(params.dstDims.begin() + beginShift, params.dstDims.begin() + endShift); + params.srcDims.erase(params.srcDims.begin() + beginShift, params.srcDims.begin() + endShift); + params.dstStrides.erase(params.dstStrides.begin() + beginShift, params.dstStrides.begin() + endShift); + params.srcStrides.erase(params.srcStrides.begin() + beginShift, params.srcStrides.begin() + endShift); + + begin.erase(begin.begin() + beginShift, begin.begin() + endShift); + stride.erase(stride.begin() + beginShift, stride.begin() + endShift); + } + } + + params.workAmount = params.dstDims[0] * params.dstStrides[0] / nGluingLastDims; + params.lastDstDim = nGluingLastDims * params.dataSize; + params.nDimsForWork = params.dstDims.size() - static_cast(vLastDim); + + if (params.nDimsForWork == 1 && realNDims > 2) { + const size_t realSrcDim = newSrcDims[secondDim.first]; + const size_t realDstDim = newDstDims[secondDim.first]; + + params.dstStrides.insert(params.dstStrides.begin() + 1, params.dstStrides[0] / realDstDim); + params.srcStrides.insert(params.srcStrides.begin() + 1, params.srcStrides[0] / realSrcDim); + + for (size_t idx = secondDim.first + 1; idx < secondDim.second; idx++) + begin[1] /= newDstDims[idx]; + + const size_t maxThreads = dnnl_get_max_threads(); + if (params.dstDims[0] < maxThreads) { + params.dstDims[1] /= realDstDim; + params.srcDims[1] /= realSrcDim; + params.dstDims.insert(params.dstDims.begin() + 1, realDstDim); + params.srcDims.insert(params.srcDims.begin() + 1, realSrcDim); + } + + if (params.dstDims.size() > 2) + params.lastDstDim /= newDstDims[secondDim.first]; + } +} + +void MKLDNNStridedSliceNode::indicesCalculation() { + // indices calculation before execution for the best performance + params.nThreads = dnnl_get_max_threads(); + params.srcIndices.resize(params.workAmount, 0); + params.dstIndices.resize(params.workAmount, 0); + + auto getSrcIdx = [this](const SizeVector& indexes){ + size_t srcIdx = 0; + for (int i = 0; i < params.nDimsForWork; ++i) + srcIdx += (begin[i] + indexes[i] * stride[i]) * params.srcStrides[i]; + return srcIdx * params.dataSize; + }; + + parallel_nt(params.nThreads, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + SizeVector coords(params.nDimsForWork, 0); + splitter(params.workAmount, nthr, ithr, start, end); + parallel_init(start, params.nDimsForWork, params.dstDims, coords); + + size_t srcIdx = getSrcIdx(coords); + for (size_t j = start; j < end; ++j) { + params.dstIndices[j] = j * params.lastDstDim; + params.srcIndices[j] = srcIdx; + + bool out = false; + for (int k = params.nDimsForWork - 1; k >= 0; k--) { + coords[k]++; + if (coords[k] < params.dstDims[k]) { + srcIdx += stride[k] * params.srcStrides[k] * params.dataSize; + break; + } else { + coords[k] = 0; + out = true; + } + } + + if (out) + srcIdx = getSrcIdx(coords); + } + }); +} + +void MKLDNNStridedSliceNode::execute(mkldnn::stream 
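Note: indicesCalculation() above trades memory for speed: it walks the (already glued) destination space once and stores a source and destination byte offset per work item, so execute() degenerates into a flat loop of cpu_memcpy calls. A simplified, single-threaded sketch of the same idea follows; SlicePlan and planSlice are illustrative names, and non-negative offsets are assumed for brevity.

#include <cstddef>
#include <vector>

struct SlicePlan {
    std::vector<size_t> srcOffsets, dstOffsets;  // byte offsets, one pair per copied row
    size_t rowBytes = 0;                         // contiguous bytes copied per row
};

// Precompute one (srcOffset, dstOffset) pair per output row of rowBytes bytes.
SlicePlan planSlice(const std::vector<size_t>& dstDims,
                    const std::vector<size_t>& srcStrides,  // in elements
                    const std::vector<int>& begin,
                    const std::vector<int>& stride,
                    size_t rowBytes, size_t elemSize) {
    SlicePlan plan;
    plan.rowBytes = rowBytes;
    std::vector<int> coord(dstDims.size(), 0);
    size_t total = 1;
    for (size_t d : dstDims) total *= d;
    for (size_t i = 0; i < total; ++i) {
        size_t src = 0;
        for (size_t k = 0; k < dstDims.size(); ++k)
            src += static_cast<size_t>(begin[k] + coord[k] * stride[k]) * srcStrides[k];
        plan.srcOffsets.push_back(src * elemSize);
        plan.dstOffsets.push_back(i * rowBytes);
        for (int k = static_cast<int>(dstDims.size()) - 1; k >= 0; --k) {  // odometer step
            if (++coord[k] < static_cast<int>(dstDims[k])) break;
            coord[k] = 0;
        }
    }
    return plan;
}
// Execution then reduces to: for each i, memcpy(dst + dstOffsets[i], src + srcOffsets[i], rowBytes).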
strm) { + if (!params.parametersAreConstant) { + auto srcDims = getParentEdgeAt(DATA_ID)->getDims(); + auto dstDims = getChildEdgeAt(0)->getDims(); + const size_t ellipsisMaskCounter = std::accumulate(ellipsisMask.begin(), ellipsisMask.end(), 0); + + auto fillingInParameters = [&](std::vector ¶meter, const size_t type, const size_t size, const int value) { + const int *ptr = reinterpret_cast(this->getParentEdgeAt(type)->getMemoryPtr()->GetPtr()); + parameter.assign(ptr, ptr + size); + + if (ellipsisMaskCounter == 0 && size < dstDims.ndims()) { + for (size_t i = size; i < dstDims.ndims(); i++) parameter.push_back(value); + } + }; + + if (beginDims.size()) + fillingInParameters(begin, BEGIN_ID, beginDims[0], 0); + if (endDims.size()) + fillingInParameters(end, END_ID, endDims[0], 0); + if (strideDims.size()) + fillingInParameters(stride, STRIDE_ID, strideDims[0], 1); + + if (srcDims.ndims() > 3 && params.equalDims && ellipsisMaskCounter != 0) + addHiddenDims(srcDims.ndims()); + + if (!getParentEdgeAt(DATA_ID)->getMemory().GetDesc().isPlainFormat()) + orderParametersByLayouts(); + + SizeVector newSrcDims, newDstDims; + dimsNormalization(newSrcDims, newDstDims); + dimsGluing(dstDims.ndims(), newSrcDims, newDstDims); + + if (params.dstDims.size() == 1 || params.nDimsForWork != 1) + indicesCalculation(); + } + + if (params.dstDims.size() > 1 && params.nDimsForWork == 1) + stridedSliceV(); + else + stridedSlice(); +} + +void MKLDNNStridedSliceNode::stridedSliceV() { + const uint8_t* srcData = reinterpret_cast(this->getParentEdgeAt(DATA_ID)->getMemoryPtr()->GetPtr()) + + (begin[0] * params.srcStrides[0] + begin[1] * params.srcStrides[1]) * params.dataSize; + uint8_t* dstData = reinterpret_cast(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr()); + + const size_t dstIdx = params.dstStrides[0] * params.dataSize; + const size_t srcIdx = stride[0] * params.srcStrides[0] * params.dataSize; + const size_t dstShift = params.dstStrides[1] * params.dataSize; + const size_t srcShift = stride[1] * params.srcStrides[1] * params.dataSize; + + if (params.dstDims.size() > 2) { + parallel_for2d(params.dstDims[0], params.dstDims[1], [&](const size_t i, const size_t j) { + cpu_memcpy(&dstData[i * dstIdx + j * dstShift], &srcData[i * srcIdx + j * srcShift], params.lastDstDim); + }); + } else { + parallel_for(params.dstDims[0], [&](const size_t i) { + cpu_memcpy(&dstData[i * dstIdx], &srcData[i * srcIdx], params.lastDstDim); + }); + } +} + +void MKLDNNStridedSliceNode::stridedSlice() { + const uint8_t* srcData = reinterpret_cast(this->getParentEdgeAt(DATA_ID)->getMemoryPtr()->GetPtr()) + + (stride.back() == 1 && stride.size() > 1 ? 
begin[params.nDimsForWork] * params.srcStrides[params.nDimsForWork] * params.dataSize : 0); + uint8_t* dstData = reinterpret_cast(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr()); + + parallel_nt(params.nThreads, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(params.workAmount, nthr, ithr, start, end); + + for (size_t iwork = start; iwork < end; ++iwork) + cpu_memcpy(&dstData[params.dstIndices[iwork]], &srcData[params.srcIndices[iwork]], params.lastDstDim); + }); +} + +bool MKLDNNStridedSliceNode::created() const { + return getType() == StridedSlice; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNStridedSliceNode, StridedSlice); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.h new file mode 100644 index 00000000000000..577757791ffa1b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.h @@ -0,0 +1,76 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNStridedSliceNode : public MKLDNNNode { +public: + MKLDNNStridedSliceNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + ~MKLDNNStridedSliceNode() override = default; + + void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + void execute(mkldnn::stream strm) override; + bool created() const override; + bool canBeInPlace() const override { + return false; + } + +private: + void stridedSliceV(); + void stridedSlice(); + + void addHiddenDims(const size_t nSrcDims); + void orderParametersByLayouts(); + void dimsNormalization(InferenceEngine::SizeVector& newSrcDims, InferenceEngine::SizeVector& newDstDims); + void dimsGluing(const size_t realNDims, const InferenceEngine::SizeVector& newSrcDims, const InferenceEngine::SizeVector& newDstDims); + void indicesCalculation(); + + const size_t DATA_ID = 0; + const size_t BEGIN_ID = 1; + const size_t END_ID = 2; + const size_t STRIDE_ID = 3; + + std::vector begin; + std::vector end; + std::vector stride; + + std::vector beginMask; + std::vector endMask; + std::vector ellipsisMask; + std::vector newAxisMask; + std::vector shrinkAxisMask; + + InferenceEngine::SizeVector beginDims; + InferenceEngine::SizeVector endDims; + InferenceEngine::SizeVector strideDims; + + struct { + InferenceEngine::SizeVector srcDims; + InferenceEngine::SizeVector dstDims; + InferenceEngine::SizeVector srcStrides; + InferenceEngine::SizeVector dstStrides; + InferenceEngine::SizeVector srcIndices; + InferenceEngine::SizeVector dstIndices; + int ellipsisPos1 = -1; + int ellipsisPos2 = 0; + size_t nThreads = 0; + size_t nDimsForWork = 0; + size_t workAmount = 0; + size_t lastDstDim = 0; + size_t dataSize = 0; + bool equalDims = false; + bool parametersAreConstant = true; + } params; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp b/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp deleted file mode 100644 index a99a8454430dd5..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/strided_slice.cpp +++ /dev/null @@ -1,429 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include 
"common/cpu_memcpy.h" - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -inline void clipping(int *idx, const int min, const int max) { - (*idx) = ((*idx) > min) ? (*idx) : min; - (*idx) = ((*idx) < max) ? (*idx) : (max - 1); - return; -} - -class StridedSliceImpl: public ExtLayerBase { -public: - explicit StridedSliceImpl(const CNNLayer* layer) { - try { - if (layer->insData.size() > 4 || layer->outData.size() != 1) - IE_THROW() << layer->name << " Incorrect number of input/output edges!"; - - src_dims = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getDims(); - - bounds_size = 0; - begin_dims = {}; - if (layer->insData.size() > 1) { - begin_dims = layer->insData[STRIDEDSLICE_BEGIN].lock()->getTensorDesc().getDims(); - if (begin_dims.size() > 1) - IE_THROW() << layer->name << " Begin vector should be 1 dimension"; - bounds_size = begin_dims[0]; - } - - if (layer->insData.size() > 2) { - end_dims = layer->insData[STRIDEDSLICE_END].lock()->getTensorDesc().getDims(); - if (end_dims.size() > 1) - IE_THROW() << layer->name << " End vector should be 1 dimension"; - if (begin_dims[0] != end_dims[0]) - IE_THROW() << layer->name << " Begin vector size should be equal end vectror size"; - } - - if (layer->insData.size() > 3) { - stride_dims = layer->insData[STRIDEDSLICE_STRIDE].lock()->getTensorDesc().getDims(); - if (stride_dims.size() > 1) - IE_THROW() << layer->name << " End vector should be 1 dimension"; - if (begin_dims[0] != stride_dims[0]) - IE_THROW() << layer->name << " Stride vector size should be equal begin vectror size"; - } - dst_dims = layer->outData[0]->getTensorDesc().getDims(); - - std::string::size_type i; - std::string begin_mask_str = layer->GetParamAsString("begin_mask", ""); - for (i = 0; i < begin_mask_str.size(); ++i) { - if (begin_mask_str[i] == '1') begin_mask.push_back(1); - else if (begin_mask_str[i] == '0') begin_mask.push_back(0); - } - for (; i < src_dims.size(); ++i) begin_mask.push_back(1); - - std::string end_mask_str = layer->GetParamAsString("end_mask", ""); - for (i = 0; i < end_mask_str.size(); ++i) { - if (end_mask_str[i] == '1') end_mask.push_back(1); - else if (end_mask_str[i] == '0') end_mask.push_back(0); - } - for (; i < src_dims.size(); ++i) end_mask.push_back(1); - - std::string ellipsis_mask_str = layer->GetParamAsString("ellipsis_mask", ""); - size_t ellipsis_mask_counter = 0; - for (i = 0; i < ellipsis_mask_str.size(); ++i) { - if (ellipsis_mask_str[i] == '1') { - ellipsis_mask_counter++; - ellipsis_mask.push_back(1); - } else if (ellipsis_mask_str[i] == '0') { - ellipsis_mask.push_back(0); - } - } - if (ellipsis_mask_counter > 1) - IE_THROW() << layer->name << " 'Ellipsis_mask' must be a power of two (only one ellipsis)!"; - for (; i < src_dims.size(); ++i) ellipsis_mask.push_back(0); - - std::string new_axis_mask_str = layer->GetParamAsString("new_axis_mask", ""); - for (i = 0; i < new_axis_mask_str.size(); ++i) { - if (new_axis_mask_str[i] == '1') new_axis_mask.push_back(1); - else if (new_axis_mask_str[i] == '0') new_axis_mask.push_back(0); - } - for (; i < src_dims.size(); ++i) new_axis_mask.push_back(0); - - std::string shrink_axis_mask_str = layer->GetParamAsString("shrink_axis_mask", ""); - for (i = 0; i < shrink_axis_mask_str.size(); ++i) { - if (shrink_axis_mask_str[i] == '1') shrink_axis_mask.push_back(1); - else if (shrink_axis_mask_str[i] == '0') shrink_axis_mask.push_back(0); - } - for (; i < src_dims.size(); ++i) shrink_axis_mask.push_back(0); - - - int new_axis = 0; - for (auto& na : new_axis_mask) 
- new_axis += na; - - shrink_axis = 0; - for (auto& sa : shrink_axis_mask) - shrink_axis += sa; - max_dims = src_dims.size() + new_axis; - - // ellipsis_mask must be a power of two (only one ellipsis), so to take a first position - ellipsis_pos1 = ellipsis_pos2 = max_dims; - for (i = 0; i < ellipsis_mask.size(); i++) { - if (ellipsis_mask[i] > 0) { - ellipsis_pos1 = i; - break; - } - } - bounds_size -= ellipsis_pos1; - if (bounds_size > 0 && (max_dims - bounds_size) > ellipsis_pos1) - ellipsis_pos2 = max_dims - bounds_size; - - begin_dms.assign(max_dims, 0); - end_dms.assign(max_dims, -1); - stride_dms.assign(max_dims, 1); - - srcStrides = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides(); - dstStrides = layer->outData[0]->getTensorDesc().getBlockingDesc().getStrides(); - Precision dataPrecision = layer->insData[STRIDEDSLICE_DATA].lock()->getTensorDesc().getPrecision(); - if (layer->insData.size() == 1) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision) }, { DataConfigurator(ConfLayout::PLN, dataPrecision) }); - } else if (layer->insData.size() == 2) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32) }, - { DataConfigurator(ConfLayout::PLN, dataPrecision) }); - } else if (layer->insData.size() == 3) { - addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32), - DataConfigurator(ConfLayout::PLN, Precision::I32) }, { DataConfigurator(ConfLayout::PLN, dataPrecision) }); - } else { - addConfig(layer, { DataConfigurator(ConfLayout::PLN, dataPrecision), DataConfigurator(ConfLayout::PLN, Precision::I32), - DataConfigurator(ConfLayout::PLN, Precision::I32), DataConfigurator(ConfLayout::PLN, Precision::I32) }, - { DataConfigurator(ConfLayout::PLN, dataPrecision) }); - } - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - int *begin = nullptr, *end = nullptr, *stride = nullptr; - if (begin_dims.size()) - begin = inputs[STRIDEDSLICE_BEGIN]->cbuffer().as() + inputs[STRIDEDSLICE_BEGIN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - if (end_dims.size()) - end = inputs[STRIDEDSLICE_END]->cbuffer().as() + inputs[STRIDEDSLICE_END]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - if (stride_dims.size()) - stride = inputs[STRIDEDSLICE_STRIDE]->cbuffer().as() + inputs[STRIDEDSLICE_STRIDE]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - InferenceEngine::SizeVector src_dims = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getDims(); - InferenceEngine::SizeVector srcStrides = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getBlockingDesc().getStrides(); - InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims(); - InferenceEngine::SizeVector dstStrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); - - size_t i, j, k, bj, ej, sj; - InferenceEngine::SizeVector our_dims; - InferenceEngine::SizeVector out_dims; - for (i = 0, j = 0, k = 0, bj = 0, ej = 0, sj = 0; static_cast(i) < max_dims; i++) { - if (static_cast(i) >= ellipsis_pos1 && - static_cast(i) < ellipsis_pos2) { - if (new_axis_mask.size() > i && new_axis_mask[i] == 1) - end_dms[i] = 0; - else - end_dms[i] = end_dms[i] >= 0 ? 
end_dms[i] : src_dims[j++] + end_dms[i]; - - out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast(abs(stride_dms[i]))))); - our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / static_cast(abs(stride_dms[i]))))); - k = ellipsis_pos1; - } else { - stride_dms[i] = (stride != nullptr && stride_dims[0] > sj && stride[sj] != 0) ? stride[sj++] : 1; - - if (begin_mask.size() > j && begin_mask[j] == 0) - begin_dms[i] = stride_dms[i] > 0 ? 0 : -1; - else - begin_dms[i] = (begin != nullptr && begin_dims[0] > bj) ? begin[bj] : (stride_dms[i] > 0 ? 0 : -1); - bj++; - begin_dms[i] = begin_dms[i] >= 0 ? begin_dms[i] : src_dims[j] + begin_dms[i]; - // Clipping 'begin' - clipping(&begin_dms[i], 0, src_dims[j]); - - if (end_mask.size() > j && end_mask[j] == 0) { - end_dms[i] = stride_dms[i] > 0 ? -1 : 0; - } else { - int end_dms_tmp = (end != nullptr && end_dims[0] > ej) ? (stride_dms[i] > 0 ? end[ej] - 1 : end[ej] + 1) - : end_dms[i]; - end_dms[i] = (end != nullptr && end_dims[0] > ej) ? end_dms_tmp : (stride_dms[i] > 0 ? -1 : 0); - } - ej++; - end_dms[i] = end_dms[i] >= 0 ? end_dms[i] : src_dims[j] + end_dms[i]; - // Clipping 'end' - clipping(&end_dms[i], 0, src_dims[j]); - - if (new_axis_mask.size() > i && new_axis_mask[i] == 1) - end_dms[i] = 0; - else - j++; - - if (shrink_axis_mask.size() > k && shrink_axis_mask[k] == 1) - end_dms[i] = begin_dms[i]; - else - out_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / - static_cast(abs(stride_dms[i]))))); - - our_dims.push_back(static_cast(ceil(static_cast(abs(end_dms[i] - begin_dms[i]) + 1) / - static_cast(abs(stride_dms[i]))))); - k++; - } - } - - for (i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) { - if (out_dims[i] != dst_dims[i]) - return PARAMETER_MISMATCH; - } - - const size_t inputsPrecSize = inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().size(); - if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0 && - stride_dms[stride_dms.size()-1] == 1 && stride_dms.size() > 1) { - if (inputsPrecSize != outputs[0]->getTensorDesc().getPrecision().size()) { - if (resp) { - std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: " - + std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name()); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - strided_slice_vp(inputs[STRIDEDSLICE_DATA], outputs[0]); - } else if (static_cast(src_dims.size()) == max_dims && shrink_axis == 0) { - switch (inputsPrecSize) { - case 1: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } - case 2: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } - case 4: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } - case 8: { strided_slice_p(inputs[STRIDEDSLICE_DATA], outputs[0]); break; } - default: { - if (resp) { - std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: " - + std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name()); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - } - } else { - switch (inputsPrecSize) { - case 1: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } - case 2: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } - case 4: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], our_dims); break; } - case 8: { strided_slice(inputs[STRIDEDSLICE_DATA], outputs[0], 
our_dims); break; } - default: { - if (resp) { - std::string errorMsg = "StridedSlice layer doesn't support 'Data' input precision: " - + std::string(inputs[STRIDEDSLICE_DATA]->getTensorDesc().getPrecision().name()); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - } - } - - return OK; - } - -private: - const size_t STRIDEDSLICE_DATA = 0; - const size_t STRIDEDSLICE_BEGIN = 1; - const size_t STRIDEDSLICE_END = 2; - const size_t STRIDEDSLICE_STRIDE = 3; - - template - void strided_slice(Blob::Ptr&, Blob::Ptr& dst_data, std::vector &dims); - void strided_slice_vp(Blob::Ptr&, Blob::Ptr& dst_data); - template - void strided_slice_p(Blob::Ptr&, Blob::Ptr& dst_data); - - SizeVector begin_dims; - SizeVector end_dims; - SizeVector stride_dims; - - SizeVector begin_mask; - SizeVector end_mask; - SizeVector ellipsis_mask; - SizeVector new_axis_mask; - SizeVector shrink_axis_mask; - int shrink_axis; - - SizeVector src_dims; - SizeVector dst_dims; - std::vector begin_dms; - std::vector end_dms; - std::vector stride_dms; - SizeVector srcStrides; - SizeVector dstStrides; - int bounds_size; - int max_dims; - int ellipsis_pos1, ellipsis_pos2; -}; - -template -void StridedSliceImpl::strided_slice(Blob::Ptr& input, Blob::Ptr& output, std::vector &dims) { - auto* src_data = input->cbuffer().as() + input->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto* dst_data = output->buffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto dst_size = output->byteSize(); - memset(dst_data, 0, dst_size); - - size_t work_amount_dst = dstStrides[0] * dst_dims[0]; - parallel_nt(0, [&](const int ithr, const int nthr) { - int j; - size_t i, start = 0, end = 0; - SizeVector counters(max_dims, 0); - splitter(work_amount_dst, nthr, ithr, start, end); - for (j = max_dims - 1, i = start; j >= 0; j--) { - counters[j] = i % dims[j]; - i /= dims[j]; - } - for (size_t iwork = start; iwork < end; ++iwork) { - int src_idx = 0; - for (i = 0, j = 0; static_cast(i) < max_dims; ++i) { - if (!(new_axis_mask.size() > i && new_axis_mask[i] == 1)) - src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[j++]; - } - - dst_data[iwork] = src_data[src_idx]; - - for (j = max_dims - 1; j >= 0; j--) { - counters[j]++; - if (counters[j] < dims[j]) - break; - else - counters[j] = 0; - } - } - }); -} - -void StridedSliceImpl::strided_slice_vp(Blob::Ptr& input, Blob::Ptr& output) { - size_t dataSize = input->getTensorDesc().getPrecision().size(); - const uint8_t* src_data = input->cbuffer().as() + input->getTensorDesc().getBlockingDesc().getOffsetPadding() * dataSize; - uint8_t* dst_data = output->buffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding() * dataSize; - auto dst_size = output->byteSize(); - memset(dst_data, 0, dst_size); - - // Vectorized copy - size_t dims_size_1 = dst_dims.size() - 1; - size_t len = dst_dims[dims_size_1] * dataSize; - size_t work_amount_dst = dstStrides[0] * dst_dims[0] / dst_dims[dims_size_1]; - - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - SizeVector counters(dims_size_1, 0); - splitter(work_amount_dst, nthr, ithr, start, end); - size_t src_idx = begin_dms[dims_size_1]; - for (int j = dims_size_1 - 1, i = start; j >= 0; j--) { - counters[j] = i % dst_dims[j]; - src_idx += (begin_dms[j] + counters[j] * stride_dms[j]) * srcStrides[j]; - i /= dst_dims[j]; - } - - for (size_t iwork = start, dst_idx = start * len, i = 1; iwork < end; ++iwork, dst_idx += len) { - 
cpu_memcpy(&dst_data[dst_idx], &src_data[src_idx * dataSize], len); - for (int j = dims_size_1 - 1; j >= 0; j--) { - counters[j]++; - if (counters[j] < dst_dims[j]) { - src_idx += stride_dms[j] * srcStrides[j]; - break; - } else { - counters[j] = i = 0; - } - } - if (!i) { - for (src_idx = begin_dms[dims_size_1]; i < dims_size_1; ++i) - src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[i]; - } - } - }); -} - -template -void StridedSliceImpl::strided_slice_p(Blob::Ptr& input, Blob::Ptr& output) { - auto* src_data = input->cbuffer().as() + input->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto* dst_data = output->buffer().as() + output->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto dst_size = output->byteSize(); - memset(dst_data, 0, dst_size); - - size_t dims_size = dst_dims.size(); - size_t work_amount_dst = dstStrides[0] * dst_dims[0]; - - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - SizeVector counters(dims_size, 0); - splitter(work_amount_dst, nthr, ithr, start, end); - int src_idx = 0; - for (int j = dims_size - 1, i = start; j >= 0; j--) { - counters[j] = i % dst_dims[j]; - src_idx += (begin_dms[j] + counters[j] * stride_dms[j]) * srcStrides[j]; - i /= dst_dims[j]; - } - - for (size_t iwork = start, dst_idx = start, i = 1; iwork < end; ++iwork, dst_idx++) { - dst_data[dst_idx] = src_data[src_idx]; - for (int j = dims_size - 1; j >= 0; j--) { - counters[j]++; - if (counters[j] < dst_dims[j]) { - src_idx += stride_dms[j] * srcStrides[j]; - break; - } else { - counters[j] = i = 0; - } - } - if (!i) { - for (src_idx = 0; i < dims_size; ++i) - src_idx += (begin_dms[i] + counters[i] * stride_dms[i]) * srcStrides[i]; - } - } - }); -} - -REG_FACTORY_FOR(StridedSliceImpl, StridedSlice); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/plugin_api/cpp_interfaces/base/ie_executable_network_base.hpp b/inference-engine/src/plugin_api/cpp_interfaces/base/ie_executable_network_base.hpp index 458f673a42ba33..1f6c9aaa8bb873 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/base/ie_executable_network_base.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/base/ie_executable_network_base.hpp @@ -9,6 +9,7 @@ #pragma once +#include #include #include #include @@ -103,23 +104,11 @@ class ExecutableNetworkBase : public IExecutableNetwork { StatusCode GetContext(RemoteContext::Ptr& pContext, ResponseDesc* resp) const noexcept override { TO_STATUS(pContext = _impl->GetContext()); } + + std::shared_ptr GetImpl() const { + return _impl; + } }; IE_SUPPRESS_DEPRECATED_END_WIN -/** - * @brief Create an execuable network public C++ object wrapper based on internal inplementation - * @ingroup ie_dev_api_exec_network_api - * @param impl An internal implementation for executable network - * @tparam T A type of internal implementation - * @return C++ wrapper for executable network - */ -template -inline typename InferenceEngine::ExecutableNetwork make_executable_network(std::shared_ptr impl) { - // to suppress warning about deprecated QueryState - IE_SUPPRESS_DEPRECATED_START - typename ExecutableNetworkBase::Ptr net(new ExecutableNetworkBase(impl)); - IE_SUPPRESS_DEPRECATED_END - return InferenceEngine::ExecutableNetwork(net); -} - } // namespace InferenceEngine diff --git a/inference-engine/src/plugin_api/cpp_interfaces/exception2status.hpp b/inference-engine/src/plugin_api/cpp_interfaces/exception2status.hpp index b43d9c1f54b97f..04e0c8132d1a0a 100644 
--- a/inference-engine/src/plugin_api/cpp_interfaces/exception2status.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/exception2status.hpp @@ -66,4 +66,21 @@ INFERENCE_ENGINE_API_CPP(StatusCode) ExceptionToStatus(const Exception& exceptio return InferenceEngine::DescriptionBuffer(UNEXPECTED); \ } +#define CATCH_IE_EXCEPTION(ExceptionType) catch (const InferenceEngine::ExceptionType& e) {throw e;} + +#define CATCH_IE_EXCEPTIONS \ + CATCH_IE_EXCEPTION(GeneralError) \ + CATCH_IE_EXCEPTION(NotImplemented) \ + CATCH_IE_EXCEPTION(NetworkNotLoaded) \ + CATCH_IE_EXCEPTION(ParameterMismatch) \ + CATCH_IE_EXCEPTION(NotFound) \ + CATCH_IE_EXCEPTION(OutOfBounds) \ + CATCH_IE_EXCEPTION(Unexpected) \ + CATCH_IE_EXCEPTION(RequestBusy) \ + CATCH_IE_EXCEPTION(ResultNotReady) \ + CATCH_IE_EXCEPTION(NotAllocated) \ + CATCH_IE_EXCEPTION(InferNotStarted) \ + CATCH_IE_EXCEPTION(NetworkNotRead) \ + CATCH_IE_EXCEPTION(InferCancelled) + } // namespace InferenceEngine diff --git a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp index bf7d8163aa5d67..08020629779222 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp @@ -22,8 +22,7 @@ namespace InferenceEngine { * @brief This class describes an executable network thread safe asynchronous only implementation. * @ingroup ie_dev_api_exec_network_api */ -class ExecutableNetworkThreadSafeAsyncOnly : public ExecutableNetworkInternal, - public std::enable_shared_from_this { +class ExecutableNetworkThreadSafeAsyncOnly : public ExecutableNetworkInternal { public: /** * @brief A shared pointer to a ExecutableNetworkThreadSafeAsyncOnly object diff --git a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp index 8a668c9f1a8b07..28a5e0fbf9b3a9 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp @@ -22,8 +22,7 @@ namespace InferenceEngine { * The class is recommended to be used as a base class for Executable Network impleentation during plugin development. 
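The CATCH_IE_EXCEPTIONS macro added above expands to a chain of catch clauses that rethrow typed Inference Engine exceptions unchanged. The intended placement is an assumption on my part, but a natural use is in front of a generic handler, so typed exceptions keep their type while anything else is converted:

    // Hedged usage sketch; doWork() and the enclosing wrapper are hypothetical.
    try {
        doWork();
    } CATCH_IE_EXCEPTIONS catch (const std::exception& ex) {
        IE_THROW() << ex.what();   // only non-IE exceptions fall through to here
    }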
* @ingroup ie_dev_api_exec_network_api */ -class ExecutableNetworkThreadSafeDefault : public ExecutableNetworkInternal, - public std::enable_shared_from_this { +class ExecutableNetworkThreadSafeDefault : public ExecutableNetworkInternal { public: /** * @brief A shared pointer to a ExecutableNetworkThreadSafeDefault object diff --git a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp index 5ca846b10e34ec..6ac53add48a5d8 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_request_internal.hpp @@ -22,7 +22,7 @@ namespace InferenceEngine { -class ExecutableNetworkInternal; +class IExecutableNetworkInternal; /** * @brief An optimal implementation of IInferRequestInternal interface to avoid duplication in all plugins @@ -223,7 +223,7 @@ class InferRequestInternal : virtual public IInferRequestInternal { * @note Needed to correctly handle ownership between objects. * @param[in] exeNetwork The executable network */ - void setPointerToExecutableNetworkInternal(std::shared_ptr exeNetwork) { + void setPointerToExecutableNetworkInternal(std::shared_ptr exeNetwork) { _exeNetwork = exeNetwork; } @@ -258,7 +258,7 @@ class InferRequestInternal : virtual public IInferRequestInternal { * @brief A shared pointer to ExecutableNetworkInternal interface * @note Needed to correctly handle ownership between objects. */ - std::shared_ptr _exeNetwork; + std::shared_ptr _exeNetwork; /** * @brief Checks and executes input data pre-processing if needed. * @param inputs Inputs blobs to perform preprocessing on diff --git a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_plugin_internal.hpp b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_plugin_internal.hpp index e573fcd5dc6f50..b1ac89bb61d787 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_plugin_internal.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/impl/ie_plugin_internal.hpp @@ -48,13 +48,13 @@ static inline void parsePluginName(std::istream& networkModel) { */ class InferencePluginInternal : public IInferencePlugin { public: - ExecutableNetwork LoadNetwork(const CNNNetwork& network, + IExecutableNetworkInternal::Ptr LoadNetwork(const CNNNetwork& network, const std::map& config) override { return LoadNetwork(network, config, nullptr); } - ExecutableNetwork LoadNetwork(const CNNNetwork& network, const std::map& config, - RemoteContext::Ptr context) override { + IExecutableNetworkInternal::Ptr LoadNetwork(const CNNNetwork& network, const std::map& config, + RemoteContext::Ptr context) override { InputsDataMap networkInputs = network.getInputsInfo(), networkInputsCloned; OutputsDataMap networkOutputs = network.getOutputsInfo(), networkOutputsCloned; copyInputOutputInfo(networkInputs, networkOutputs, networkInputsCloned, networkOutputsCloned); @@ -70,26 +70,25 @@ class InferencePluginInternal : public IInferencePlugin { impl->setNetworkOutputs(networkOutputsCloned); impl->SetPointerToPlugin(shared_from_this()); - auto executableNetwork = make_executable_network(impl); - return ExecutableNetwork(executableNetwork); + return impl; } - ExecutableNetwork ImportNetwork(const std::string& modelFileName, - const std::map& config) override { + IExecutableNetworkInternal::Ptr ImportNetwork(const std::string& modelFileName, + const std::map& config) override { (void)modelFileName; (void)config; 
IE_THROW(NotImplemented); } - ExecutableNetwork ImportNetwork(std::istream& networkModel, - const std::map& config) override { + IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& networkModel, + const std::map& config) override { parsePluginName(networkModel); return ImportNetworkImpl(networkModel, config); } - ExecutableNetwork ImportNetwork(std::istream& networkModel, - const RemoteContext::Ptr& context, - const std::map& config) override { + IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& networkModel, + const RemoteContext::Ptr& context, + const std::map& config) override { parsePluginName(networkModel); return ImportNetworkImpl(networkModel, context, config); } @@ -184,8 +183,8 @@ class InferencePluginInternal : public IInferencePlugin { * @param config A string -> string map of parameters * @return An Executable network */ - virtual ExecutableNetwork ImportNetworkImpl(std::istream& networkModel, - const std::map& config) { + virtual ExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& networkModel, + const std::map& config) { (void)networkModel; (void)config; IE_THROW(NotImplemented); @@ -199,9 +198,9 @@ class InferencePluginInternal : public IInferencePlugin { * @param config A string -> string map of parameters * @return An Executable network */ - virtual ExecutableNetwork ImportNetworkImpl(std::istream& networkModel, - const RemoteContext::Ptr& context, - const std::map& config) { + virtual ExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& networkModel, + const RemoteContext::Ptr& context, + const std::map& config) { (void)networkModel; (void)context; (void)config; diff --git a/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp b/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp index 5992fda1e97f10..6868277331f22b 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp @@ -17,10 +17,9 @@ namespace InferenceEngine { /** * @interface IExecutableNetworkInternal * @brief An internal API of executable network to be implemented by plugin, - * which is used in ExecutableNetworkBase forwarding mechanism. 
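Several hunks in this patch move shared ownership onto IExecutableNetworkInternal: the interface now derives from std::enable_shared_from_this, and InferRequestInternal stores a std::shared_ptr<IExecutableNetworkInternal>. A minimal sketch of the pattern this enables (MyExecutableNetwork and InitRequest are hypothetical; the method name on the request is taken from this patch):

    // Inside a class derived from IExecutableNetworkInternal, the network can now
    // hand a shared pointer to itself to every request it creates, keeping itself
    // alive for as long as any request exists.
    void MyExecutableNetwork::InitRequest(const std::shared_ptr<InferRequestInternal>& request) {
        request->setPointerToExecutableNetworkInternal(shared_from_this());
    }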
* @ingroup ie_dev_api_exec_network_api */ -class IExecutableNetworkInternal { +class IExecutableNetworkInternal : public std::enable_shared_from_this { public: /** * @brief A shared pointer to IExecutableNetworkInternal interface diff --git a/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iplugin_internal.hpp b/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iplugin_internal.hpp index 8d251b50f44193..1eecf462c636be 100644 --- a/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iplugin_internal.hpp +++ b/inference-engine/src/plugin_api/cpp_interfaces/interface/ie_iplugin_internal.hpp @@ -26,6 +26,8 @@ namespace InferenceEngine { +class IExecutableNetworkInternal; + /** * @brief Copies preprocess info * @@ -152,8 +154,8 @@ class IInferencePlugin : public std::enable_shared_from_this { * @param config A string-string map of config parameters relevant only for this load operation * @return Created Executable Network object */ - virtual ExecutableNetwork LoadNetwork(const CNNNetwork& network, - const std::map& config) = 0; + virtual std::shared_ptr LoadNetwork(const CNNNetwork& network, + const std::map& config) = 0; /** * @brief Creates an executable network from network object, on specified remote context @@ -163,9 +165,9 @@ class IInferencePlugin : public std::enable_shared_from_this { * execute the network * @return Created Executable Network object */ - virtual ExecutableNetwork LoadNetwork(const CNNNetwork& network, - const std::map& config, - RemoteContext::Ptr context) = 0; + virtual std::shared_ptr LoadNetwork(const CNNNetwork& network, + const std::map& config, + RemoteContext::Ptr context) = 0; /** * @brief Registers extension within plugin * @param extension - pointer to already loaded extension @@ -215,8 +217,8 @@ class IInferencePlugin : public std::enable_shared_from_this { * @param config A string -> string map of parameters * @return An Executable network */ - virtual ExecutableNetwork ImportNetwork(const std::string& modelFileName, - const std::map& config) = 0; + virtual std::shared_ptr ImportNetwork(const std::string& modelFileName, + const std::map& config) = 0; /** * @brief Creates an executable network from an previously exported network using plugin implementation @@ -225,8 +227,8 @@ class IInferencePlugin : public std::enable_shared_from_this { * @param config A string -> string map of parameters * @return An Executable network */ - virtual ExecutableNetwork ImportNetwork(std::istream& networkModel, - const std::map& config) = 0; + virtual std::shared_ptr ImportNetwork(std::istream& networkModel, + const std::map& config) = 0; /** * @brief Creates an executable network from an previously exported network using plugin implementation @@ -237,9 +239,9 @@ class IInferencePlugin : public std::enable_shared_from_this { * @param config A string -> string map of parameters * @return An Executable network */ - virtual ExecutableNetwork ImportNetwork(std::istream& networkModel, - const RemoteContext::Ptr& context, - const std::map& config) = 0; + virtual std::shared_ptr ImportNetwork(std::istream& networkModel, + const RemoteContext::Ptr& context, + const std::map& config) = 0; /** * @brief Sets pointer to ICore interface diff --git a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp index 8992c24fd10d66..493ed365e45617 100644 --- a/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp +++ 
b/inference-engine/src/preprocessing/arm_neon/ie_preprocess_gapi_kernels_neon.cpp @@ -127,6 +127,19 @@ void copyRow_32F(const float in[], float out[], int length) { copyRow_32F_impl(in, out, length); } +// Resize (bi-linear, 32F) +void calcRowLinear_32F(float* dst[], + const float* src0[], + const float* src1[], + const float alpha[], + const int mapsx[], + const float beta[], + const Size& inSz, + const Size& outSz, + const int lpi) { + calcRowLinear_32FC1(dst, src0, src1, alpha, mapsx, beta, inSz, outSz, lpi); +} + template CV_ALWAYS_INLINE void channels2planes_store(std::array, chanNum>& dst, const uchar* src, const int width, diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi.cpp index cbaa42b6fbc5ba..f38de9d0ead0ba 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi.cpp @@ -81,7 +81,8 @@ inline int get_cv_depth(const TensorDesc &ie_desc) { case Precision::U8: return CV_8U; case Precision::FP32: return CV_32F; case Precision::U16: return CV_16U; - case Precision::FP16: return CV_16U; + case Precision::I16: return CV_16S; + case Precision::FP16: return CV_16F; default: IE_THROW() << "Unsupported data type"; } diff --git a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp index 72eacfdd306e02..0e49f4116ec95c 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp @@ -434,6 +434,11 @@ void splitRow(const uint8_t* in, std::array& outs, int length) { namespace { +struct fp_16_t { + int16_t v; +}; + + template struct cv_type_to_depth; @@ -443,6 +448,7 @@ template<> struct cv_type_to_depth { enum { depth = CV_16U }; } template<> struct cv_type_to_depth { enum { depth = CV_16S }; }; template<> struct cv_type_to_depth { enum { depth = CV_32S }; }; template<> struct cv_type_to_depth { enum { depth = CV_32F }; }; +template<> struct cv_type_to_depth { enum { depth = CV_16F }; }; template struct typelist {}; @@ -500,7 +506,7 @@ bool is_cv_type_in_list(const int type_id) { namespace { -using merge_supported_types = typelist; +using merge_supported_types = typelist; template struct typed_merge_row { @@ -508,6 +514,12 @@ struct typed_merge_row { template p_f operator()(type_to_type ) { return mergeRow; } + + p_f operator()(type_to_type ) { + static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v), + "fp_16_t should be a plain wrap over FP16 implementation type"); + return mergeRow; + } }; } // namespace @@ -562,8 +574,7 @@ GAPI_FLUID_KERNEL(FMerge4, Merge4, false) { namespace { - -using split_supported_types = typelist; +using split_supported_types = typelist; template struct typed_split_row { @@ -571,6 +582,12 @@ struct typed_split_row { template p_f operator()(type_to_type ) { return splitRow; } + + p_f operator()(type_to_type ) { + static_assert(sizeof(fp_16_t) == sizeof(fp_16_t::v), + "fp_16_t should be a plain wrap over FP16 implementation type"); + return splitRow; + } }; } // namespace @@ -1120,6 +1137,17 @@ static void calcRowLinear(const cv::gapi::fluid::View & in, return; } } + + if (std::is_same::value) { + neon::calcRowLinear_32F(reinterpret_cast(dst), + reinterpret_cast(src0), + reinterpret_cast(src1), + reinterpret_cast(alpha), + reinterpret_cast(mapsx), + reinterpret_cast(beta), + inSz, outSz, lpi); + return; + } #endif for (int l = 0; l < lpi; l++) { diff --git 
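The fp_16_t wrapper introduced above gives FP16 its own entry in the typelist-based split/merge dispatch while reusing the int16_t row kernels, so it must remain a plain 16-bit wrapper (the static_asserts in the hunk enforce exactly this). A small standalone illustration of the invariant, outside the kernels:

    #include <cstdint>
    #include <type_traits>

    struct fp_16_t { int16_t v; };   // same definition as added above

    // FP16 rows are processed by reinterpreting the buffer as fp_16_t / int16_t,
    // which is only valid while the wrapper stays a trivially copyable 16-bit type.
    static_assert(sizeof(fp_16_t) == sizeof(int16_t), "fp_16_t must stay a plain 16-bit wrap");
    static_assert(std::is_trivially_copyable<fp_16_t>::value, "row kernels copy it bytewise");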
a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp index 5f21a1ed0bbb27..3a68bd4a9804da 100644 --- a/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp +++ b/inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp @@ -788,19 +788,19 @@ inline void copyRow_32F_impl(const float in[], float out[], int length) { } // Resize (bi-linear, 32FC1) -static inline void calcRowLinear_32FC1(float *dst[], - const float *src0[], - const float *src1[], - const float alpha[], - const int mapsx[], - const float beta[], - const Size& inSz, - const Size& outSz, - int lpi) { +CV_ALWAYS_INLINE void calcRowLinear_32FC1(float *dst[], + const float *src0[], + const float *src1[], + const float alpha[], + const int mapsx[], + const float beta[], + const Size& inSz, + const Size& outSz, + const int lpi) { bool xRatioEq1 = inSz.width == outSz.width; bool yRatioEq1 = inSz.height == outSz.height; -#if CPU_SIMD +#if MANUAL_SIMD const int nlanes = v_float32::nlanes; #endif @@ -811,19 +811,19 @@ static inline void calcRowLinear_32FC1(float *dst[], int x = 0; -#if CPU_SIMD +#if MANUAL_SIMD + v_float32 low1, high1, s00, s01; + v_float32 low2, high2, s10, s11; for (; x <= outSz.width - nlanes; x += nlanes) { v_float32 alpha0 = vx_load(&alpha[x]); // v_float32 alpha1 = 1.f - alpha0; - v_float32 low1, high1, s00, s01; v_gather_pairs(src0[line], mapsx, x, low1, high1); v_deinterleave(low1, high1, s00, s01); // v_float32 res0 = s00*alpha0 + s01*alpha1; v_float32 res0 = v_fma(s00 - s01, alpha0, s01); - v_float32 low2, high2, s10, s11; v_gather_pairs(src1[line], mapsx, x, low2, high2); v_deinterleave(low2, high2, s10, s11); @@ -854,12 +854,12 @@ static inline void calcRowLinear_32FC1(float *dst[], for (int line = 0; line < lpi; ++line) { int x = 0; -#if CPU_SIMD +#if MANUAL_SIMD + v_float32 low, high, s00, s01; for (; x <= outSz.width - nlanes; x += nlanes) { v_float32 alpha0 = vx_load(&alpha[x]); // v_float32 alpha1 = 1.f - alpha0; - v_float32 low, high, s00, s01; v_gather_pairs(src0[line], mapsx, x, low, high); v_deinterleave(low, high, s00, s01); @@ -889,7 +889,7 @@ static inline void calcRowLinear_32FC1(float *dst[], int x = 0; -#if CPU_SIMD +#if MANUAL_SIMD for (; x <= length - nlanes; x += nlanes) { v_float32 s0 = vx_load(&src0[line][x]); v_float32 s1 = vx_load(&src1[line][x]); diff --git a/inference-engine/src/readers/ir_reader/ie_ir_parser.cpp b/inference-engine/src/readers/ir_reader/ie_ir_parser.cpp index 6c6c4f28ac3c9b..a3daa63c9da869 100644 --- a/inference-engine/src/readers/ir_reader/ie_ir_parser.cpp +++ b/inference-engine/src/readers/ir_reader/ie_ir_parser.cpp @@ -82,11 +82,13 @@ bool getParameters(const pugi::xml_node& node, const std::string& name, std::vec } template -bool stringToType(const std::string& valStr, T& value) { +T stringToType(const std::string& valStr) { + T ret{0}; std::istringstream ss(valStr); - if (ss.eof()) return false; - ss >> value; - return !ss.fail(); + if (!ss.eof()) { + ss >> ret; + } + return ret; } class XmlDeserializer : public ngraph::AttributeVisitor { @@ -124,16 +126,12 @@ class XmlDeserializer : public ngraph::AttributeVisitor { void on_adapter(const std::string& name, ngraph::ValueAccessor& adapter) override { std::string val; if (!getStrAttribute(node.child("data"), name, val)) return; - double value; - stringToType(val, value); - adapter.set(value); + adapter.set(stringToType(val)); } void on_adapter(const std::string& name, 
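The SIMD path in calcRowLinear_32FC1 above computes the bilinear blend with a single fused multiply-add: since alpha1 = 1 - alpha0, the expression s0*alpha0 + s1*alpha1 is algebraically equal to (s0 - s1)*alpha0 + s1. A scalar restatement of that identity, for reference only:

    #include <cmath>

    // s0*a + s1*(1 - a)  ==  (s0 - s1)*a + s1, i.e. one subtraction plus one FMA.
    inline float lerp_fma(float s0, float s1, float a) {
        return std::fma(s0 - s1, a, s1);
    }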
ngraph::ValueAccessor& adapter) override { std::string val; if (!getStrAttribute(node.child("data"), name, val)) return; - int64_t value; - stringToType(val, value); - adapter.set(value); + adapter.set(stringToType(val)); } void on_adapter( diff --git a/inference-engine/src/transformations/include/transformations/common_optimizations/transpose_sinking.hpp b/inference-engine/src/transformations/include/transformations/common_optimizations/transpose_sinking.hpp new file mode 100644 index 00000000000000..497d9f242308c2 --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/common_optimizations/transpose_sinking.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include + +#include +#include +#include "ngraph/pattern/matcher.hpp" + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API TransposeSinking; +class TRANSFORMATIONS_API TransposeOptimization; +class TRANSFORMATIONS_API TransposeReduction; +class TRANSFORMATIONS_API TransposeFQReduction; + +} // namespace pass +} // namespace ngraph + +/** + * @ingroup ie_transformation_common_api + * @brief TransposeOptimization transformation replaces suitable Transposes with Reshape operation or optimises them out + */ +class ngraph::pass::TransposeOptimization : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + TransposeOptimization(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief TransposeReduction transformation sinks Transpose through Reduce operations + */ +class ngraph::pass::TransposeReduction : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + TransposeReduction(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief TransposeFQReduction transformation sinks Transpose through FakeQuantize in case it is followed by reduction or squeeze + */ +class ngraph::pass::TransposeFQReduction : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + TransposeFQReduction(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief TransposeSinking transformation sinks Transposes through known operations + */ +class ngraph::pass::TransposeSinking: public ngraph::pass::GraphRewrite { +public: + NGRAPH_RTTI_DECLARATION; + TransposeSinking() { + add_matcher(); + add_matcher(); + add_matcher(); + } +}; \ No newline at end of file diff --git a/inference-engine/src/transformations/include/transformations/utils/utils.hpp b/inference-engine/src/transformations/include/transformations/utils/utils.hpp index 9a4016dfbb75c0..f932aca04a922f 100644 --- a/inference-engine/src/transformations/include/transformations/utils/utils.hpp +++ b/inference-engine/src/transformations/include/transformations/utils/utils.hpp @@ -106,6 +106,15 @@ TRANSFORMATIONS_API std::shared_ptr activation(const std::string& TRANSFORMATIONS_API bool is_seq_len_provided(const std::shared_ptr &seq_len_input, int64_t max_seq_len); +TRANSFORMATIONS_API std::shared_ptr try_fold_unary_output(const std::shared_ptr& node); + +TRANSFORMATIONS_API std::shared_ptr clone_try_fold(const std::shared_ptr& node, const OutputVector& inputs); + +template +std::shared_ptr make_try_fold(Args&&... 
args) { + auto unary_output_node = std::make_shared(std::forward(args)...); + return try_fold_unary_output(unary_output_node); +} template Output eltwise_fold(const Output & input0, const Output & input1) { diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/algebraic_simplification.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/algebraic_simplification.cpp index ce73883ac15a69..9e36bb8c8f08c9 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/algebraic_simplification.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/algebraic_simplification.cpp @@ -136,93 +136,6 @@ static bool simplify_gather_shapeof(shared_ptr node) { return true; } -static bool replace_transpose_with_reshape(shared_ptr transpose) { - auto data = transpose->input_value(0); - const auto input_shape = transpose->input(0).get_partial_shape(); - if (input_shape.rank().is_dynamic()) { - return false; - } - - const size_t input_shape_rank = input_shape.rank().get_length(); - - auto order = as_type_ptr(transpose->input_value(1).get_node_shared_ptr()); - if (!order || !ngraph::shape_size(order->get_shape())) { - return false; - } - - const auto order_value = order->cast_vector(); - - // Check that transpose order without 1 dims has an ascending order - int64_t last_dim(-1); - for (size_t i = 0; i < input_shape_rank; ++i) { - if (input_shape[order_value[i]].is_dynamic() || input_shape[order_value[i]] != 1) { - if (order_value[i] < last_dim) { - return false; - } - last_dim = order_value[i]; - } - } - - // Transpose operation can be removed if original transpose order is sorted - // or dimension that changes their places equal to 1 - using DimensionToPosition = struct { - Dimension dim; - size_t pos; - }; - std::vector dims; - for (size_t i = 0; i < input_shape_rank; ++i) { - if (order_value[i] != static_cast(i)) { - dims.push_back({input_shape[order_value[i]], i}); - } - } - - // If number of dimensions != 1 to move equal to 0 we can remove this Transpose - if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) { - return !(item.dim.is_static() && item.dim.get_length() == 1); - }) == 0) { - return replace_output_update_name(transpose->output(0), transpose->input_value(0)); - } - - // Transpose can be replaced with Reshape in two ways: - // 1. Reshape with dims as Constant - // 2. Reshape with dims as input (ShapeOf->Gather) - // - // The first case is possible only if one or less dynamic dimensions changes their position - // For example: input_shape {?, 3, 1, ?} and order {0, 1, 3, 2} can be replaced with Reshape - // with Constant {0, 3, -1, 1} but if input_shape {?, 1, 1, ?} and order {1, 0, 3, 2} transpose - // cannot be replaced int the same way and in this case its only possible to use Gather(ShapeOf, - // order) - - Output reshape_dim; - NodeVector new_ops; - - if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) { - return item.dim.is_dynamic(); - }) < 2) { - vector reshape_value(input_shape_rank, 0); - for (const auto& item : dims) { - reshape_value[item.pos] = item.dim.is_dynamic() ? 
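The try_fold helpers declared just above in transformations/utils/utils.hpp (and used by the transpose sinking passes later in this patch) return a folded Constant when the node can be constant-folded and the freshly created node otherwise. A minimal usage sketch under that assumption, with illustrative values:

    // All inputs are Constants, so the Gather is expected to fold to a Constant holding {0};
    // with non-constant inputs make_try_fold would simply return the Gather node itself.
    auto order = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{4}, {2, 0, 3, 1});
    auto axes  = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {1});
    auto axis0 = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{}, {0});
    auto new_axes = ngraph::op::util::make_try_fold<ngraph::opset6::Gather>(order, axes, axis0);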
-1 : item.dim.get_length(); - } - reshape_dim = - opset3::Constant::create(element::i64, Shape{reshape_value.size()}, reshape_value); - } else { - auto shape_of = make_shared(data); - new_ops.push_back(shape_of); - reshape_dim = make_shared( - shape_of, order, opset3::Constant::create(element::i64, Shape{1}, {0})); - new_ops.push_back(reshape_dim.get_node_shared_ptr()); - } - - auto reshape_op = make_shared(data, reshape_dim, true); - new_ops.push_back(reshape_op); - - reshape_op->set_friendly_name(transpose->get_friendly_name()); - copy_runtime_info(transpose, new_ops); - replace_node(transpose, reshape_op); - - return true; -} - #define ECHO(NAME) #NAME #define STR(NAME) ECHO(NAME) #define SIMPLE_MATCHER_PASS_DEFINITION(NAME, OP, FUNC) \ @@ -244,11 +157,9 @@ NGRAPH_RTTI_DEFINITION(NAME, STR(NAME), 0); SIMPLE_MATCHER_PASS_DEFINITION(EliminateGather, opset3::Gather, simplify_gather); SIMPLE_MATCHER_PASS_DEFINITION(SimplifyShapeOf2Gather, opset2::ShapeOf, simplify_gather_shapeof); SIMPLE_MATCHER_PASS_DEFINITION(SimplifyShapeOf3Gather, opset3::ShapeOf, simplify_gather_shapeof); -SIMPLE_MATCHER_PASS_DEFINITION(ConvertTransposeToReshape, opset3::Transpose, replace_transpose_with_reshape); ngraph::pass::AlgebraicSimplification::AlgebraicSimplification() { add_matcher(); add_matcher(); add_matcher(); - add_matcher(); } diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp index 9b08b9b93f6198..244cc6e0847100 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -38,6 +38,7 @@ #include "transformations/common_optimizations/space_to_batch_fusion.hpp" #include "transformations/common_optimizations/batch_to_space_fusion.hpp" #include "transformations/common_optimizations/dilated_convolution_converter.hpp" +#include "transformations/common_optimizations/transpose_sinking.hpp" #include "transformations/op_conversions/bidirectional_sequences_decomposition.hpp" #include "transformations/op_conversions/convert_pad_to_group_conv.hpp" #include "transformations/op_conversions/convert_divide.hpp" @@ -85,6 +86,7 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptr(); manager.register_pass(); // depends on CF manager.register_pass(); + manager.register_pass(); auto eliminations = manager.register_pass(); eliminations->add_matcher(); @@ -160,8 +162,6 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptradd_matcher(); fq_fusions->add_matcher(); fq_fusions->add_matcher(); - fq_fusions->add_matcher(); - fq_fusions->add_matcher(); fq_fusions->set_name("ngraph::pass::FakeQuantizeFusions"); manager.run_passes(f); diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/mul_fake_quantize_fusion.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/mul_fake_quantize_fusion.cpp index a889aeee310204..1fcff0ac15cc4f 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/mul_fake_quantize_fusion.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/mul_fake_quantize_fusion.cpp @@ -73,7 +73,6 @@ ngraph::pass::MulFakeQuantizeFusion::MulFakeQuantizeFusion() { } else if (std::any_of(mul_const_value.begin(), 
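CommonOptimizations above registers the new TransposeSinking pass in its pass manager; the same pass can also be run on its own in the usual way. A minimal sketch, assuming an existing std::shared_ptr<ngraph::Function> f and the header added in this patch:

    #include "transformations/common_optimizations/transpose_sinking.hpp"
    #include <ngraph/pass/manager.hpp>

    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::pass::TransposeSinking>();
    manager.run_passes(f);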
mul_const_value.end(), [] (float f) -> bool { return f < 0.0f; })) { const auto& output_low = fq->input_value(3); const auto& output_high = fq->input_value(4); - auto zero = op::Constant::create(element::f32, Shape{}, {0.0f}); // get the mask of the values from mul_const that are less than zero std::vector less_than_zero; less_than_zero.reserve(mul_const_value.size()); @@ -84,8 +83,8 @@ ngraph::pass::MulFakeQuantizeFusion::MulFakeQuantizeFusion() { less_than_zero.push_back(mul_const_value[i] < 0); greater_eq_zero.push_back(mul_const_value[i] >= 0); } - auto less_const = op::Constant::create(element::f32, const_shape, less_than_zero); - auto greater_eq_const = op::Constant::create(element::f32, const_shape, greater_eq_zero); + auto less_const = op::Constant::create(output_low.get_element_type(), const_shape, less_than_zero); + auto greater_eq_const = op::Constant::create(output_low.get_element_type(), const_shape, greater_eq_zero); // new_output_low is defined as follows: // output_low[i], when mul_const[i] >= 0 // output_high[i], when mul_const[i] < 0 diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/softmax_fusion.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/softmax_fusion.cpp index 6e8a553ad6ffad..25a1a2272d1478 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/softmax_fusion.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/softmax_fusion.cpp @@ -28,6 +28,9 @@ ngraph::pass::SoftmaxFusion::SoftmaxFusion() { auto div_pattern = ngraph::pattern::wrap_type({exp_pattern, reduce_sum_pattern}); ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) + return false; + const auto& pattern_map = m.get_pattern_value_map(); auto reduce_max_axes = std::dynamic_pointer_cast(pattern_map.at(reduce_max_axes_pattern).get_node_shared_ptr()); diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/transpose_sinking.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/transpose_sinking.cpp new file mode 100644 index 00000000000000..a83a9a945d7cf3 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/transpose_sinking.cpp @@ -0,0 +1,273 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "itt.hpp" +#include "transformations/common_optimizations/transpose_sinking.hpp" +#include "transformations/utils/utils.hpp" + +#include +#include + +#include +#include +#include +#include + +NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeSinking, "TransposeSinking", 0); +NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeOptimization, "TransposeOptimization", 0); +NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeReduction, "TransposeReduction", 0); +NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeFQReduction, "TransposeFQReduction", 0); + +using namespace ngraph; + +std::shared_ptr get_reduced_order_constant(const std::shared_ptr& axes_const, + const std::shared_ptr& order_const) { + auto order = order_const->cast_vector(); + + auto axes = axes_const->cast_vector(); + std::sort(axes.rbegin(), axes.rend()); + for (const auto& i : axes) + order.erase(order.begin() + i); + + const auto& updated_order_size = static_cast(order.size()); + + auto order_sorted = order; + sort(order_sorted.begin(), order_sorted.end()); + for (int64_t i = 0; i < updated_order_size; 
++i) { + auto lowest_greater_eq_i = std::lower_bound(order_sorted.begin(), order_sorted.end(), i); + std::replace(order.begin(), order.end(), *lowest_greater_eq_i, i); + std::replace(order_sorted.begin(), order_sorted.end(), *lowest_greater_eq_i, i); + } + return std::make_shared( + ngraph::element::i64, ngraph::Shape{order.size()}, order); +} + +std::shared_ptr get_reversed_order_constant(const std::shared_ptr& order_const) { + const auto& order = order_const->cast_vector(); + const auto& rank = order.size(); + const auto& default_order = ngraph::get_default_order(rank); + std::vector reverse_order(rank); + for (size_t i = 0; i < rank; ++i) + reverse_order[order[i]] = default_order[i]; + + return std::make_shared( + ngraph::element::i64, ngraph::Shape{reverse_order.size()}, reverse_order); +} + + +bool replace_transpose_with_reshape(const std::shared_ptr& transpose) { + auto data = transpose->input_value(0); + const auto input_shape = transpose->input(0).get_partial_shape(); + + const size_t input_shape_rank = input_shape.rank().get_length(); + + auto order = as_type_ptr(transpose->input_value(1).get_node_shared_ptr()); + if (!order || !ngraph::shape_size(order->get_shape())) { + return false; + } + + const auto order_value = order->cast_vector(); + + // Check that transpose order without 1 dims has an ascending order + int64_t last_dim(-1); + for (size_t i = 0; i < input_shape_rank; ++i) { + if (input_shape[order_value[i]].is_dynamic() || input_shape[order_value[i]] != 1) { + if (order_value[i] < last_dim) { + return false; + } + last_dim = order_value[i]; + } + } + + // Transpose operation can be removed if original transpose order is sorted + // or dimension that changes their places equal to 1 + using DimensionToPosition = struct { + Dimension dim; + size_t pos; + }; + std::vector dims; + for (size_t i = 0; i < input_shape_rank; ++i) { + if (order_value[i] != static_cast(i)) { + dims.push_back({input_shape[order_value[i]], i}); + } + } + + // If number of dimensions != 1 to move equal to 0 we can remove this Transpose + if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) { + return !(item.dim.is_static() && item.dim.get_length() == 1); + }) == 0) { + return replace_output_update_name(transpose->output(0), transpose->input_value(0)); + } + + // Transpose can be replaced with Reshape in two ways: + // 1. Reshape with dims as Constant + // 2. Reshape with dims as input (ShapeOf->Gather) + // + // The first case is possible only if one or less dynamic dimensions changes their position + // For example: input_shape {?, 3, 1, ?} and order {0, 1, 3, 2} can be replaced with Reshape + // with Constant {0, 3, -1, 1} but if input_shape {?, 1, 1, ?} and order {1, 0, 3, 2} transpose + // cannot be replaced int the same way and in this case its only possible to use Gather(ShapeOf, + // order) + + Output reshape_dim; + NodeVector new_ops; + + if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) { + return item.dim.is_dynamic(); + }) < 2) { + std::vector reshape_value(input_shape_rank, 0); + for (const auto& item : dims) { + reshape_value[item.pos] = item.dim.is_dynamic() ? 
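get_reversed_order_constant above builds the inverse permutation of a transpose order (reverse_order[order[i]] = i). A plain C++ worked example of that mapping, without ngraph types:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> reverse_order(const std::vector<int64_t>& order) {
        std::vector<int64_t> rev(order.size());
        for (std::size_t i = 0; i < order.size(); ++i)
            rev[order[i]] = static_cast<int64_t>(i);
        return rev;
    }
    // reverse_order({2, 0, 3, 1}) == {1, 3, 0, 2}: applying a Transpose with the reversed
    // order after the original Transpose restores the initial dimension order.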
-1 : item.dim.get_length(); + } + reshape_dim = + opset3::Constant::create(element::i64, Shape{reshape_value.size()}, reshape_value); + } else { + auto shape_of = std::make_shared(data); + new_ops.push_back(shape_of); + reshape_dim = std::make_shared( + shape_of, order, opset3::Constant::create(element::i64, Shape{1}, {0})); + new_ops.push_back(reshape_dim.get_node_shared_ptr()); + } + + auto reshape_op = std::make_shared(data, reshape_dim, true); + new_ops.push_back(reshape_op); + + reshape_op->set_friendly_name(transpose->get_friendly_name()); + copy_runtime_info(transpose, new_ops); + replace_node(transpose, reshape_op); + return true; +} + +ngraph::pass::TransposeOptimization::TransposeOptimization() { + MATCHER_SCOPE(TransposeOptimization); + + auto transpose_label = pattern::wrap_type( + {pattern::any_input(pattern::has_static_rank()), pattern::wrap_type()}); + ngraph::matcher_pass_callback matcher_pass_callback = [=](ngraph::pattern::Matcher &m) { + return replace_transpose_with_reshape(m.get_match_root()); + }; + + auto m = std::make_shared(transpose_label, matcher_name); + register_matcher(m, matcher_pass_callback); +} + +ngraph::pass::TransposeReduction::TransposeReduction() { + MATCHER_SCOPE(TransposeReduction); + + auto transpose_label = pattern::wrap_type({pattern::any_input(), pattern::wrap_type()}); + auto reduce_or_squeeze_label = pattern::wrap_type( + {transpose_label, pattern::wrap_type()}); + + ngraph::matcher_pass_callback matcher_pass_callback = [=](ngraph::pattern::Matcher &m) { + const auto &pattern_to_output = m.get_pattern_value_map(); + + auto transpose = pattern_to_output.at(transpose_label).get_node_shared_ptr(); + auto reduction = pattern_to_output.at(reduce_or_squeeze_label).get_node_shared_ptr(); + auto arithmetic_reduce = std::dynamic_pointer_cast(reduction); + auto logical_reduce = std::dynamic_pointer_cast(reduction); + auto squeeze = std::dynamic_pointer_cast(reduction); + if (!transpose || !(arithmetic_reduce || logical_reduce || squeeze)) + return false; + + bool keep_dims = false; // squeeze always reduces number of output dimensions + if (logical_reduce) + keep_dims = logical_reduce->get_keep_dims(); + else if (arithmetic_reduce) + keep_dims = arithmetic_reduce->get_keep_dims(); + + auto transpose_order = std::dynamic_pointer_cast(transpose->get_input_node_shared_ptr(1)); + auto reduction_axes = std::dynamic_pointer_cast(reduction->get_input_node_shared_ptr(1)); + if (!transpose_order || !reduction_axes) + return false; + + const auto& non_negative_axes = ngraph::normalize_axes( + reduction->get_friendly_name(), reduction_axes->cast_vector(), reduction->get_input_partial_shape(0).rank()); + reduction_axes = ngraph::opset6::Constant::create(ngraph::element::i64, {non_negative_axes.size()}, non_negative_axes); + + ngraph::NodeVector new_ops; + auto new_axes = ngraph::op::util::make_try_fold( + transpose_order, reduction_axes, ngraph::opset6::Constant::create(ngraph::element::i64, {}, {0})); + new_ops.push_back(new_axes); + auto new_reduce = reduction->copy_with_new_inputs({transpose->input_value(0), new_axes}); + new_ops.push_back(new_reduce); + + auto updated_order = transpose_order; + if (!keep_dims) { + updated_order = get_reduced_order_constant(reduction_axes, transpose_order); + new_ops.push_back(updated_order); + } + auto new_transpose = register_new_node(new_reduce, updated_order); + new_ops.push_back(new_transpose); + new_transpose->set_friendly_name(reduction->get_friendly_name()); + + ngraph::copy_runtime_info({reduction, transpose}, new_ops); 
+ ngraph::replace_node(reduction, new_transpose); + + return true; + }; + + auto m = std::make_shared(reduce_or_squeeze_label, matcher_name); + register_matcher(m, matcher_pass_callback); +} + +ngraph::pass::TransposeFQReduction::TransposeFQReduction() { + MATCHER_SCOPE(TransposeFQReduction); + + auto transpose_label = pattern::wrap_type({pattern::any_input(), pattern::wrap_type()}); + auto fq_label = pattern::wrap_type( + {transpose_label, pattern::any_input(pattern::has_static_rank()), pattern::any_input(pattern::has_static_rank()), + pattern::any_input(pattern::has_static_rank()), pattern::any_input(pattern::has_static_rank())}); + auto reduce_or_squeeze_label = pattern::wrap_type( + {fq_label, pattern::wrap_type()}); + + ngraph::matcher_pass_callback matcher_pass_callback = [=](ngraph::pattern::Matcher &m) { + auto &pattern_to_output = m.get_pattern_value_map(); + + auto transpose = pattern_to_output.at(transpose_label).get_node_shared_ptr(); + auto transpose_order = std::dynamic_pointer_cast(transpose->get_input_node_shared_ptr(1)); + auto fq = pattern_to_output.at(fq_label).get_node_shared_ptr(); + if (!transpose || !transpose_order || !fq) + return false; + + ngraph::NodeVector new_ops; + + const auto& reverse_order_constant = get_reversed_order_constant(transpose_order); + new_ops.push_back(reverse_order_constant); + + const auto& input_rank = fq->get_input_partial_shape(0).rank().get_length(); + ngraph::OutputVector fq_inputs = {transpose->input_value(0)}; + for (size_t i = 1; i < fq->inputs().size(); ++i) { + auto input = fq->input_value(i); + const auto& ranks_diff = input_rank - input.get_partial_shape().rank().get_length(); + NGRAPH_CHECK(ranks_diff >= 0); + if (ranks_diff > 0) { + std::vector axes(ranks_diff); + std::iota(axes.begin(), axes.end(), 0); + const auto& axes_const = opset6::Constant::create(element::i64, Shape{axes.size()}, axes); + new_ops.push_back(axes_const); + const auto& unsqueezed_input = op::util::make_try_fold(input, axes_const); + new_ops.push_back(unsqueezed_input); + input = unsqueezed_input->output(0); + } + const auto& transposed_input = op::util::make_try_fold(input, reverse_order_constant); + new_ops.push_back(transposed_input); + fq_inputs.push_back(transposed_input); + } + auto new_fq = fq->copy_with_new_inputs(fq_inputs); + new_ops.push_back(new_fq); + + auto new_transpose = std::make_shared(new_fq, transpose_order); + new_ops.push_back(new_transpose); + new_transpose->set_friendly_name(fq->get_friendly_name()); + + ngraph::copy_runtime_info({fq, transpose}, new_ops); + ngraph::replace_node(fq, new_transpose); + // The root node (reduction) left unchanged during current matcher pass. 
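To make the TransposeReduction rewrite implemented above concrete, here is a worked example with illustrative shapes (not taken from a real model): the reduction axes are remapped through the transpose order, and the transpose itself moves below the reduction with a compacted order.

    // Before:  data[N,C,H,W] -> Transpose(order = {2,0,3,1}) -> [H,N,W,C]
    //                         -> ReduceSum(axes = {1}, keep_dims = false) -> [H,W,C]
    // After:   data[N,C,H,W] -> ReduceSum(axes = Gather(order, {1}) = {0}) -> [C,H,W]
    //                         -> Transpose(order' = {1,2,0}) -> [H,W,C]
    // where order' = get_reduced_order_constant(axes = {1}, order = {2,0,3,1}).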
+ // We return false here for further MatcherPasses to be applicable for this node as a root node + return false; + }; + + auto m = std::make_shared(reduce_or_squeeze_label, matcher_name); + register_matcher(m, matcher_pass_callback); +} diff --git a/inference-engine/src/transformations/src/transformations/op_conversions/convert_interpolate1_to_interpolate4.cpp b/inference-engine/src/transformations/src/transformations/op_conversions/convert_interpolate1_to_interpolate4.cpp index 80d4e804b92e32..94173079c6203f 100644 --- a/inference-engine/src/transformations/src/transformations/op_conversions/convert_interpolate1_to_interpolate4.cpp +++ b/inference-engine/src/transformations/src/transformations/op_conversions/convert_interpolate1_to_interpolate4.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2021 Intel Corporation +// Copyright (C) 2020-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -49,9 +49,9 @@ ngraph::pass::ConvertInterpolate1ToInterpolate4::ConvertInterpolate1ToInterpolat // If we write only // attrsV4.mode = ngraph::op::v4::Interpolate::InterpolateMode::linear; // instead of a conditional statements below when attrsV0.mode == "linear", - // then we have a performance drop, because CPU and GPU have no optimized + // then we have a performance drop, because CPU have no optimized // version of the 'linear' mode. - // TODO: delete this conditional statement, when CPU and GPU will have + // TODO: delete this conditional statement, when CPU will have // optimized version of the 'linear' mode. if (input_shape_rank < 5) { attrsV4.mode = ngraph::op::v4::Interpolate::InterpolateMode::linear_onnx; diff --git a/inference-engine/src/transformations/src/transformations/serialize.cpp b/inference-engine/src/transformations/src/transformations/serialize.cpp index 9b29529b82608c..c2eb4d6a5bd2fb 100644 --- a/inference-engine/src/transformations/src/transformations/serialize.cpp +++ b/inference-engine/src/transformations/src/transformations/serialize.cpp @@ -57,10 +57,66 @@ std::string translate_type_name(const std::string& name) { return name; } +size_t hash_combine(const void* v, int64_t size) { + constexpr auto cel_size = sizeof(size_t); + size_t seed = static_cast(size); + const auto data = static_cast(v); + const auto d_end = std::next(data, size / cel_size); + // The constant value used as a magic number has been + // traditionally used e.g. in boost library's hash_combine. + // It happens to be derived from the golden ratio. + for (auto d = data; d != d_end; ++d) { + seed ^= *d + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + size_t last_bytes{0}; + std::memcpy(&last_bytes, d_end, size % cel_size); + seed ^= last_bytes + 0x9e3779b9 + (seed << 6) + (seed >> 2); + return seed; +} + +class ConstantWriter { +public: + using FilePosition = int64_t; + using HashValue = size_t; + using ConstWritePositions = std::unordered_map; + + ConstantWriter(std::ostream& bin_data, bool enable_compression = true) + : m_binary_output(bin_data) + , m_enable_compression(enable_compression) { + } + + FilePosition write(const char* ptr, size_t size) { + const auto offset = m_binary_output.tellp(); + if (!m_enable_compression) { + m_binary_output.write(ptr, size); + return offset; + } + // The biggest supported models have at maximum 1-2 thousand constant nodes, + // with 64 bit hash that gives a probability around 1 in 10 trillion that a + // hash collision will appear. Because of this, a choice has been made to + // not perform collision detection and keep the hashing quick and seamless. 
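ConstantWriter above deduplicates identical constant blobs: each buffer is hashed with hash_combine, the bin-file offset of its first occurrence is remembered, and any later identical buffer reuses that offset instead of being written again. A small illustration of the expected behaviour, assuming the class exactly as defined in this hunk (it is local to serialize.cpp, so this is not a public API):

    #include <sstream>

    std::stringstream bin;
    ConstantWriter writer(bin);                     // deduplication enabled by default
    const char blob[] = {1, 2, 3, 4, 5, 6, 7, 8};
    auto off1 = writer.write(blob, sizeof(blob));
    auto off2 = writer.write(blob, sizeof(blob));
    // off1 == off2: the second, identical constant is not written to the stream again,
    // so two IR layers can point at the same offset/size in the .bin file.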
+ const HashValue hash = hash_combine(ptr, size); + const auto found = m_hash_to_file_positions.find(hash); + if (found != end(m_hash_to_file_positions)) { + return found->second; + } + + m_binary_output.write(ptr, size); + m_hash_to_file_positions.insert({hash, offset}); + + return offset; + } + +private: + ConstWritePositions m_hash_to_file_positions; + std::ostream& m_binary_output; + bool m_enable_compression; +}; + void ngfunction_2_irv10(pugi::xml_node& node, - std::ostream& bin_file, const ngraph::Function& f, - const std::map& custom_opsets); + const std::map& custom_opsets, + ConstantWriter& constant_write_handler); // Some of the operators were added to wrong opsets. This is a mapping // that allows such operators to be serialized with proper opsets. @@ -115,9 +171,9 @@ class XmlSerializer { class XmlSerializer : public ngraph::AttributeVisitor { pugi::xml_node& m_xml_node; - std::ostream& m_bin_data; std::string& m_node_type_name; const std::map& m_custom_opsets; + ConstantWriter& m_constant_write_handler; template std::string create_atribute_list( @@ -223,13 +279,13 @@ class XmlSerializer : public ngraph::AttributeVisitor { public: XmlSerializer(pugi::xml_node& data, - std::ostream& bin_data, std::string& node_type_name, - const std::map& custom_opsets) + const std::map& custom_opsets, + ConstantWriter& constant_write_handler) : m_xml_node(data) - , m_bin_data(bin_data) , m_node_type_name(node_type_name) - , m_custom_opsets(custom_opsets) { + , m_custom_opsets(custom_opsets) + , m_constant_write_handler(constant_write_handler) { } void on_adapter(const std::string& name, ngraph::ValueAccessor& adapter) override { @@ -256,13 +312,11 @@ class XmlSerializer : public ngraph::AttributeVisitor { } else if (const auto& a = ngraph::as_type>>(&adapter)) { if (name == "value" && translate_type_name(m_node_type_name) == "Const") { const int64_t size = a->get()->size(); - const int64_t offset = m_bin_data.tellp(); + int64_t offset = m_constant_write_handler.write( + static_cast(a->get()->get_ptr()), size); m_xml_node.append_attribute("offset").set_value(offset); m_xml_node.append_attribute("size").set_value(size); - - auto data = static_cast(a->get()->get_ptr()); - m_bin_data.write(data, size); } } } @@ -322,11 +376,11 @@ class XmlSerializer : public ngraph::AttributeVisitor { // to layer above (m_xml_node.parent()) as in ngfunction_2_irv10() layer (m_xml_node) with empty attributes // is removed. 
pugi::xml_node xml_body = m_xml_node.parent().append_child(name.c_str()); - ngfunction_2_irv10(xml_body, m_bin_data, *adapter.get(), m_custom_opsets); + ngfunction_2_irv10(xml_body, *adapter.get(), m_custom_opsets, m_constant_write_handler); xml_body.remove_attribute("name"); xml_body.remove_attribute("version"); } else if (name == "net") { - ngfunction_2_irv10(m_xml_node, m_bin_data, *adapter.get(), m_custom_opsets); + ngfunction_2_irv10(m_xml_node, *adapter.get(), m_custom_opsets, m_constant_write_handler); } else { NGRAPH_CHECK(false, "Unsupported Function name."); } @@ -585,9 +639,9 @@ bool resolve_dynamic_shapes(const ngraph::Function& f) { } void ngfunction_2_irv10(pugi::xml_node& netXml, - std::ostream& bin_file, const ngraph::Function& f, - const std::map& custom_opsets) { + const std::map& custom_opsets, + ConstantWriter& constant_node_write_handler) { const bool exec_graph = is_exec_graph(f); netXml.append_attribute("name").set_value(f.get_friendly_name().c_str()); @@ -623,7 +677,7 @@ void ngfunction_2_irv10(pugi::xml_node& netXml, if (exec_graph) { visit_exec_graph_node(data, node_type_name, node); } else { - XmlSerializer visitor(data, bin_file, node_type_name, custom_opsets); + XmlSerializer visitor(data, node_type_name, custom_opsets, constant_node_write_handler); NGRAPH_CHECK(node->visit_attributes(visitor), "Visitor API is not supported in ", node); rt_info::XmlSerializer{data}.serialize(node->get_rt_info()); @@ -734,7 +788,8 @@ bool pass::Serialize::run_on_function(std::shared_ptr f) { std::string name = "net"; pugi::xml_document xml_doc; pugi::xml_node net_node = xml_doc.append_child(name.c_str()); - XmlSerializer visitor(net_node, bin_file, name, m_custom_opsets); + ConstantWriter constant_write_handler(bin_file); + XmlSerializer visitor(net_node, name, m_custom_opsets, constant_write_handler); visitor.on_attribute(name, f); xml_doc.save(xml_file); diff --git a/inference-engine/src/transformations/src/transformations/utils/utils.cpp b/inference-engine/src/transformations/src/transformations/utils/utils.cpp index c5df435a7e36c7..f8179fbb64375a 100644 --- a/inference-engine/src/transformations/src/transformations/utils/utils.cpp +++ b/inference-engine/src/transformations/src/transformations/utils/utils.cpp @@ -130,6 +130,18 @@ bool is_seq_len_provided(const std::shared_ptr &seq_len_input, int64_t max return true; } +std::shared_ptr try_fold_unary_output(const std::shared_ptr& node) { + const auto& num_outputs = node->get_output_size(); + NGRAPH_CHECK(num_outputs == 1, "Unary has unexpected number of outputs:" + std::to_string(num_outputs)); + OutputVector output(num_outputs); + return node->constant_fold(output, node->input_values()) ? 
output[0].get_node_shared_ptr() : node; +} + +std::shared_ptr clone_try_fold(const std::shared_ptr& node, const OutputVector& inputs) { + auto unary_output_node = node->clone_with_new_inputs(inputs); + return try_fold_unary_output(unary_output_node); +} + } // namespace util } // namespace op } // namespace ngraph diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp index e83a0704dfff1f..0f9d27fb28e227 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp +++ b/inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp @@ -135,7 +135,7 @@ IE_SUPPRESS_DEPRECATED_START IE_SUPPRESS_DEPRECATED_END } -InferenceEngine::ExecutableNetwork Engine::ImportNetwork( +InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork( std::istream& model, const std::map& config) { VPU_PROFILE(ImportNetwork); @@ -148,10 +148,10 @@ InferenceEngine::ExecutableNetwork Engine::ImportNetwork( model, _mvnc, _devicePool, parsedConfigCopy, GetCore()); executableNetwork->SetPointerToPlugin(shared_from_this()); - return make_executable_network(executableNetwork); + return executableNetwork; } -InferenceEngine::ExecutableNetwork Engine::ImportNetwork( +InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork( const std::string& modelFileName, const std::map& config) { VPU_PROFILE(ImportNetwork); diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_plugin.h b/inference-engine/src/vpu/myriad_plugin/myriad_plugin.h index ac965a78810d2c..3cae6ae3e5d126 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_plugin.h +++ b/inference-engine/src/vpu/myriad_plugin/myriad_plugin.h @@ -37,11 +37,11 @@ class Engine : public ie::InferencePluginInternal { using ie::InferencePluginInternal::ImportNetwork; - ie::ExecutableNetwork ImportNetwork( + ie::IExecutableNetworkInternal::Ptr ImportNetwork( const std::string& modelFileName, const std::map& config) override; - ie::ExecutableNetwork ImportNetwork( + ie::IExecutableNetworkInternal::Ptr ImportNetwork( std::istream& model, const std::map& config) override; diff --git a/inference-engine/tests/functional/inference_engine/caching_test.cpp b/inference-engine/tests/functional/inference_engine/caching_test.cpp index 20b3ac5bc7c14a..78204382898a96 100644 --- a/inference-engine/tests/functional/inference_engine/caching_test.cpp +++ b/inference-engine/tests/functional/inference_engine/caching_test.cpp @@ -27,6 +27,10 @@ #include "functional_test_utils/network_utils.hpp" #include "unit_test_utils/mocks/mock_iexecutable_network.hpp" +#include "unit_test_utils/mocks/mock_iinfer_request.hpp" +#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp" +#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp" +#include "ie_plugin_cpp.hpp" using namespace InferenceEngine; using namespace ::testing; @@ -76,12 +80,12 @@ class MockCachingInferencePlugin : public InferenceEngine::InferencePluginIntern MOCK_METHOD3(LoadExeNetworkImpl, ExecutableNetworkInternal::Ptr(const CNNNetwork& network, RemoteContext::Ptr context, const std::map& config)); - MOCK_METHOD2(ImportNetworkImpl, ExecutableNetwork(std::istream& networkModel, - const std::map& config)); + MOCK_METHOD2(ImportNetworkImpl, ExecutableNetworkInternal::Ptr(std::istream& networkModel, + const std::map& config)); - MOCK_METHOD3(ImportNetworkImpl, ExecutableNetwork(std::istream& networkModel, - const RemoteContext::Ptr& context, - const std::map& config)); + 
MOCK_METHOD3(ImportNetworkImpl, ExecutableNetworkInternal::Ptr(std::istream& networkModel, + const RemoteContext::Ptr& context, + const std::map& config)); MOCK_CONST_METHOD2(QueryNetwork, QueryNetworkResult(const CNNNetwork& network, const std::map& config)); @@ -97,6 +101,8 @@ class MockExecutableNetwork : public ExecutableNetworkInternal { MOCK_METHOD0(CreateInferRequest, IInferRequest::Ptr()); MOCK_CONST_METHOD0(GetInputsInfo, ConstInputsDataMap()); MOCK_CONST_METHOD0(GetOutputsInfo, ConstOutputsDataMap()); + MOCK_CONST_METHOD1(GetConfig, Parameter(const std::string& name)); + MOCK_CONST_METHOD1(GetMetric, Parameter(const std::string& name)); }; //------------------------------------------------------ @@ -141,7 +147,6 @@ class CachingTest : public ::testing::TestWithParam; CNNCallback m_cnnCallback = nullptr; - std::string get_mock_engine_name() { std::string mockEngineName("mock_engine"); return CommonTestUtils::pre + mockEngineName + IE_BUILD_POSTFIX + CommonTestUtils::ext; @@ -240,6 +245,19 @@ class CachingTest : public ::testing::TestWithParam createMockIExecutableNet() { + auto mock = std::make_shared(); + EXPECT_CALL(*mock, GetInputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstInputsDataMap{})); + EXPECT_CALL(*mock, GetOutputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstOutputsDataMap{})); + EXPECT_CALL(*mock, GetConfig(PluginConfigParams::KEY_PERF_COUNT)).Times(AnyNumber()).WillRepeatedly(Return(Parameter{PluginConfigParams::NO})); + EXPECT_CALL(*mock, GetMetric(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS))).Times(AnyNumber()).WillRepeatedly(Return(Parameter{1u})); + auto ptr = std::make_shared(); + EXPECT_CALL(*ptr, SetCompletionCallback(_)).Times(AnyNumber()).WillRepeatedly(Return(OK)); + EXPECT_CALL(*ptr, SetUserData(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); + EXPECT_CALL(*mock, CreateInferRequest()).Times(AnyNumber()).WillRepeatedly(Return(ptr)); + return mock; + } + private: template std::function make_std_function(const std::string& functionName) { @@ -271,18 +289,12 @@ class CachingTest : public ::testing::TestWithParam &) { - auto mock = std::make_shared(); - EXPECT_CALL(*mock, GetInputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - EXPECT_CALL(*mock, GetOutputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - return ExecutableNetwork(mock); + return createMockIExecutableNet(); })); ON_CALL(plugin, ImportNetworkImpl(_, _)). WillByDefault(Invoke([&](std::istream &istr, const std::map &) { - auto mock = std::make_shared(); - EXPECT_CALL(*mock, GetInputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - EXPECT_CALL(*mock, GetOutputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - return ExecutableNetwork(mock); + return createMockIExecutableNet(); })); ON_CALL(plugin, LoadExeNetworkImpl(_, _, _)). 
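The expectations in these caching tests encode the intended flow: with CACHE_DIR set, the first LoadNetwork for a given device architecture goes through LoadExeNetworkImpl plus ExportImpl (populating the cache), and later loads for the same architecture are served by ImportNetworkImpl. A minimal application-side sketch of that flow; the device name and paths are placeholders:

    InferenceEngine::Core ie;
    ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "model_cache"}});   // enable the compiled-model cache
    auto network = ie.ReadNetwork("model.xml");
    auto exec1 = ie.LoadNetwork(network, "MOCK");   // first load: compiled, then exported to the cache
    auto exec2 = ie.LoadNetwork(network, "MOCK");   // later loads: imported from the cache instead of recompiled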
@@ -318,6 +330,27 @@ class CachingTest : public ::testing::TestWithParam res; + res.push_back(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)); + res.push_back(METRIC_KEY(NETWORK_NAME)); + return res; + })); + EXPECT_CALL(*net, CreateInferRequest()).Times(AnyNumber()) + .WillRepeatedly(Invoke([&]() { + std::vector res; + auto inferReq = std::make_shared(); + EXPECT_CALL(*inferReq, SetCompletionCallback(_)).Times(AnyNumber()).WillRepeatedly(Return(OK)); + EXPECT_CALL(*inferReq, SetUserData(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); + return inferReq; + })); } }; @@ -361,10 +394,10 @@ TEST_P(CachingTest, TestLoadCustomImportExport) { int a; s >> a; EXPECT_EQ(customNumber, a); - auto mock = std::make_shared(); - EXPECT_CALL(*mock, GetInputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - EXPECT_CALL(*mock, GetOutputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - return ExecutableNetwork(mock); + auto mock = std::make_shared(); + EXPECT_CALL(*mock, GetInputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstInputsDataMap{})); + EXPECT_CALL(*mock, GetOutputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstOutputsDataMap{})); + return mock; })); ON_CALL(*mockPlugin, ImportNetworkImpl(_, _)). @@ -372,10 +405,10 @@ TEST_P(CachingTest, TestLoadCustomImportExport) { int a; s >> a; EXPECT_EQ(customNumber, a); - auto mock = std::make_shared(); - EXPECT_CALL(*mock, GetInputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - EXPECT_CALL(*mock, GetOutputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - return ExecutableNetwork(mock); + auto mock = std::make_shared(); + EXPECT_CALL(*mock, GetInputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstInputsDataMap{})); + EXPECT_CALL(*mock, GetOutputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstOutputsDataMap{})); + return mock; })); ON_CALL(*net, ExportImpl(_)).WillByDefault(Invoke([&] (std::ostream& s) { @@ -1037,10 +1070,10 @@ TEST_P(CachingTest, LoadHetero_MultiArchs) { int a; s >> a; EXPECT_EQ(customNumber, a); - auto mock = std::make_shared(); - EXPECT_CALL(*mock, GetInputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - EXPECT_CALL(*mock, GetOutputsInfo(_, _)).Times(AnyNumber()).WillRepeatedly(Return(OK)); - return ExecutableNetwork(mock); + auto mock = std::make_shared(); + EXPECT_CALL(*mock, GetInputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstInputsDataMap{})); + EXPECT_CALL(*mock, GetOutputsInfo()).Times(AnyNumber()).WillRepeatedly(Return(ConstOutputsDataMap{})); + return mock; })); ON_CALL(*net, ExportImpl(_)).WillByDefault(Invoke([&] (std::ostream& s) { @@ -1172,6 +1205,130 @@ TEST_P(CachingTest, LoadHetero_MultiArchs_TargetFallback_FromCore) { } } +// MULTI-DEVICE test +// Test that it is safe to load multiple devices sharing same cache +TEST_P(CachingTest, LoadMulti_race) { + const auto TEST_DURATION_MS = 2000; + const auto TEST_DEVICE_MAX_COUNT = 10; + EXPECT_CALL(*mockPlugin, GetMetric(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, QueryNetwork(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, GetMetric(METRIC_KEY(DEVICE_ARCHITECTURE), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + auto start = high_resolution_clock::now(); + do { + std::string cacheDir = m_cacheDir + std::to_string(index); + MkDirGuard guard(cacheDir); + int devCount = 1 + index % (TEST_DEVICE_MAX_COUNT - 1); // try dynamic number of devices from 1 to max + deviceToLoad = 
CommonTestUtils::DEVICE_MULTI; + deviceToLoad += ":mock.0"; + for (int i = 1; i < devCount; i++) { + deviceToLoad += ",mock." + std::to_string(i); + } + + EXPECT_CALL(*mockPlugin, LoadExeNetworkImpl(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, LoadExeNetworkImpl(_, _)).Times(1); + EXPECT_CALL(*mockPlugin, ImportNetworkImpl(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, ImportNetworkImpl(_, _)).Times(devCount - 1); + EXPECT_CALL(*net, ExportImpl(_)).Times(1); + testLoad([&](Core &ie) { + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), cacheDir}}); + ASSERT_NO_THROW(m_testFunction(ie)); + }); + index++; + } while (duration_cast(high_resolution_clock::now() - start).count() < TEST_DURATION_MS); + std::cout << "Caching LoadMulti Test completed. Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_threads) { + const auto TEST_DURATION_MS = 2000; + const auto THREADS_COUNT = 4; + EXPECT_CALL(*mockPlugin, GetMetric(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, QueryNetwork(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, GetMetric(METRIC_KEY(DEVICE_ARCHITECTURE), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + auto start = high_resolution_clock::now(); + int index = 0; + do { + std::string cacheDir = m_cacheDir + std::to_string(index); + MkDirGuard guard(cacheDir); + EXPECT_CALL(*mockPlugin, LoadExeNetworkImpl(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, LoadExeNetworkImpl(_, _)).Times(1); + EXPECT_CALL(*mockPlugin, ImportNetworkImpl(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, ImportNetworkImpl(_, _)).Times(THREADS_COUNT - 1); + EXPECT_CALL(*net, ExportImpl(_)).Times(1); + testLoad([&](Core &ie) { + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), cacheDir}}); + std::vector threads; + for (int i = 0; i < THREADS_COUNT; i++) { + threads.emplace_back(([&]() { m_testFunction(ie); })); + } + for (int i = 0; i < THREADS_COUNT; i++) { + threads[i].join(); + } + }); + index++; + } while (duration_cast(high_resolution_clock::now() - start).count() < TEST_DURATION_MS); + std::cout << "Caching Load multiple threads test completed. Tried " << index << " times" << std::endl; +} + +// MULTI-DEVICE test +// Test that loading of device with one architecture doesn't block loading of device with another architecture +TEST_P(CachingTest, LoadMulti_Archs) { + const auto IMPORT_DELAY_LONG_MS = 3000; + const auto TEST_DEVICE_MAX_COUNT = 30; // Shall be >= 2 + const auto IMPORT_DELAY_SHORT_MS = 100; + const auto EXP_MAX_EXEC_TIME_MS = 5500; + EXPECT_CALL(*mockPlugin, GetMetric(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, QueryNetwork(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, GetMetric(METRIC_KEY(DEVICE_ARCHITECTURE), _)).Times(AnyNumber()) + .WillRepeatedly(Invoke([&](const std::string &, const std::map &options) { + auto id = options.at("DEVICE_ID").as(); + if (std::stoi(id) < 2) { + return "mock_first_architecture"; + } else { + return "mock_another_architecture"; + } + })); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + + deviceToLoad = CommonTestUtils::DEVICE_MULTI; + deviceToLoad += ":mock.0"; + for (int i = 1; i < TEST_DEVICE_MAX_COUNT; i++) { + deviceToLoad += ",mock." 
+ std::to_string(i); + } + + auto start = high_resolution_clock::now(); + { + EXPECT_CALL(*mockPlugin, LoadExeNetworkImpl(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, LoadExeNetworkImpl(_, _)).Times(2); + + EXPECT_CALL(*mockPlugin, ImportNetworkImpl(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, ImportNetworkImpl(_, _)).Times(TEST_DEVICE_MAX_COUNT - 2) + .WillRepeatedly(Invoke([&](std::istream &, const std::map &opt) { + auto id = opt.at("DEVICE_ID"); + if (std::stoi(id) < 2) { + std::this_thread::sleep_for(milliseconds(IMPORT_DELAY_LONG_MS)); + } else { + std::this_thread::sleep_for(milliseconds(IMPORT_DELAY_SHORT_MS)); + } + return createMockIExecutableNet(); + })); + EXPECT_CALL(*net, ExportImpl(_)).Times(2); + testLoad([&](Core &ie) { + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), m_cacheDir}}); + ASSERT_NO_THROW(m_testFunction(ie)); + }); + } + ASSERT_LT(duration_cast(high_resolution_clock::now() - start).count(), EXP_MAX_EXEC_TIME_MS); +} + INSTANTIATE_TEST_CASE_P(CachingTest, CachingTest, ::testing::Combine( ::testing::ValuesIn(loadVariants), diff --git a/inference-engine/tests/functional/inference_engine/executable_network.cpp b/inference-engine/tests/functional/inference_engine/executable_network.cpp index bf90be8b33ca20..16af81dd7b2a7f 100644 --- a/inference-engine/tests/functional/inference_engine/executable_network.cpp +++ b/inference-engine/tests/functional/inference_engine/executable_network.cpp @@ -4,17 +4,13 @@ #include #include +#include using namespace ::testing; using namespace std; using namespace InferenceEngine; using namespace InferenceEngine::details; -TEST(ExecutableNetworkTests, throwsOnInitWithNull) { - std::shared_ptr nlptr = nullptr; - ASSERT_THROW(ExecutableNetwork exec(nlptr), InferenceEngine::Exception); -} - TEST(ExecutableNetworkTests, throwsOnUninitializedGetOutputsInfo) { ExecutableNetwork exec; ASSERT_THROW(exec.GetOutputsInfo(), InferenceEngine::Exception); @@ -35,11 +31,6 @@ TEST(ExecutableNetworkTests, throwsOnUninitializedExportStream) { ASSERT_THROW(exec.Export(std::cout), InferenceEngine::Exception); } -TEST(ExecutableNetworkTests, nothrowsOnUninitializedCast) { - ExecutableNetwork exec; - ASSERT_NO_THROW((void)static_cast(exec)); -} - TEST(ExecutableNetworkTests, throwsOnUninitializedGetExecGraphInfo) { ExecutableNetwork exec; ASSERT_THROW(exec.GetExecGraphInfo(), InferenceEngine::Exception); diff --git a/inference-engine/tests/functional/inference_engine/ir_serialization/const_compression.cpp b/inference-engine/tests/functional/inference_engine/ir_serialization/const_compression.cpp new file mode 100644 index 00000000000000..ce3ea231b55725 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/ir_serialization/const_compression.cpp @@ -0,0 +1,231 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/ngraph_test_utils.hpp" +#include "ie_core.hpp" +#include "gtest/gtest.h" + +#include +#include + +#ifndef IR_SERIALIZATION_MODELS_PATH // should be already defined by cmake +#define IR_SERIALIZATION_MODELS_PATH "" +#endif + +class SerializatioConstantCompressionTest : public ::testing::Test { +protected: + std::string test_name = + ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string m_out_xml_path_1 = test_name + "1" + ".xml"; + std::string m_out_bin_path_1 = test_name + "1" + ".bin"; + + void TearDown() override { + std::remove(m_out_xml_path_1.c_str()); + std::remove(m_out_bin_path_1.c_str()); + } + + std::uintmax_t 
file_size(std::ifstream &f) { + // get length of file: + const auto pos_to_restore = f.tellg(); + f.seekg(0, f.end); + std::uintmax_t length = f.tellg(); + f.seekg(pos_to_restore, f.beg); + return length; + } +}; + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsI32) { + constexpr int unique_const_count = 1; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int32_t)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsI64) { + constexpr int unique_const_count = 1; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i64, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = ngraph::op::Constant::create(ngraph::element::i64, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int64_t)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsFP16) { + constexpr int unique_const_count = 1; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::f16, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = ngraph::op::Constant::create(ngraph::element::f16, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(ngraph::float16)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsFP32) { + constexpr int unique_const_count = 1; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::f32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = ngraph::op::Constant::create(ngraph::element::f32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(float)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsTimesTwo) { + constexpr int unique_const_count = 2; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = 
ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto C = ngraph::op::Constant::create(ngraph::element::i32, shape, + {0, 3, 1, 2, 5, 6, 25, 3}); + auto D = ngraph::op::Constant::create(ngraph::element::i32, shape, + {0, 3, 1, 2, 5, 6, 25, 3}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B, C, D}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int32_t)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsTimesTwoMultipleOccurences) { + constexpr int unique_const_count = 2; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = ngraph::op::Constant::create(ngraph::element::i32, shape, + {0, 3, 1, 2, 5, 6, 25, 3}); + auto C = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto D = ngraph::op::Constant::create(ngraph::element::i32, shape, + {0, 3, 1, 2, 5, 6, 25, 3}); + auto E = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto F = ngraph::op::Constant::create(ngraph::element::i32, shape, + {0, 3, 1, 2, 5, 6, 25, 3}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B, C, D, E, F}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int32_t)); +} + +TEST_F(SerializatioConstantCompressionTest, NonIdenticalConstants) { + constexpr int unique_const_count = 2; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2, 3, 4, 5, 6, 7, 8}); + auto B = ngraph::op::Constant::create(ngraph::element::i32, shape, + {2, 2, 3, 4, 5, 6, 7, 8}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int32_t)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsDifferentTypesI32I64) { + constexpr int unique_const_count = 1; + const ngraph::Shape shape{2, 2, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 0, 2, 0, 3, 0, 4, 0}); + auto B = ngraph::op::Constant::create(ngraph::element::i64, ngraph::Shape({1, 2, 2}), + {1, 2, 3, 4}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int32_t)); +} + +TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsDifferentTypesI32I8) { + constexpr int unique_const_count = 
1; + const ngraph::Shape shape{1, 1, 2}; + + auto A = ngraph::op::Constant::create(ngraph::element::i32, shape, + {1, 2}); + auto B = ngraph::op::Constant::create(ngraph::element::i8, ngraph::Shape({1, 2, 4}), + {1, 0, 0, 0, + 2, 0, 0, 0}); + + auto ngraph_a = std::make_shared(ngraph::NodeVector{A, B}, + ngraph::ParameterVector{}); + + ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::binary); + std::ifstream bin_1(m_out_bin_path_1, std::ios::binary); + + ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int32_t)); +} diff --git a/inference-engine/tests/functional/inference_engine/lp_transformations/concat_transformation.cpp b/inference-engine/tests/functional/inference_engine/lp_transformations/concat_transformation.cpp index 22c678dcaba094..c199e60fbfd143 100644 --- a/inference-engine/tests/functional/inference_engine/lp_transformations/concat_transformation.cpp +++ b/inference-engine/tests/functional/inference_engine/lp_transformations/concat_transformation.cpp @@ -74,6 +74,7 @@ class ConcatTransformationTestValues { public: ngraph::pass::low_precision::LayerTransformation::Params params; bool multiChannels; + std::int64_t axis; ConcatTransformationActualValues actual; ConcatTransformationResultValues result; }; @@ -114,7 +115,8 @@ class ConcatTransformation : public LayerTransformation, public testing::WithPar testValues.actual.convert2, testValues.actual.dequantization2, ngraph::element::undefined, - {}); + {}, + testValues.axis); SimpleLowPrecisionTransformer transform; if (testValues.multiChannels) { @@ -146,7 +148,8 @@ class ConcatTransformation : public LayerTransformation, public testing::WithPar testValues.result.convert2, testValues.result.dequantization2, testValues.result.precisionAfterOperation, - testValues.result.dequantizationAfter); + testValues.result.dequantizationAfter, + testValues.axis); } static std::string getTestCaseName(testing::TestParamInfo obj) { @@ -158,6 +161,7 @@ class ConcatTransformation : public LayerTransformation, public testing::WithPar result << LayerTransformation::getTestCaseNameByParams(precision, shape, testValues.params) << "_" << (testValues.multiChannels ? 
"multiChannels_" : "notMultiChannels_") << + "axis_" << testValues.axis << "_" << testValues.actual << "_" << testValues.result << "_"; return result.str(); @@ -180,6 +184,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -201,6 +206,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {255.f} }, { ngraph::element::u8 }, @@ -232,6 +238,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {255.f} }, { ngraph::element::u8 }, @@ -263,6 +270,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -290,6 +298,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -317,6 +326,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {{1}, {1}, {1}, {1}}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -340,6 +350,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {{1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1}}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -363,6 +374,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -386,6 +398,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, {{1}, {1}, {1}, {1}}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -409,6 +422,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, @@ -450,6 +464,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -477,6 +492,7 @@ const std::vector testValues = { { LayerTransformation::createParamsI8I8(), false, + 1, { { 256ul, {}, {-1.28f}, {1.27f}, {-1.28f}, {1.27f} }, {}, @@ -500,6 +516,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -523,6 +540,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), true, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -546,6 +564,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {}, {-1.28f}, {1.27f}, {-1.28f}, {1.27f} }, {}, @@ -569,6 +588,7 @@ const std::vector testValues = { { LayerTransformation::createParamsU8I8(), false, + 1, { { 256ul, {}, {-1.28f}, {1.27f}, {0.f}, {2.3007815f} }, {}, @@ -588,10 +608,61 @@ const std::vector testValues = { { ngraph::element::f32, { 128 }, { 0.0302619f } } } }, + // U8: concat multi channels with subtract, negative axis + { + LayerTransformation::createParamsU8I8(), + true, + -3, + { + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {}, + { 256ul, {}, {1.275f}, {2.55f}, {1.275f}, {2.55f} }, + {}, + {} + }, + { + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {255.f}, ngraph::element::u8 }, + {}, + {}, + { 256ul, {}, {1.275f}, {2.55f}, {0.f}, {255.f}, ngraph::element::u8 }, + {}, + {}, + ngraph::element::u8, + { + ngraph::element::f32, + {{ 0.f, 0.f, 0.f, -255.f, -255.f, -255.f }}, + {{ 0.01f, 0.01f, 0.01f, 0.005f, 0.005f, 0.005f }} + } + } + }, + // U8: concat 
multi channels with subtract, not supported axis + { + LayerTransformation::createParamsU8I8(), + true, + 0, + { + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {}, + { 256ul, {}, {1.275f}, {2.55f}, {1.275f}, {2.55f} }, + {}, + {} + }, + { + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {}, + { 256ul, {}, {1.275f}, {2.55f}, {1.275f}, {2.55f} }, + {}, + {} + }, + }, // not update precisions { LayerTransformation::createParamsU8I8().setUpdatePrecisions(false), false, + 1, { { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, {}, @@ -610,6 +681,54 @@ const std::vector testValues = { ngraph::element::f32, { {element::f32}, {}, { 0.01f } }, } + }, + // unexpected quantization levels, concat + { + LayerTransformation::createParamsU8I8(), + false, + 1, + { + { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f} }, + {}, + {}, + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {} + }, + { + { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f} }, + {}, + {}, + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {}, + ngraph::element::f32, + {}, + } + }, + // unexpected quantization levels, concat multi channels + { + LayerTransformation::createParamsU8I8(), + true, + 1, + { + { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f} }, + {}, + {}, + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {} + }, + { + { 16ul, {}, {0.f}, {1.5f}, {0.f}, {15.f} }, + {}, + {}, + { 256ul, {}, {0.f}, {2.55f}, {0.f}, {2.55f} }, + {}, + {}, + ngraph::element::f32, + {}, + } } }; diff --git a/inference-engine/tests/functional/inference_engine/lp_transformations/concat_with_split_transformation.cpp b/inference-engine/tests/functional/inference_engine/lp_transformations/concat_with_split_transformation.cpp index dbbe4b35f11549..76b137d74683c9 100644 --- a/inference-engine/tests/functional/inference_engine/lp_transformations/concat_with_split_transformation.cpp +++ b/inference-engine/tests/functional/inference_engine/lp_transformations/concat_with_split_transformation.cpp @@ -217,6 +217,40 @@ const std::vector testValues = { { ngraph::element::f32, {}, { 0.005f } } } }, + // U8: concat multi channels with per-channel quantization + { + { 1, 6, 10, 10 }, + LayerTransformation::createParamsU8I8(), + true, + { + { 256ul, ngraph::Shape({}), {0.f}, {2.55f / 2.f}, {0.f}, {2.55f / 2.f} }, + { + 256ul, + ngraph::Shape({ 1, 6, 1, 1 }), + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {255.f, 25.5f, 2.55f, 25.5f, 255.f, 2.55f}, + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {255.f, 25.5f, 2.55f, 25.5f, 255.f, 2.55f} + } + }, + { + { 256ul, ngraph::Shape({}), {0.f}, {2.55f / 2.f}, {0.f}, {255.f}}, + { + 256ul, + ngraph::Shape({ 1, 6, 1, 1 }), + {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, + {255.f, 25.5f, 2.55f, 25.5f, 255.f, 2.55f}, + {0.f}, + {255.f} + }, + ngraph::element::u8, + {{}, {}, {}}, + {{}, {}, {}}, + ngraph::element::u8, + { ngraph::element::f32, {}, {{ 0.005f, 0.005f, 0.005f, 1.f, 0.1f, 0.01f }} }, + { ngraph::element::f32, {}, {{ 0.1f, 1.f, 0.01f }} } + } + }, // I8: concat multi channels { { 1, 6, 10, 10 }, @@ -259,9 +293,8 @@ const std::vector testValues = { }, }; -// TODO: Split/VariadicSplit operations are not supported in ConcatTransformation INSTANTIATE_TEST_CASE_P( - DISABLED_smoke_LPT, + smoke_LPT, ConcatWithSplitTransformation, ::testing::Combine( ::testing::ValuesIn(precisions), diff --git a/inference-engine/tests/functional/inference_engine/lp_transformations/split_transformation.cpp b/inference-engine/tests/functional/inference_engine/lp_transformations/split_transformation.cpp index bc4e5580b4bd3e..9f04c18a580e0f 100644 --- 
a/inference-engine/tests/functional/inference_engine/lp_transformations/split_transformation.cpp +++ b/inference-engine/tests/functional/inference_engine/lp_transformations/split_transformation.cpp @@ -160,21 +160,30 @@ const std::vector testValues = { {}, ngraph::element::u8, { - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{2.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{22.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{3.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{33.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {2.f}, {22.f}}, + {{ngraph::element::f32}, {3.f}, {33.f}}, + } + } + }, + // U8 per channel quantization with different values (constants without batch) + { + ngraph::Shape({ 1, 3, 16, 16 }), std::int64_t{-3}, size_t{3}, + LayerTransformation::createParamsU8I8(), + { + ngraph::element::u8, + {{ngraph::element::f32}, + {{1.f, 2.f, 3.f}, ngraph::element::f32, {3, 1, 1}}, + {{11.f, 22.f, 33.f}, ngraph::element::f32, {3, 1, 1}}} + }, + { + ngraph::element::u8, + {}, + ngraph::element::u8, + { + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {2.f}, {22.f}}, + {{ngraph::element::f32}, {3.f}, {33.f}}, } } }, @@ -193,21 +202,9 @@ const std::vector testValues = { {}, ngraph::element::i8, { - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{2.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{22.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{3.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{33.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {2.f}, {22.f}}, + {{ngraph::element::f32}, {3.f}, {33.f}}, } } }, @@ -226,21 +223,9 @@ const std::vector testValues = { {}, ngraph::element::u8, { - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {1.f}, {11.f}}, } } }, @@ -259,21 +244,9 @@ const std::vector testValues = { {}, ngraph::element::i8, { - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {1.f}, {11.f}} } } }, @@ -358,21 +331,9 @@ const std::vector testValues = { {}, ngraph::element::u8, { - { - {ngraph::element::f32}, - {}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {}, - {{22.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {}, - {{33.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {}, 
{11.f}}, + {{ngraph::element::f32}, {}, {22.f}}, + {{ngraph::element::f32}, {}, {33.f}}, } } }, @@ -391,21 +352,9 @@ const std::vector testValues = { {}, ngraph::element::i8, { - { - {ngraph::element::f32}, - {}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {}, - {{22.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, - { - {ngraph::element::f32}, - {}, - {{33.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {}, {11.f}}, + {{ngraph::element::f32}, {}, {22.f}}, + {{ngraph::element::f32}, {}, {33.f}}, } } }, diff --git a/inference-engine/tests/functional/inference_engine/lp_transformations/strided_slice_transformation.cpp b/inference-engine/tests/functional/inference_engine/lp_transformations/strided_slice_transformation.cpp index 8f570484c81a1f..8b16ce99d75eda 100644 --- a/inference-engine/tests/functional/inference_engine/lp_transformations/strided_slice_transformation.cpp +++ b/inference-engine/tests/functional/inference_engine/lp_transformations/strided_slice_transformation.cpp @@ -150,6 +150,17 @@ StridedSliceTransformationTestValues::LayerParams specialDimensionSlice = { {} }; +StridedSliceTransformationTestValues::LayerParams specialDimensionEndSlice = { + { 0, 0, 20, 0 }, + { 1, 3, 24, 24 }, + { 1, 1, 1, 1 }, + { 1, 1, 0, 1 }, + { 1, 1, 0, 1 }, + {}, + {}, + {} +}; + const std::vector stridedSliceTransformationTestValues = { // U8: channel slice, per-tensor quantization { @@ -311,6 +322,38 @@ const std::vector stridedSliceTransformati {{ngraph::element::f32}, {{ 32.f, 64.f, 32.f }}, {{ 0.1f, 0.01f, 1.f }}} } }, + // I8: special dimension end slice, per-channel quantization with different values + { + ngraph::Shape{1, 3, 24, 24}, + LayerTransformation::createParamsI8I8(), + specialDimensionEndSlice, + { + ngraph::element::i8, + {{ngraph::element::f32}, {{ 32.f, 64.f, 32.f }}, {{ 0.1f, 0.01f, 1.f }}} + }, + { + ngraph::element::i8, + {}, + ngraph::element::i8, + {{ngraph::element::f32}, {{ 32.f, 64.f, 32.f }}, {{ 0.1f, 0.01f, 1.f }}} + } + }, + // I8: special dimension end slice, per-tensor quantization with different values + { + ngraph::Shape{1, 3, 24, 24}, + LayerTransformation::createParamsI8I8(), + specialDimensionEndSlice, + { + ngraph::element::i8, + {{ngraph::element::f32}, { 32.f }, { 0.1f }} + }, + { + ngraph::element::i8, + {}, + ngraph::element::i8, + {{ngraph::element::f32}, { 32.f }, { 0.1f }} + } + }, // I8: channel slice, quantization by special dimension { ngraph::Shape{1, 3, 4, 4}, diff --git a/inference-engine/tests/functional/inference_engine/lp_transformations/transformations_after_split_transformation.cpp b/inference-engine/tests/functional/inference_engine/lp_transformations/transformations_after_split_transformation.cpp index c63a38f1b45780..fa3fe4097679b5 100644 --- a/inference-engine/tests/functional/inference_engine/lp_transformations/transformations_after_split_transformation.cpp +++ b/inference-engine/tests/functional/inference_engine/lp_transformations/transformations_after_split_transformation.cpp @@ -57,7 +57,7 @@ SimpleLowPrecisionTransformer getTransformerWithTransformationByName( using namespace pass::low_precision; SimpleLowPrecisionTransformer transformer; - if (name == "AddTransformation") { + if (name == "AddTransformationWithoutConcat" || name == "AddTransformationWithConcat") { transformer.add(params); return transformer; } @@ -185,7 +185,8 @@ TEST_P(TransformationsAfterSplitTransformation, Run) { } const std::vector transformationNames = { - "AddTransformation", + 
"AddTransformationWithoutConcat", + "AddTransformationWithConcat", "AvgPoolTransformation", "ClampTransformation", "ConvolutionTransformation", diff --git a/inference-engine/tests/functional/inference_engine/lp_transformations/variadic_split_transformation.cpp b/inference-engine/tests/functional/inference_engine/lp_transformations/variadic_split_transformation.cpp index d77e80b21d797b..990de4d98d7a56 100644 --- a/inference-engine/tests/functional/inference_engine/lp_transformations/variadic_split_transformation.cpp +++ b/inference-engine/tests/functional/inference_engine/lp_transformations/variadic_split_transformation.cpp @@ -177,11 +177,31 @@ const std::vector testValues = { {{1.f, 2.f}, ngraph::element::f32, {1, 2, 1, 1}}, {{11.f, 22.f}, ngraph::element::f32, {1, 2, 1, 1}} }, + {{ngraph::element::f32}, {3.f}, {33.f}} + } + } + }, + // U8 per channel quantization with different values (constants without batch) + { + ngraph::Shape({ 1, 3, 16, 16 }), std::int64_t{ -3 }, std::vector{ 2, 1 }, + LayerTransformation::createParamsU8I8(), + { + ngraph::element::u8, + {{ngraph::element::f32}, + {{1.f, 2.f, 3.f}, ngraph::element::f32, {3, 1, 1}}, + {{11.f, 22.f, 33.f}, ngraph::element::f32, {3, 1, 1}}} + }, + { + ngraph::element::u8, + {}, + ngraph::element::u8, + { { {ngraph::element::f32}, - {{3.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{33.f}, ngraph::element::f32, {1, 1, 1, 1}} - } + {{1.f, 2.f}, ngraph::element::f32, {1, 2, 1, 1}}, + {{11.f, 22.f}, ngraph::element::f32, {1, 2, 1, 1}} + }, + {{ngraph::element::f32}, {3.f}, {33.f}} } } }, @@ -205,11 +225,7 @@ const std::vector testValues = { {{1.f, 2.f}, ngraph::element::f32, {1, 2, 1, 1}}, {{11.f, 22.f}, ngraph::element::f32, {1, 2, 1, 1}} }, - { - {ngraph::element::f32}, - {{3.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{33.f}, ngraph::element::f32, {1, 1, 1, 1}} - } + {{ngraph::element::f32}, {3.f}, {33.f}} } } }, @@ -228,16 +244,8 @@ const std::vector testValues = { {}, ngraph::element::u8, { - { - {ngraph::element::f32}, - {{1.f, 1.f}, ngraph::element::f32, {1, 2, 1, 1}}, - {{11.f, 11.f}, ngraph::element::f32, {1, 2, 1, 1}} - }, - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - } + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {1.f}, {11.f}} } } }, @@ -256,16 +264,8 @@ const std::vector testValues = { {}, ngraph::element::i8, { - { - {ngraph::element::f32}, - {{1.f, 1.f}, ngraph::element::f32, {1, 2, 1, 1}}, - {{11.f, 11.f}, ngraph::element::f32, {1, 2, 1, 1}} - }, - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - } + {{ngraph::element::f32}, {1.f}, {11.f}}, + {{ngraph::element::f32}, {1.f}, {11.f}} } } }, @@ -322,21 +322,13 @@ const std::vector testValues = { {}, ngraph::element::i8, { - { - {ngraph::element::f32}, - {{1.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{11.f}, ngraph::element::f32, {1, 1, 1, 1}} - }, + {{ngraph::element::f32}, {1.f}, {11.f}}, { {ngraph::element::f32}, {{2.f, 3.f}, ngraph::element::f32, {1, 2, 1, 1}}, {{22.f, 33.f}, ngraph::element::f32, {1, 2, 1, 1}} }, - { - {ngraph::element::f32}, - {{4.f}, ngraph::element::f32, {1, 1, 1, 1}}, - {{44.f}, ngraph::element::f32, {1, 1, 1, 1}} - } + {{ngraph::element::f32}, {4.f}, {44.f}} } } }, diff --git a/inference-engine/tests/functional/inference_engine/ngraph_reader/deformable_psroi_pooling_tests.cpp b/inference-engine/tests/functional/inference_engine/ngraph_reader/deformable_psroi_pooling_tests.cpp 
index 33690d29f25b2b..69f6313e3a5fa8 100644 --- a/inference-engine/tests/functional/inference_engine/ngraph_reader/deformable_psroi_pooling_tests.cpp +++ b/inference-engine/tests/functional/inference_engine/ngraph_reader/deformable_psroi_pooling_tests.cpp @@ -5,7 +5,7 @@ #include #include "ngraph_reader_tests.hpp" -TEST_F(NGraphReaderTests, ReadDeformablePSROIPoolingNetwork) { +TEST_F(NGraphReaderTests, ReadDeformablePSROIPoolingNetwork_incorrect_mode) { std::string model = R"V0G0N( @@ -29,7 +29,7 @@ TEST_F(NGraphReaderTests, ReadDeformablePSROIPoolingNetwork) {
- + @@ -121,5 +121,124 @@ TEST_F(NGraphReaderTests, ReadDeformablePSROIPoolingNetwork) { )V0G0N"; - compareIRs(model, modelV7, 117600); -} \ No newline at end of file + compareIRs(model, modelV7); +} + +TEST_F(NGraphReaderTests, ReadDeformablePSROIPoolingNetwork) { + std::string model = R"V0G0N( + + + + + + + 1 + 3240 + 38 + 38 + + + + + + + + 100 + 5 + + + + + + + + 1 + 3240 + 38 + 38 + + + 100 + 5 + + + + + 100 + 360 + 3 + 3 + + + + + + + 100 + 360 + 3 + 3 + + + + + + + + + + +)V0G0N"; + std::string modelV7 = R"V0G0N( + + + + + + 1 + 3240 + 38 + 38 + + + + + + + 100 + 5 + + + + + + + + 1 + 3240 + 38 + 38 + + + 100 + 5 + + + + + 100 + 360 + 3 + 3 + + + + + + + + + +)V0G0N"; + compareIRs(model, modelV7); +} diff --git a/inference-engine/tests/functional/inference_engine/serialization/single_layer/clamp.cpp b/inference-engine/tests/functional/inference_engine/serialization/single_layer/clamp.cpp new file mode 100644 index 00000000000000..607989781e6f72 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/serialization/single_layer/clamp.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "shared_test_classes/single_layer/clamp.hpp" + +using namespace LayerTestsDefinitions; + +namespace { +TEST_P(ClampLayerTest, Serialize) { + Serialize(); +} + +const std::vector> inShapes = { + {50}, {10, 10}, {1, 20, 20}, {2, 3, 50, 50}}; + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, InferenceEngine::Precision::I32}; + +const std::vector> intervals = { + {-20.1, -10.5}, {-10.0, 10.0}, {10.3, 20.4}}; + +INSTANTIATE_TEST_CASE_P( + smoke_Clamp_Serialization, ClampLayerTest, + ::testing::Combine( + ::testing::ValuesIn(inShapes), + ::testing::ValuesIn(intervals), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ClampLayerTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/inference_engine/serialization/single_layer/split.cpp b/inference-engine/tests/functional/inference_engine/serialization/single_layer/split.cpp new file mode 100644 index 00000000000000..80b1ca17ecacad --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/serialization/single_layer/split.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "shared_test_classes/single_layer/split.hpp" + +using namespace LayerTestsDefinitions; + +namespace { + +TEST_P(SplitLayerTest, Serialize) { + Serialize(); +} + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::I32, + InferenceEngine::Precision::U16, + InferenceEngine::Precision::BOOL}; + +INSTANTIATE_TEST_CASE_P( + smoke_Split_Serialization, SplitLayerTest, + ::testing::Combine( + ::testing::Values(1, 2, 5, 10), + ::testing::Values(0, 1, 2, 3), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(std::vector{20, 30, 50, 50}), + ::testing::Values(std::vector({})), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + SplitLayerTest::getTestCaseName +); + +} // namespace diff --git a/inference-engine/tests/functional/inference_engine/transformations/algebraic_simplification.cpp 
b/inference-engine/tests/functional/inference_engine/transformations/algebraic_simplification.cpp index ea0b588881b420..4f94db681a57ce 100644 --- a/inference-engine/tests/functional/inference_engine/transformations/algebraic_simplification.cpp +++ b/inference-engine/tests/functional/inference_engine/transformations/algebraic_simplification.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "common_test_utils/ngraph_test_utils.hpp" @@ -311,8 +312,7 @@ TEST(algebraic_simplification, replace_transpose_with_reshape) { pass::Manager pass_manager; pass_manager.register_pass(); - pass_manager.register_pass(); - pass_manager.register_pass(); + pass_manager.register_pass(); pass_manager.run_passes(optimized_f); auto ps = baseline_f->get_results()[0]->get_output_partial_shape(0); diff --git a/inference-engine/tests/functional/inference_engine/transformations/mul_fake_quantize_fusion.cpp b/inference-engine/tests/functional/inference_engine/transformations/mul_fake_quantize_fusion.cpp index c3901203b49bb4..b8133abffc2241 100644 --- a/inference-engine/tests/functional/inference_engine/transformations/mul_fake_quantize_fusion.cpp +++ b/inference-engine/tests/functional/inference_engine/transformations/mul_fake_quantize_fusion.cpp @@ -218,6 +218,45 @@ TEST(TransformationTests, MulFakeQuantizeFusionConstantSomeNegative) { ASSERT_TRUE(res.first) << res.second; } +TEST(TransformationTests, MulFakeQuantizeFusionConstantSomeNegativeF16) { + std::shared_ptr f(nullptr), f_ref(nullptr); + + Shape data_shape{1, 3, 14, 14}; + { + auto data = std::make_shared(element::f16, data_shape); + auto mul_const = opset5::Constant::create(element::f16, Shape{3, 1, 1}, {2, 1, -2}); + auto mul = std::make_shared(data, mul_const); + auto input_low = opset5::Constant::create(element::f16, Shape{1}, {1}); + auto input_high = opset5::Constant::create(element::f16, Shape{1}, {20}); + auto output_low = opset5::Constant::create(element::f16, Shape{1, 3, 1, 1}, {-10, -10, -10}); + auto output_high = opset5::Constant::create(element::f16, Shape{1}, {10}); + auto fq = std::make_shared(mul, input_low, + input_high, output_low, + output_high, 20); + f = std::make_shared(NodeVector{fq}, ParameterVector{data}); + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.register_pass(); + m.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + { + auto data = std::make_shared(element::f16, data_shape); + auto input_low = opset5::Constant::create(element::f16, Shape{1, 3, 1, 1}, {0.5f, 1.0f, -0.5f}); + auto input_high = opset5::Constant::create(element::f16, Shape{1, 3, 1, 1}, {10.0f, 20.0f, -10.0f}); + auto output_low = opset5::Constant::create(element::f16, Shape{1, 3, 1, 1}, {-10.0f, -10.0f, 10.0f}); + auto output_high = opset5::Constant::create(element::f16, Shape{1, 3, 1, 1}, {10.0f, 10.0f, -10.0f}); + auto fq = std::make_shared(data, input_low, + input_high, output_low, + output_high, 20); + f_ref = std::make_shared(NodeVector{fq}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref, true); + ASSERT_TRUE(res.first) << res.second; +} + TEST(TransformationTests, NegativeMulFakeQuantizeFusionNotAConstant) { std::shared_ptr f(nullptr), f_ref(nullptr); diff --git a/inference-engine/tests/functional/inference_engine/transformations/transpose_sinking_test.cpp b/inference-engine/tests/functional/inference_engine/transformations/transpose_sinking_test.cpp new file mode 100644 index 00000000000000..13cdd90149389e --- /dev/null +++ 
b/inference-engine/tests/functional/inference_engine/transformations/transpose_sinking_test.cpp @@ -0,0 +1,203 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +struct TransposeFQReduceParams { + // given params + PartialShape transpose_input_shape; + std::vector transpose_order; + Shape il, ih, ol, oh; + std::vector reduce_axes; + bool reduce_keep_dims; + + // expected params + Shape ex_il, ex_ih, ex_ol, ex_oh; + std::vector ex_reduce_axes; + std::vector ex_transpose_order; +}; + +class TransposeSinkingFQ : public CommonTestUtils::TestsCommon, + public testing::WithParamInterface> { +public: + std::shared_ptr f, f_ref; + + void SetUp() override { + const auto& test_case = std::get<0>(GetParam()); + + { + auto input = std::make_shared(element::f32, test_case.transpose_input_shape); + + auto order = std::make_shared(element::i64, Shape{test_case.transpose_order.size()}, test_case.transpose_order); + auto transpose = std::make_shared(input, order); + + auto i_low = std::make_shared(element::i64, test_case.il, std::vector{0}); + auto i_high = std::make_shared(element::i64, test_case.ih, std::vector{0}); + auto o_low = std::make_shared(element::i64, test_case.ol, std::vector{0}); + auto o_high = std::make_shared(element::i64, test_case.oh, std::vector{0}); + auto fq = std::make_shared(transpose, i_low, i_high, o_low, o_high, 256); + + auto axes = std::make_shared( + element::i64, Shape{test_case.reduce_axes.size()}, test_case.reduce_axes); + auto reduce = std::make_shared(fq, axes, test_case.reduce_keep_dims); + + f = std::make_shared(ngraph::NodeVector{reduce}, ngraph::ParameterVector{input}); + } + + { + auto input = std::make_shared(element::f32, test_case.transpose_input_shape); + + auto i_low = std::make_shared(element::i64, test_case.ex_il, std::vector{0}); + auto i_high = std::make_shared(element::i64, test_case.ex_ih, std::vector{0}); + auto o_low = std::make_shared(element::i64, test_case.ex_ol, std::vector{0}); + auto o_high = std::make_shared(element::i64, test_case.ex_oh, std::vector{0}); + auto fq = std::make_shared(input, i_low, i_high, o_low, o_high, 256); + + auto axes = std::make_shared( + element::i64, Shape{test_case.ex_reduce_axes.size()}, test_case.ex_reduce_axes); + auto reduce = std::make_shared(fq, axes, test_case.reduce_keep_dims); + + auto order = std::make_shared(element::i64, Shape{test_case.ex_transpose_order.size()}, test_case.ex_transpose_order); + auto transpose = std::make_shared(reduce, order); + + f_ref = std::make_shared(ngraph::NodeVector{transpose}, ngraph::ParameterVector{input}); + } + } +}; + +TEST_P(TransposeSinkingFQ, TransposeFQReduce) { + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + + auto res = compare_functions(f, f_ref, true); + ASSERT_TRUE(res.first) << res.second; +} + + +INSTANTIATE_TEST_CASE_P(TransformationTest, TransposeSinkingFQ, testing::Values( + TransposeFQReduceParams{{1, 3, 240, 140}, {0, 2, 3, 1}, {1}, {3}, {1, 1, 1, 1}, {1, 1, 1, 3}, {1, 2}, true, + {1, 1, 1, 1}, {1, 3, 1, 1}, {1, 1, 1, 1}, {1, 3, 1, 1}, {2, 3}, {0, 2, 3, 1}}, + TransposeFQReduceParams{{1, 3, 240, 140}, {0, 2, 3, 1}, {1}, {3}, {1, 1, 1, 1}, {1, 1, 1, 3}, {1, 2}, false, + {1, 1, 1, 1}, {1, 3, 
1, 1}, {1, 1, 1, 1}, {1, 3, 1, 1}, {2, 3}, {0, 1}})); + + + +struct TransposeReduceParams { + // given params + PartialShape transpose_input_shape; + std::vector transpose_order; + std::vector reduce_axes; + bool reduction_keep_dims; + + // expected params + std::vector ex_reduce_axes; + std::vector ex_transpose_order; +}; + +class TransposeSinking : public CommonTestUtils::TestsCommon, + public testing::WithParamInterface> { +public: + std::shared_ptr f, f_ref; + + void SetUp() override { + const auto& test_case = std::get<0>(GetParam()); + const auto& reduction_type_info = std::get<1>(GetParam()); + + { + auto input = std::make_shared(element::dynamic, test_case.transpose_input_shape); + + auto order = std::make_shared(element::i64, Shape{test_case.transpose_order.size()}, test_case.transpose_order); + auto transpose = std::make_shared(input, order); + + auto axes = std::make_shared( + element::i64, Shape{test_case.reduce_axes.size()}, test_case.reduce_axes); + + auto reduction = get_reduction(reduction_type_info, {transpose, axes}, test_case.reduction_keep_dims); + + f = std::make_shared(ngraph::NodeVector{reduction}, ngraph::ParameterVector{input}); + } + + { + auto input = std::make_shared(element::dynamic, test_case.transpose_input_shape); + + auto axes = std::make_shared( + element::i64, Shape{test_case.ex_reduce_axes.size()}, test_case.ex_reduce_axes); + auto reduction = get_reduction(reduction_type_info, {input, axes}, test_case.reduction_keep_dims); + + auto order = std::make_shared(element::i64, Shape{test_case.ex_transpose_order.size()}, test_case.ex_transpose_order); + auto transpose = std::make_shared(reduction, order); + + f_ref = std::make_shared(ngraph::NodeVector{transpose}, ngraph::ParameterVector{input}); + } + } +private: + std::shared_ptr get_reduction(ngraph::NodeTypeInfo reduction_type_info, const OutputVector& inputs, bool keep_dims) { + auto reduction = ngraph::helpers::getNodeSharedPtr(reduction_type_info, inputs); + if (auto arithmetic_reduce = std::dynamic_pointer_cast(reduction)) + arithmetic_reduce->set_keep_dims(keep_dims); + else if (auto logical_reduce = std::dynamic_pointer_cast(reduction)) + logical_reduce->set_keep_dims(keep_dims); + reduction->validate_and_infer_types(); + return reduction; + } +}; + +TEST_P(TransposeSinking, TransposeReduction) { + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + + auto res = compare_functions(f, f_ref, true); + +ASSERT_TRUE(res.first) << res.second; +} + + +INSTANTIATE_TEST_CASE_P(TransposeSinkingReduces, TransposeSinking, testing::Combine( + testing::Values( + TransposeReduceParams{{1, 3, 240, 140}, {0, 2, 3, 1}, {1, 2}, true, {2, 3}, {0, 2, 3, 1}}, + TransposeReduceParams{{10, 20, 30, 40, 50, 60, 70}, {0, 6, 1, 5, 2, 4, 3}, {1, 3, 6}, true, {6, 5, 3}, {0, 6, 1, 5, 2, 4, 3}}, + TransposeReduceParams{{1, 3, 240, 140}, {0, 2, 3, 1}, {1, 2}, false, {2, 3}, {0, 1}}, + TransposeReduceParams{{10, 20, 30, 40, 50, 60, 70}, {0, 6, 1, 5, 2, 4, 3}, {1, 3, 6}, false, {6, 5, 3}, {0, 1, 2, 3}}, + TransposeReduceParams{{10, 20, 30, 40, 50, 60, 70}, {0, 6, 1, 5, 2, 4, 3}, {1, -4, 6}, false, {6, 5, 3}, {0, 1, 2, 3}}, + TransposeReduceParams{{1, 3, 240, 140}, {0, 1, 2, 3}, {0, 1, 2, -1}, false, {0, 1, 2, 3}, {}}), + testing::Values( + ngraph::opset6::ReduceMax::type_info, + ngraph::opset6::ReduceMean::type_info, + ngraph::opset6::ReduceMin::type_info, + ngraph::opset6::ReduceProd::type_info, + ngraph::opset6::ReduceSum::type_info, + 
ngraph::opset6::ReduceL1::type_info, + ngraph::opset6::ReduceL2::type_info, + ngraph::opset6::ReduceLogicalAnd::type_info, + ngraph::opset6::ReduceLogicalOr::type_info))); + +INSTANTIATE_TEST_CASE_P(TransposeSinkingSqueeze, TransposeSinking, testing::Combine( + testing::Values( + TransposeReduceParams{{2, 3, 1, 1}, {0, 2, 3, 1}, {1, 2}, false, {2, 3}, {0, 1}}, + TransposeReduceParams{{10, 20, 30, 1, 50, 1, 1}, {0, 6, 1, 5, 2, 4, 3}, {1, 3, 6}, false, {6, 5, 3}, {0, 1, 2, 3}}), + testing::Values( + ngraph::opset6::Squeeze::type_info))); + diff --git a/inference-engine/tests/functional/inference_engine/transformations/transpose_to_reshape_test.cpp b/inference-engine/tests/functional/inference_engine/transformations/transpose_to_reshape_test.cpp index 7b6d2384387c3f..61ab9d3964ca7f 100644 --- a/inference-engine/tests/functional/inference_engine/transformations/transpose_to_reshape_test.cpp +++ b/inference-engine/tests/functional/inference_engine/transformations/transpose_to_reshape_test.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "common_test_utils/ngraph_test_utils.hpp" @@ -97,7 +98,7 @@ class TransposeToReshapeTests: public CommonTestUtils::TestsCommon, TEST_P(TransposeToReshapeTests, CompareFunctions) { ngraph::pass::InitNodeInfo().run_on_function(f); - ngraph::pass::AlgebraicSimplification().run_on_function(f); + ngraph::pass::TransposeSinking().run_on_function(f); f->validate_nodes_and_infer_types(); ASSERT_NO_THROW(check_rt_info(f)); auto res = compare_functions(f, f_ref); diff --git a/inference-engine/tests/functional/plugin/conformance/subgraphs_dumper/src/matchers/single_op.cpp b/inference-engine/tests/functional/plugin/conformance/subgraphs_dumper/src/matchers/single_op.cpp index 9989a37da1e75c..987744ca915198 100644 --- a/inference-engine/tests/functional/plugin/conformance/subgraphs_dumper/src/matchers/single_op.cpp +++ b/inference-engine/tests/functional/plugin/conformance/subgraphs_dumper/src/matchers/single_op.cpp @@ -4,6 +4,7 @@ #include "matchers/single_op.hpp" #include "ngraph/ops.hpp" +#include "ngraph/validation_util.hpp" #include using namespace SubgraphsDumper; @@ -121,17 +122,17 @@ bool SingleOpMatcher::match_ports(const std::shared_ptr &node, con if (std::any_of(begin(ignored_ports), end(ignored_ports), [=](size_t p){return p == port_id;})) { continue; } - const auto &cur_node_input = node->input_value(port_id).get_node_shared_ptr(); - const auto &ref_node_input = ref->input_value(port_id).get_node_shared_ptr(); + const auto &cur_node_input = node->input_value(port_id); + const auto &ref_node_input = ref->input_value(port_id); - const auto &cur_const_input = std::dynamic_pointer_cast(cur_node_input); - const auto &ref_const_input = std::dynamic_pointer_cast(ref_node_input); + const auto &cur_const_input = ngraph::get_constant_from_source(cur_node_input); + const auto &ref_const_input = ngraph::get_constant_from_source(ref_node_input); // Check that both OP an reference port inputs are constant and have same data if (cur_const_input && ref_const_input && !compare_constants_data(cur_const_input, ref_const_input)) { return false; - // Check that input nodes on the port both not constants + // Check that input nodes on the port both not constants } else if ((cur_const_input && !ref_const_input) || (!cur_const_input && ref_const_input)) { return false; } diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp 
b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp index fa6166888dfe1e..be35416515a9d7 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp @@ -36,7 +36,12 @@ const std::vector testValues = { { { 256ul, ngraph::Shape({}), {-1.28f}, {1.27f}, {-1.28f}, {1.27f} }, { 256ul, ngraph::Shape({}), {0.f}, {2.55f}, {0.f}, {2.55f} } - } + }, + // FQ with unexpected quantizationLevels + { + { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} }, + { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} } + }, }; const std::vector shapes = { diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp index f035f132e012b5..4c9d43c124f847 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp @@ -45,8 +45,7 @@ const std::vector testValues = { } }; -// TODO: Split/VariadicSplit operations are not supported in ConcatTransformation -INSTANTIATE_TEST_CASE_P(DISABLED_smoke_LPT, ConcatWithSplitTransformation, +INSTANTIATE_TEST_CASE_P(smoke_LPT, ConcatWithSplitTransformation, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(ngraph::Shape({ 1, 6, 10, 10 })), diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp index 14d92bbea12633..fdaae7a35f7c88 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp @@ -72,7 +72,7 @@ const std::vector para { {-128.f}, ngraph::element::f32, {}, false, 1ul, ngraph::element::i8, true }, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "FP32" }, @@ -126,7 +126,7 @@ const std::vector para {}, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "U8" }, @@ -177,7 +177,7 @@ const std::vector para { {128.f}, ngraph::element::f32, {}, false, 1ul, ngraph::element::i8, true }, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "FP32" }, @@ -228,7 +228,7 @@ const std::vector para {}, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "U8" }, }; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp index 4bf07c12a315fc..086a2ef6f16a29 100644 --- 
a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp @@ -27,7 +27,7 @@ const std::vector params false, {}, false, - "output", + "Convolution", "FP32" }, { @@ -35,7 +35,7 @@ const std::vector params false, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output", + "Convolution", "FP32" }, { @@ -43,7 +43,7 @@ const std::vector params false, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output_original", + "Convolution", "U8" }, { @@ -51,7 +51,7 @@ const std::vector params false, { 16ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output", + "Convolution", "FP32" }, { @@ -59,7 +59,7 @@ const std::vector params false, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { -12.7f }, { 12.7f }, { -12.7f }, { 12.7f } }, false, - "output", + "Convolution", "FP32" }, { @@ -67,7 +67,7 @@ const std::vector params false, { 16ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output", + "Convolution", "FP32" }, { @@ -75,7 +75,7 @@ const std::vector params true, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output_original", + "Convolution", "U8" }, { @@ -83,7 +83,7 @@ const std::vector params true, { 255ul, ngraph::Shape { 1 }, { 0.f }, { 254.f }, { -18.7f }, { 18.7f } }, false, - "output_original", + "Convolution", "U8" }, }; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp index 5779ec69bee152..073e13da0b78d1 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp @@ -12,8 +12,9 @@ using namespace LayerTestsDefinitions; using namespace ngraph::pass::low_precision; namespace { -const std::vector netPrecisions = { - InferenceEngine::Precision::FP32 +const std::vector netPrecisions = { + ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp index ad9678887ccfd9..34e2dbf542d46b 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp @@ -21,7 +21,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "matMul/FC", + "FullyConnected", "U8" }, // 3D 
with dequantize on weights @@ -31,7 +31,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::i8, ngraph::Shape{ 2, 4 } }, {}, { ngraph::element::f32, {}, {0.1f} }, - "matMul/FC", + "FullyConnected", "U8" }, // 3D with different values @@ -41,7 +41,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "matMul/FC", + "FullyConnected", "U8" }, // 4D with different values @@ -51,7 +51,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "matMul/FC", + "FullyConnected", "U8" }, // 4D with Dq on weights @@ -61,7 +61,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::i8, ngraph::Shape{ 2, 4 } }, {}, { ngraph::element::f32, {}, {{0.1f, 0.01}, ngraph::element::f32, ngraph::Shape{ 2, 1 }} }, - "matMul/FC", + "FullyConnected", "U8" }, // 3D with the same values @@ -71,7 +71,7 @@ std::vector testValues = { { std::vector(4 * 4, 2.f), ngraph::element::f32, ngraph::Shape{ 4, 4 } }, { 256ul, {{1}, {1}, {1}, {1}}, {-128.f}, {127.f}, {-128.f}, {127.f} }, { {}, {}, {} }, - "matMul/FC", + "FullyConnected", "U8" }, // 2D with subtract on activations @@ -81,7 +81,7 @@ std::vector testValues = { { std::vector{1, 2, 3, 4, 5, 6}, ngraph::element::f32, ngraph::Shape{ 2, 3 } }, { 256ul, {{1}, {1}, {1}, {1}}, {-128.f}, {127.f}, {-12.8f}, {12.7f} }, { {}, {}, {} }, - "matMul/1", + "FullyConnected", "U8" }, // 2D with subtract on activations & Dq on weights @@ -91,7 +91,7 @@ std::vector testValues = { { std::vector{1, 2, 3, 4, 5, 6}, ngraph::element::i8, ngraph::Shape{ 2, 3 } }, {}, { ngraph::element::f32, {}, {0.1f} }, - "matMul/1", + "FullyConnected", "U8" } }; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp index 471a9ad4868863..5decd0d9c8824e 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp @@ -53,7 +53,9 @@ const std::map>> activationTypes {HSigmoid, {}}, {RoundHalfToEven, {}}, {RoundHalfAwayFromZero, {}}, - {Erf, {}} + {Erf, {}}, + {GeluErf, {}}, + {GeluTanh, {}} }; const std::map>> activationParamTypes = { diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/binary_convolution.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/binary_convolution.cpp index fcb8f792d3bbbf..d451e80b292067 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/binary_convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/binary_convolution.cpp @@ -12,7 +12,8 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { - InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16}; + InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I32}; /* ============= 2D Binary Convolution ============= */ const std::vector> kernels = {{3, 3}, {3, 5}}; diff --git 
a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/clamp.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/clamp.cpp new file mode 100644 index 00000000000000..200350fd316067 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/clamp.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "single_layer_tests/clamp.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; + +const std::vector> inShapes = { + {50}, + {10, 10}, + {1, 20, 20} +}; + +const std::vector> intervals = { + {-20.1, -10.5}, + {-10.0, 10.0}, + {10.3, 20.4} +}; + +const std::vector> intervals_unsigned = { + {0.1, 10.1}, + {10.0, 100.0}, + {10.6, 20.6} +}; + +const std::vector netPrc = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I64, + InferenceEngine::Precision::I32 +}; + +const auto test_Clamp_signed = ::testing::Combine( + ::testing::ValuesIn(inShapes), + ::testing::ValuesIn(intervals), + ::testing::ValuesIn(netPrc), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + +const auto test_Clamp_unsigned = ::testing::Combine( + ::testing::ValuesIn(inShapes), + ::testing::ValuesIn(intervals_unsigned), + ::testing::Values(InferenceEngine::Precision::U64), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + +INSTANTIATE_TEST_CASE_P(smoke_TestsClamp_signed, ClampLayerTest, test_Clamp_signed, ClampLayerTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_TestsClamp_unsigned, ClampLayerTest, test_Clamp_unsigned, ClampLayerTest::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/convolution.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/convolution.cpp index 020809776338a2..c82e332785f7d1 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/convolution.cpp @@ -12,7 +12,8 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { - InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16}; + InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I32}; /* ============= 1D Convolution ============= */ const std::vector> kernels1D = {{3}, {5}}; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution.cpp index 1a0e4305d8ff59..417ad9d0130096 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution.cpp @@ -12,7 +12,8 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { - InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16}; + InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I32}; /* ============= 1D GroupConvolution ============= */ const std::vector> kernels1d = {{3}}; diff --git 
a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution_backprop_data.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution_backprop_data.cpp index 5c7c79255a1431..70a1151d9bae1d 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution_backprop_data.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/group_convolution_backprop_data.cpp @@ -13,8 +13,8 @@ namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16 -}; + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I32}; const std::vector numOutChannels = {16, 32}; const std::vector numGroups = {2, 8, 16}; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/split.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/split.cpp index 814c97988060f3..35d3149dcc361c 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/split.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/split.cpp @@ -13,12 +13,14 @@ namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, - InferenceEngine::Precision::FP16 + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I32, + InferenceEngine::Precision::U8 }; INSTANTIATE_TEST_CASE_P(smoke_NumSplitsCheck, SplitLayerTest, ::testing::Combine( - ::testing::Values(1), + ::testing::Values(1, 2, 3, 5), ::testing::Values(0, 1, 2, 3), ::testing::ValuesIn(netPrecisions), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/strided_slice.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/strided_slice.cpp index 401e5360eb15f1..01fb2ff0051bd3 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/strided_slice.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/strided_slice.cpp @@ -12,19 +12,27 @@ using namespace LayerTestsDefinitions; namespace { std::vector ss_only_test_cases = { + StridedSliceSpecificParams{ { 16 }, { 4 }, { 12 }, { 1 }, + { 0 }, { 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 16 }, { 0 }, { 8 }, { 2 }, + { 1 }, { 0 }, { }, { }, { } }, StridedSliceSpecificParams{ { 128, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 1, 1, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 128, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 1, 1, 1}, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 1, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, + StridedSliceSpecificParams{ { 2, 3 }, { 1, 0 }, { 2, 3 }, { 1, 1 }, + { 0, 0 }, { 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 10, 3 }, { 0, 0 }, { 20, 20 }, { 1, 1 }, + { 0, 1 }, { 0, 1 }, { }, { }, { } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, -1, 0 }, { 0, 0, 0 }, { 1, 1, 1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 1, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 9, 0 }, { 0, 11, 0 }, { 1, 1, 1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 1, 0 }, { 0, -1, 0 }, { 1, 1, 1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 
0, 0, 0 } }, - StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 9, 0 }, { 0, 7, 0 }, { -1, -1, -1 }, + StridedSliceSpecificParams{ { 2, 12, 100 }, { 0, 9, 0 }, { 0, 7, 0 }, { -1, -1, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, - StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 7, 0 }, { 0, 9, 0 }, { -1, 1, -1 }, + StridedSliceSpecificParams{ { 2, 12, 100 }, { 0, 7, 0 }, { 0, 9, 0 }, { -1, 1, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 4, 0 }, { 0, 9, 0 }, { -1, 2, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, @@ -32,34 +40,72 @@ std::vector ss_only_test_cases = { { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 9, 0 }, { 0, 4, 0 }, { -1, -2, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, - StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 10, 0 }, { 0, 4, 0 }, { -1, -2, -1 }, + StridedSliceSpecificParams{ { 2, 12, 100 }, { 0, 10, 0 }, { 0, 4, 0 }, { -1, -2, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, 11, 0 }, { 0, 0, 0 }, { -1, -2, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100 }, { 0, -6, 0 }, { 0, -8, 0 }, { -1, -2, -1 }, { 1, 0, 1 }, { 1, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, + StridedSliceSpecificParams{ { 20, 10, 5 }, { 0, 0, 0 }, { 3, 10, 0 }, { 1, 1, 1 }, + { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } }, + StridedSliceSpecificParams{ { 1, 10, 20 }, { 0, 0, 2 }, { 0, 0, 1000 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 1, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 10, 10 }, { 0, 1, 0 }, { 0, 1000, 0 }, { 1, 1, 1 }, + { 1, 0, 1 }, { 1, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 10, 4 }, { 0, 0, 0 }, { 0, 0, 2 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 1, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 10, 4 }, { 0, 0, 2 }, { 0, 0, 1000 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 1, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 10, 2 }, { 0, 0, 0 }, { 0, 0, 1 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 1, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 10, 2 }, { 0, 0, 0 }, { 1000, 0, 0 }, { 1, 1, 1 }, + { 0, 1, 1 }, { 0, 1, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 10, 2 }, { 0, 0, 0 }, { 0, 1000, 0 }, { 1, 1, 1 }, + { 1, 0, 1 }, { 1, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 20, 10, 5 }, { 0, 3 }, { 0, 4 }, { 1, 1 }, + { 1, 0 }, { 1, 0 }, { }, { }, { 1, 0 } }, + StridedSliceSpecificParams{ { 20, 10, 5 }, { 0, 0 }, { 0, -1 }, { 1, 1 }, + { 1, 0 }, { 1, 0 }, { }, { }, { 1, 0 } }, StridedSliceSpecificParams{ { 1, 12, 100, 1, 1 }, { 0, -1, 0, 0 }, { 0, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 0, 1, 0 }, { 1, 0, 1, 0 }, { }, { 0, 1, 0, 1 }, {} }, StridedSliceSpecificParams{ { 2, 2, 2, 2 }, { 0, 0, 0, 0 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 }, - {1, 1, 1, 1}, {1, 1, 1, 1}, {}, {}, {} }, + { 1, 1, 1, 1}, { 1, 1, 1, 1}, {}, {}, {} }, + StridedSliceSpecificParams{ { 2, 2, 2, 2 }, { 0, 0 }, { 2, 2 }, { 1, 1 }, + { 1, 1 }, { 1, 1 }, {}, {}, {} }, + StridedSliceSpecificParams{ { 2, 2, 3, 3 }, { 0, -2, -2 }, { 2, -1, -1 }, { 1, 1, 1 }, + { 1, 0 }, { 1, 0 }, {}, {}, {} }, StridedSliceSpecificParams{ { 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 }, - {0, 0, 0, 0}, {1, 1, 1, 1}, {}, {}, {} }, + { 0, 0, 0, 0}, { 1, 1, 1, 1}, {}, {}, {} }, 
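// Annotation (not part of the patch): each StridedSliceSpecificParams entry above and below packs,
// in order, inputShape, begin, end, strides, beginMask, endMask, newAxisMask, shrinkAxisMask and
// ellipsisAxisMask -- the same order in which the getTestCaseName printers later in this diff
// serialize them. For example, the newly added 1-D case
//   StridedSliceSpecificParams{ { 16 }, { 4 }, { 12 }, { 1 }, { 0 }, { 0 }, { }, { }, { } }
// slices a 16-element tensor from index 4 up to (but not including) 12 with step 1, i.e. 8 values;
// in cases such as { 1, 12, 100 } with begin/end masks of { 1, 0, 1 }, the axes where the mask is 1
// keep their full extent and only axis 1 is restricted by the begin/end values.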
StridedSliceSpecificParams{ { 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 }, - {0, 0, 0, 0}, {0, 0, 0, 0}, {}, {}, {} }, + { 0, 0, 0, 0}, { 0, 0, 0, 0}, {}, {}, {} }, + StridedSliceSpecificParams{ { 1, 2, 6, 4 }, { 0, 0, 4, 0 }, { 1, 2, 6, 4 }, { 1, 1, 1, 1 }, + {}, {}, {}, {}, {} }, + StridedSliceSpecificParams{ { 1, 2, 6, 4 }, { 0, 0, -3, 0 }, { 1, 2, 6, 4 }, { 1, 1, 1, 1 }, + {}, {}, {}, {}, {} }, + StridedSliceSpecificParams{ { 1, 2, 6, 4 }, { 0, 0, 4, 0 }, { 1, 2, 6, 4 }, { 1, 1, 1, 1 }, + { 1, 1, 0, 1}, { 1, 1, 1, 1}, {}, {}, {} }, + StridedSliceSpecificParams{ { 10, 2, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 2, 1, 1, 1 }, + { 1, 1, 1, 1}, { 1, 1, 1, 1}, {}, {}, {} }, StridedSliceSpecificParams{ { 2, 2, 4, 3 }, { 0, 0, 0, 0 }, { 2, 2, 4, 3 }, { 1, 1, 2, 1 }, - {1, 1, 1, 1}, {1, 1, 1, 1}, {}, {}, {} }, + { 1, 1, 1, 1}, { 1, 1, 1, 1}, {}, {}, {} }, StridedSliceSpecificParams{ { 2, 2, 4, 2 }, { 1, 0, 0, 1 }, { 2, 2, 4, 2 }, { 1, 1, 2, 1 }, - {0, 1, 1, 0}, {1, 1, 0, 0}, {}, {}, {} }, + { 0, 1, 1, 0}, { 1, 1, 0, 0}, {}, {}, {} }, StridedSliceSpecificParams{ { 1, 2, 4, 2 }, { 1, 0, 0, 0 }, { 1, 2, 4, 2 }, { 1, 1, -2, -1 }, - {1, 1, 1, 1}, {1, 1, 1, 1}, {}, {}, {} }, + { 1, 1, 1, 1}, { 1, 1, 1, 1}, {}, {}, {} }, StridedSliceSpecificParams{ { 2, 2, 4, 2 }, { 1, 0, 0, 0 }, { 1, 2, 4, 2 }, { 1, 1, -2, -1 }, - {0, 1, 1, 1}, {1, 1, 1, 1}, {}, {}, {} }, + { 0, 1, 1, 1}, { 1, 1, 1, 1}, {}, {}, {} }, StridedSliceSpecificParams{ { 2, 3, 4, 5, 6 }, { 0, 1, 0, 0, 0 }, { 2, 3, 4, 5, 6 }, { 1, 1, 1, 1, 1 }, - {1, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {}, {0, 1, 0, 0, 0}, {} }, + { 1, 0, 1, 1, 1}, { 1, 0, 1, 1, 1}, {}, { 0, 1, 0, 0, 0}, {} }, StridedSliceSpecificParams{ { 10, 12 }, { -1, 1 }, { -9999, 0 }, { -1, 1 }, - { 0, 1 }, { 0, 1 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }, + { 0, 1 }, { 0, 1 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }, StridedSliceSpecificParams{ { 5, 5, 5, 5 }, { -1, 0, -1, 0 }, { -50, 0, -60, 0 }, { -1, 1, -1, 1 }, - { 0, 0, 0, 0 }, { 0, 1, 0, 1 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 } }, + { 0, 0, 0, 0 }, { 0, 1, 0, 1 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 } }, + StridedSliceSpecificParams{ { 1, 2, 4 }, { 0, 2000, 3, 5 }, { 0, 0, 0, 2 }, { 1, 1, 1, 1 }, + { 1, 0, 1, 1 }, { 1, 0, 1, 0 }, { 0, 1, 0, 0 }, { }, { } }, + StridedSliceSpecificParams{ { 2, 2, 4, 4 }, { 0, 0, 0, 1, 0 }, { 0, 0, 0, 2, 0 }, { 1, 1, 1, 1, 1 }, + { 1, 1, 1, 0, 1 }, { 1, 1, 1, 0, 1 }, { 0, 1, 0, 0, 0 }, { }, { } }, + StridedSliceSpecificParams{ { 2, 2, 2, 4, 4 }, { 0, 0, 0, 1, 0 }, { 0, 0, 0, 2, 0 }, { 1, 1, 1, 1, 1 }, + { 1, 1, 1, 0, 1 }, { 1, 1, 1, 0, 1 }, { }, { 0, 1, 0, 0, 0 }, { } }, }; INSTANTIATE_TEST_CASE_P( @@ -75,4 +121,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(std::map())), StridedSliceLayerTest::getTestCaseName); -} // namespace \ No newline at end of file +} // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index ae1c768f27353d..ca02fd00845c81 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -27,10 +27,6 @@ std::vector disabledTestPatterns() { R"(.*RangeLayerTest.*)", R"(.*(RangeAddSubgraphTest).*Start=1.2.*Stop=(5.2|-5.2).*Step=(0.1|-0.1).*netPRC=FP16.*)", R"(.*(RangeNumpyAddSubgraphTest).*netPRC=FP16.*)", - // TODO: Issue: 34083 -#if (defined(_WIN32) || defined(_WIN64)) - 
R"(.*(CoreThreadingTestsWithIterations).*(smoke_LoadNetworkAccuracy).*)", -#endif // TODO: Issue: 43793 R"(.*(PreprocessTest).*(SetScalePreProcessSetBlob).*)", R"(.*(PreprocessTest).*(SetScalePreProcessGetBlob).*)", @@ -58,7 +54,10 @@ std::vector disabledTestPatterns() { // Skip platforms that do not support BF16 (i.e. sse, avx, avx2) R"(.*BF16.*(jit_avx(?!5)|jit_sse).*)", // TODO: Incorrect blob sizes for node BinaryConvolution_X - R"(.*BinaryConvolutionLayerTest.*)" + R"(.*BinaryConvolutionLayerTest.*)", + // TODO: 51676. Incorrect conversion of min and max limits from double to integral + R"(.*ClampLayerTest.*netPrc=(I64|I32).*)", + R"(.*ClampLayerTest.*netPrc=U64.*)" }; if (!InferenceEngine::with_cpu_x86_avx512_core()) { diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp index 2ecfe958f5ffc6..3b8d372ef827b4 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp @@ -76,14 +76,15 @@ const std::map>> activationTypes {Sigmoid, {{}}}, {Tanh, {{}}}, {Relu, {{}}}, - {Gelu, {{}}}, {Exp, {{}}}, {Clamp, {{-2.0f, 2.0f}}}, {Elu, {{0.1f}}}, {Swish, {{0.1f}}}, {HSwish, {{}}}, {Mish, {{}}}, - {PReLu, {{-0.01f}}} + {PReLu, {{-0.01f}}}, + {GeluErf, {{}}}, + {GeluTanh, {{}}} }; std::vector cpuParams_4D = { diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/crop.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/crop.cpp deleted file mode 100644 index 5bf5d795c8060a..00000000000000 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/crop.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include "ngraph_functions/builders.hpp" -#include "test_utils/cpu_test_utils.hpp" - -// Given that the ngraph opset does not contain crop operation, we use the StridedSlice operation instead, since it is mapped to the Crop node if certain -// conditions are met. 
- -using namespace InferenceEngine; -using namespace CPUTestUtils; -using namespace LayerTestsDefinitions; - -namespace CPULayerTestsDefinitions { - -typedef std::tuple< - StridedSliceSpecificParams, - InferenceEngine::Precision, // Net precision - std::string, // Device name - std::map, // Additional network configuration - CPUSpecificParams> CropLayerCPUTestParamSet; - -class CropLayerCPUTest : public testing::WithParamInterface, - virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { -public: - static std::string getTestCaseName(testing::TestParamInfo obj) { - StridedSliceSpecificParams params; - InferenceEngine::Precision netPrc; - std::string targetName; - std::map additionalConfig; - CPUSpecificParams cpuParams; - std::tie(params, netPrc, targetName, additionalConfig, cpuParams) = obj.param; - - std::ostringstream result; - result << "inShape=" << CommonTestUtils::vec2str(params.inputShape) << "_"; - result << "netPRC=" << netPrc.name() << "_"; - result << "begin=" << CommonTestUtils::vec2str(params.begin) << "_"; - result << "end=" << CommonTestUtils::vec2str(params.end) << "_"; - result << "stride=" << CommonTestUtils::vec2str(params.strides) << "_"; - result << "begin_m=" << CommonTestUtils::vec2str(params.beginMask) << "_"; - result << "end_m=" << CommonTestUtils::vec2str(params.endMask) << "_"; - if (!params.newAxisMask.empty()) { - result << "new_axis_m=" << (params.newAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.newAxisMask)) << "_"; - } - if (!params.shrinkAxisMask.empty()) { - result << "shrink_m=" << (params.shrinkAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.shrinkAxisMask)) << "_"; - } - if (!params.ellipsisAxisMask.empty()) { - result << "ellipsis_m=" << (params.ellipsisAxisMask.empty() ? 
"def" : CommonTestUtils::vec2str(params.ellipsisAxisMask)) << "_"; - } - result << "trgDev=" << targetName; - result << CPUTestsBase::getTestCaseName(cpuParams); - - return result.str(); - } -protected: - void SetUp() override { - StridedSliceSpecificParams ssParams; - InferenceEngine::Precision netPrecision; - std::map additionalConfig; - CPUSpecificParams cpuParams; - std::tie(ssParams, netPrecision, targetDevice, additionalConfig, cpuParams) = this->GetParam(); - inPrc = outPrc = netPrecision; // because crop does not convert Precisions, but only moves the data - std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; - configuration.insert(additionalConfig.begin(), additionalConfig.end()); - - auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); - auto params = ngraph::builder::makeParams(ngPrc, {ssParams.inputShape}); - auto paramOuts = ngraph::helpers::convert2OutputVector( - ngraph::helpers::castOps2Nodes(params)); - auto ss = ngraph::builder::makeStridedSlice(paramOuts[0], ssParams.begin, ssParams.end, ssParams.strides, ngPrc, ssParams.beginMask, - ssParams.endMask, ssParams.newAxisMask, ssParams.shrinkAxisMask, ssParams.ellipsisAxisMask); - - selectedType = std::string("unknown_") + inPrc.name(); - - ss->get_rt_info() = getCPUInfo(); - - ngraph::ResultVector results{std::make_shared(ss)}; - function = std::make_shared(results, params, "StridedSlice"); - } -}; - -TEST_P(CropLayerCPUTest, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED() - - Run(); - CheckPluginRelatedResults(executableNetwork, "Crop"); -} - -namespace { -const std::map additional_config; - -const std::vector netPrc = {Precision::BF16, Precision::FP32}; - -const std::vector testCasesPlain2D = {StridedSliceSpecificParams{ { 32, 32 }, { 0, 20 }, { 32, 30 }, { 1, 1 }, - { 0, 0 }, { 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 32, 20 }, { 2, 10 }, { 32, 20 }, { 1, 1 }, - { 0, 0 }, { 0, 0 }, { }, { }, { } } }; - -const auto CropParamsPlain2D = ::testing::Combine( - ::testing::ValuesIn(testCasesPlain2D), - ::testing::ValuesIn(netPrc), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), - ::testing::Values(emptyCPUSpec)); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_2D, CropLayerCPUTest, CropParamsPlain2D, CropLayerCPUTest::getTestCaseName); - -const std::vector testCasesPlain4D = { - StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 25 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 1, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 5, 32, 20 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } } -}; - -std::vector cpuParams_4D = { - CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), - CPUSpecificParams({nchw}, {nchw}, {}, {}) -}; - -const auto CropParamsPlain4D = ::testing::Combine( - ::testing::ValuesIn(testCasesPlain4D), - ::testing::ValuesIn(netPrc), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), - ::testing::Values(cpuParams_4D.at(1))); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_4D, CropLayerCPUTest, CropParamsPlain4D, CropLayerCPUTest::getTestCaseName); - -const std::vector testCasesBlocked4D = 
{ - StridedSliceSpecificParams{ { 1, 16, 32, 32 }, { 0, 0, 20, 20 }, { 1, 16, 25, 25 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 1, 32, 32, 32 }, { 0, 0, 0, 20 }, { 1, 16, 32, 30 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, -}; - -const auto CropParamsBlocked4D = ::testing::Combine( - ::testing::ValuesIn(testCasesBlocked4D), - ::testing::ValuesIn(netPrc), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config), - ::testing::Values(filterCPUSpecificParams(cpuParams_4D).front())); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4D, CropLayerCPUTest, CropParamsBlocked4D, CropLayerCPUTest::getTestCaseName); - -const std::vector testCasesPlain4DynBatch = { - StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 25 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 10, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, - StridedSliceSpecificParams{ { 10, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 5, 32, 20 }, { 1, 1, 1, 1 }, - { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } } -}; - -std::map additional_config_dyn_batch = {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}, - {PluginConfigParams::KEY_DYN_BATCH_ENABLED, PluginConfigParams::YES}}; - -const auto CropParamsPlain4DynBatch = ::testing::Combine( - ::testing::ValuesIn(testCasesPlain4DynBatch), - ::testing::ValuesIn(netPrc), - ::testing::Values(CommonTestUtils::DEVICE_CPU), - ::testing::Values(additional_config_dyn_batch), - ::testing::Values(cpuParams_4D.at(1))); - -INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4DynBatch, CropLayerCPUTest, CropParamsPlain4DynBatch, CropLayerCPUTest::getTestCaseName); -} // namespace -} // namespace CPULayerTestsDefinitions - diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/strided_slice.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/strided_slice.cpp new file mode 100644 index 00000000000000..c4746897552ef2 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/strided_slice.cpp @@ -0,0 +1,304 @@ +// Copyright (C) 2020-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "ngraph_functions/builders.hpp" +#include "test_utils/cpu_test_utils.hpp" + + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace LayerTestsDefinitions; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + StridedSliceSpecificParams, + InferenceEngine::Precision, // Net precision + std::string, // Device name + std::map, // Additional network configuration + CPUSpecificParams> StridedSliceLayerCPUTestParamSet; + +class StridedSliceLayerCPUTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + StridedSliceSpecificParams params; + InferenceEngine::Precision netPrc; + std::string targetName; + std::map additionalConfig; + CPUSpecificParams cpuParams; + std::tie(params, netPrc, targetName, additionalConfig, cpuParams) = obj.param; + + std::ostringstream result; + result << "inShape=" << CommonTestUtils::vec2str(params.inputShape) << 
"_"; + result << "netPRC=" << netPrc.name() << "_"; + result << "begin=" << CommonTestUtils::vec2str(params.begin) << "_"; + result << "end=" << CommonTestUtils::vec2str(params.end) << "_"; + result << "stride=" << CommonTestUtils::vec2str(params.strides) << "_"; + result << "begin_m=" << CommonTestUtils::vec2str(params.beginMask) << "_"; + result << "end_m=" << CommonTestUtils::vec2str(params.endMask) << "_"; + result << "new_axis_m=" << (params.newAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.newAxisMask)) << "_"; + result << "shrink_m=" << (params.shrinkAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.shrinkAxisMask)) << "_"; + result << "ellipsis_m=" << (params.ellipsisAxisMask.empty() ? "def" : CommonTestUtils::vec2str(params.ellipsisAxisMask)) << "_"; + result << "trgDev=" << targetName; + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } +protected: + void SetUp() override { + StridedSliceSpecificParams ssParams; + InferenceEngine::Precision netPrecision; + std::map additionalConfig; + CPUSpecificParams cpuParams; + std::tie(ssParams, netPrecision, targetDevice, additionalConfig, cpuParams) = this->GetParam(); + inPrc = outPrc = netPrecision; + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + configuration.insert(additionalConfig.begin(), additionalConfig.end()); + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {ssParams.inputShape}); + auto paramOuts = ngraph::helpers::convert2OutputVector( + ngraph::helpers::castOps2Nodes(params)); + auto ss = ngraph::builder::makeStridedSlice(paramOuts[0], ssParams.begin, ssParams.end, ssParams.strides, ngPrc, ssParams.beginMask, + ssParams.endMask, ssParams.newAxisMask, ssParams.shrinkAxisMask, ssParams.ellipsisAxisMask); + + selectedType = std::string("ref_") + inPrc.name(); + + ss->get_rt_info() = getCPUInfo(); + + ngraph::ResultVector results{std::make_shared(ss)}; + function = std::make_shared(results, params, "StridedSlice"); + } +}; + +TEST_P(StridedSliceLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); + CheckPluginRelatedResults(executableNetwork, "StridedSlice"); +} + +namespace { + +const auto cpuParams_nChw16c = CPUSpecificParams {{nChw16c}, {nChw16c}, {}, {}}; +const auto cpuParams_nCdhw16c = CPUSpecificParams {{nCdhw16c}, {nCdhw16c}, {}, {}}; + +const auto cpuParams_nChw8c = CPUSpecificParams {{nChw8c}, {nChw8c}, {}, {}}; +const auto cpuParams_nCdhw8c = CPUSpecificParams {{nCdhw8c}, {nCdhw8c}, {}, {}}; + +const auto cpuParams_nhwc = CPUSpecificParams {{nhwc}, {nhwc}, {}, {}}; +const auto cpuParams_ndhwc = CPUSpecificParams {{ndhwc}, {ndhwc}, {}, {}}; + +const auto cpuParams_nchw = CPUSpecificParams {{nchw}, {nchw}, {}, {}}; +const auto cpuParams_ncdhw = CPUSpecificParams {{ncdhw}, {ncdhw}, {}, {}}; + +const std::map additional_config; + +const std::vector inputPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::BF16, + InferenceEngine::Precision::I8 +}; + +const std::vector testCasesPlain2D = { + StridedSliceSpecificParams{ { 32, 32 }, { 0, 20 }, { 32, 30 }, { 1, 1 }, + { 0, 0 }, { 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 32, 20 }, { 2, 10 }, { 32, 20 }, { 1, 1 }, + { 0, 0 }, { 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 32, 20 }, { 2, 10 }, { 32, 20 }, { 1, 2 }, + { 0, 1 }, { 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 32, 20 }, { 2, 10 }, { 32, 20 }, { 2, 1 }, + { 0, 0 }, { 1, 0 
}, { }, { }, { } }, +}; + +const auto StridedSliceParamsPlain2D = ::testing::Combine( + ::testing::ValuesIn(testCasesPlain2D), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::Values(emptyCPUSpec)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Plain_2D, StridedSliceLayerCPUTest, StridedSliceParamsPlain2D, StridedSliceLayerCPUTest::getTestCaseName); + +const std::vector testCasesCommon4D = { + StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 2, 5, 4 }, { 1, 4, 28, 27 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 20 }, { 0, 1, 0, 0 }, { 1, 3, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 5, 32, 20 }, { 0, 0, 10, 0 }, { 1, 3, 20, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 1, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 32 }, { 0, 0, 20, 20 }, { 1, 5, 25, 26 }, { 1, 1, 1, 2 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 5, 32, 32 }, { 0, 0, 0, 20 }, { 1, 2, 30, 30 }, { 1, 1, 2, 1 }, + { 0, 0, 0, 1 }, { 0, 1, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 20 }, { 0, 0, 2, 10 }, { 1, 3, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 1, 1 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 5, 32, 32 }, { 0, 1, 0, 10 }, { 1, 5, 32, 30 }, { 1, 1, 1, 1 }, + { 0, 1, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 32, 20 }, { 0, 1, 2, 10 }, { 1, 5, 32, 18 }, { 1, 1, 1, 2 }, + { 0, 0, 1, 0 }, { 0, 0, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 8, 32, 20 }, { 0, 0, 2, 10 }, { 1, 8, 32, 18 }, { 1, 2, 1, 2 }, + { 0, 0, 1, 0 }, { 0, 0, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 8, 32, 20 }, { 0, 0, 10 }, { 0, 32, 18 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 1, 1, 0 }, { }, { }, { 1, 0, 0 } }, + StridedSliceSpecificParams{ { 2, 8, 32, 20 }, { 0, 0, 10 }, { 1, 0, 20 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 0, 1, 1 }, { }, { }, { 0, 1, 0 } }, + StridedSliceSpecificParams{ { 2, 8, 32, 20 }, { 0, 4, 10 }, { 2, 8, 0 }, { 1, 1, 1 }, + { 1, 0, 1 }, { 1, 1, 1 }, { }, { }, { 0, 0, 1 } } +}; + +const std::vector CPUParamsCommon4D = { + cpuParams_nchw, + cpuParams_nhwc, +}; + +const auto StridedSliceParamsCommon4D = ::testing::Combine( + ::testing::ValuesIn(testCasesCommon4D), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::ValuesIn(CPUParamsCommon4D)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Common_4D, StridedSliceLayerCPUTest, StridedSliceParamsCommon4D, StridedSliceLayerCPUTest::getTestCaseName); + +const std::vector testCasesBlocked4D = { + StridedSliceSpecificParams{ { 1, 16, 32, 32 }, { 0, 0, 5, 4 }, { 1, 16, 28, 27 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 10, 10 }, { 0, 16, 0, 0 }, { 1, 32, 10, 10 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 16, 32, 20 }, { 0, 0, 10, 0 }, { 1, 16, 20, 10 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 32, 32 }, { 0, 0, 20, 20 }, { 1, 32, 25, 25 }, { 1, 1, 1, 1 }, + { 0, 1, 0, 0 }, { 0, 1, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 48, 32, 32 }, { 0, 16, 0, 20 }, { 1, 32, 32, 30 }, { 1, 1, 1, 2 }, + { 1, 0, 
1, 0 }, { 1, 0, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 32, 20 }, { 0, 16, 2, 10 }, { 1, 32, 32, 20 }, { 1, 1, 2, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 32, 20 }, { 0, 16, 0, 0 }, { 2, 64, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 32, 20 }, { 0, 32, 0, 0 }, { 2, 50, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 32, 20 }, { 0, 0, 0, 0 }, { 2, 12, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 64, 32, 20 }, { 0, -16, 0, 10 }, { 2, 100, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 32, 20 }, { 0, -16, 0, 0 }, { 2, -4, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 32, 20 }, { 0, -32, 0, 0 }, { 2, -12, 32, 20 }, { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 32, 20 }, { 0, 10 }, { 0, 20 }, { 1, 1 }, + { 1, 0 }, { 1, 0 }, { }, { }, { 1, 0 } }, + StridedSliceSpecificParams{ { 2, 32, 32, 20 }, { 0, 16, 0 }, { 2, 32, 0 }, { 1, 1, 1 }, + { 1, 0, 1 }, { 1, 1, 1 }, { }, { }, { 0, 0, 1 } }, +}; + +const std::vector CPUParamsBlocked4D = { + cpuParams_nChw16c, + cpuParams_nChw8c, +}; + +const auto StridedSliceParamsBlocked4D = ::testing::Combine( + ::testing::ValuesIn(testCasesBlocked4D), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::ValuesIn(CPUParamsBlocked4D)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_4D, StridedSliceLayerCPUTest, StridedSliceParamsBlocked4D, StridedSliceLayerCPUTest::getTestCaseName); + +const std::vector testCasesCommon5D = { + StridedSliceSpecificParams{ { 1, 5, 20, 32, 32 }, { 0, 2, 0, 5, 4 }, { 1, 4, 5, 28, 27 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 5, 20, 32, 20 }, { 0, 0, 10, 0, 0 }, { 1, 5, 20, 32, 20 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 20, 32, 20 }, { 0, 1, 10, 0, 0 }, { 1, 3, 20, 32, 20 }, { 1, 1, 1, 1, 1 }, + { 0, 1, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 20, 32, 32 }, { 0, 0, 0, 20, 20 }, { 1, 5, 20, 30, 26 }, { 1, 1, 1, 2, 2 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 5, 20, 32, 32 }, { 0, 0, 10, 0, 20 }, { 1, 2, 20, 30, 30 }, { 1, 1, 2, 1, 1 }, + { 0, 0, 0, 0, 1 }, { 0, 1, 0, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 20, 32, 32 }, { 0, 0, 2, 10, 0 }, { 1, 5, 10, 32, 20 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 1, 1, 0 }, { 0, 0, 0, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 5, 20, 32, 32 }, { 0, 1, 0, 10, 0 }, { 1, 5, 20, 32, 32 }, { 1, 1, 1, 1, 1 }, + { 0, 1, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 5, 20, 32, 32 }, { 0, 0, 0, 0, 0 }, { 1, 5, 10, 16, 16 }, { 1, 1, 2, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 8, 20, 32, 32 }, { 0, 2, 0, 0, 0 }, { 1, 8, 10, 16, 16 }, { 1, 2, 1, 1, 2 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 8, 20, 32, 32 }, { 0, 2, 0, 0, 16 
}, { 2, 8, 20, 32, 32 }, { 1, 2, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 8, 10, 10, 10 }, { 0, 5 }, { 0, 10 }, { 1, 1 }, + { 1, 0 }, { 1, 0 }, { }, { }, { 1, 0 } }, + StridedSliceSpecificParams{ { 2, 8, 10, 10, 10 }, { 0, 0, 5 }, { 0, 0, 10 }, { 1, 1, 1 }, + { 1, 1, 0 }, { 1, 1, 0 }, { }, { }, { 0, 1, 0 } }, + StridedSliceSpecificParams{ { 2, 8, 10, 10, 10 }, { 0, 2, 0 }, { 2, 8, 0 }, { 1, 1, 1 }, + { 1, 0, 1 }, { 1, 1, 1 }, { }, { }, { 0, 0, 1 } } +}; + +const std::vector CPUParamsCommon5D = { + cpuParams_ncdhw, + cpuParams_ndhwc, +}; + +const auto StridedSliceParamsCommon5D = ::testing::Combine( + ::testing::ValuesIn(testCasesCommon5D), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::ValuesIn(CPUParamsCommon5D)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Common_5D, StridedSliceLayerCPUTest, StridedSliceParamsCommon5D, StridedSliceLayerCPUTest::getTestCaseName); + +const std::vector testCasesBlocked5D = { + StridedSliceSpecificParams{ { 1, 16, 20, 32, 32 }, { 0, 0, 0, 5, 4 }, { 1, 16, 5, 28, 27 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 32, 20, 32, 20 }, { 0, 0, 10, 0, 0 }, { 1, 16, 20, 32, 20 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 16, 20, 32, 20 }, { 0, 0, 10, 0, 0 }, { 1, 16, 20, 32, 20 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 1, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 16, 20, 32, 32 }, { 0, 0, 0, 20, 20 }, { 1, 16, 20, 30, 26 }, { 1, 1, 1, 2, 2 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 16, 20, 32, 32 }, { 0, 0, 10, 0, 20 }, { 1, 16, 20, 30, 30 }, { 1, 1, 2, 1, 1 }, + { 0, 0, 0, 0, 1 }, { 0, 1, 0, 1, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 16, 20, 32, 32 }, { 0, 0, 2, 10, 0 }, { 1, 16, 10, 32, 20 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 1, 1, 0 }, { 0, 0, 0, 0, 1 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 1, 16, 20, 32, 32 }, { 0, 0, 0, 10, 0 }, { 1, 8, 20, 32, 32 }, { 1, 1, 1, 1, 1 }, + { 0, 1, 0, 0, 0 }, { 0, 1, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 16, 20, 32, 32 }, { 0, 0, 0, 0, 0 }, { 1, 16, 10, 16, 16 }, { 1, 1, 2, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 0, 0, 0, 0 }, { 1, 25, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 16, 0, 0, 0 }, { 1, 25, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 16, 0, 0, 0 }, { 1, 64, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 0, 0, 0, 0 }, { 2, 25, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 0, 0, 0, 0 }, { 2, 60, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 32, 0, 0, 0 }, { 2, 40, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } }, + StridedSliceSpecificParams{ { 2, 64, 20, 10, 10 }, { 0, 16, 0, 
0, 0 }, { 2, 64, 20, 10, 10 }, { 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0 }, { }, { }, { } } +}; + +const std::vector CPUParamsBlocked5D = { + cpuParams_nCdhw16c, + cpuParams_nCdhw8c, +}; + +const auto StridedSliceParamsBlocked5D = ::testing::Combine( + ::testing::ValuesIn(testCasesBlocked5D), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config), + ::testing::ValuesIn(CPUParamsBlocked5D)); + +INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_Blocked_5D, StridedSliceLayerCPUTest, StridedSliceParamsBlocked5D, StridedSliceLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions + diff --git a/inference-engine/tests/functional/plugin/gna/Import_export_tests/import_export_act_conv_act.cpp b/inference-engine/tests/functional/plugin/gna/Import_export_tests/import_export_act_conv_act.cpp new file mode 100644 index 00000000000000..283d232e0cebdc --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/Import_export_tests/import_export_act_conv_act.cpp @@ -0,0 +1,178 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "ngraph_functions/builders.hpp" + +typedef std::tuple< + std::vector, // Input shape + InferenceEngine::Precision, // Network Precision + std::string, // Target Device + std::map, // Export Configuration + std::map // Import Configuration +> exportImportNetworkParams; + +namespace LayerTestsDefinitions { + +class ImportActConvActTest : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + std::vector inputShape; + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map exportConfiguration; + std::map importConfiguration; + std::tie(inputShape, netPrecision, targetDevice, exportConfiguration, importConfiguration) = obj.param; + + std::ostringstream result; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + for (auto const &configItem : exportConfiguration) { + result << "_exportConfigItem=" << configItem.first << "_" << configItem.second; + } + for (auto const &configItem : importConfiguration) { + result << "_importConfigItem=" << configItem.first << "_" << configItem.second; + } + result << CommonTestUtils::vec2str(inputShape); + return result.str(); + } + + void Run() override { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + configuration.insert(exportConfiguration.begin(), exportConfiguration.end()); + LoadNetwork(); + GenerateInputs(); + Infer(); + + executableNetwork.Export("exported_model.blob"); + for (auto const &configItem : importConfiguration) { + configuration[configItem.first] = configItem.second; + } + std::fstream inputStream("exported_model.blob", std::ios_base::in | std::ios_base::binary); + if (inputStream.fail()) { + FAIL() << "Cannot open file to import model: exported_model.blob"; + } + + auto importedNetwork = core->ImportNetwork(inputStream, targetDevice, configuration); + + // Generate inputs + std::vector inputs; + auto inputsInfo = importedNetwork.GetInputsInfo(); + auto functionParams = function->get_parameters(); + for (int i = 0; i < functionParams.size(); ++i) { + 
const auto& param = functionParams[i]; + const auto infoIt = inputsInfo.find(param->get_friendly_name()); + GTEST_ASSERT_NE(infoIt, inputsInfo.cend()); + + const auto& info = infoIt->second; + auto blob = GenerateInput(*info); + inputs.push_back(blob); + } + + // Infer imported network + InferenceEngine::InferRequest importInfer = importedNetwork.CreateInferRequest(); + inputsInfo = importedNetwork.GetInputsInfo(); + functionParams = function->get_parameters(); + for (int i = 0; i < functionParams.size(); ++i) { + const auto& param = functionParams[i]; + const auto infoIt = inputsInfo.find(param->get_friendly_name()); + GTEST_ASSERT_NE(infoIt, inputsInfo.cend()); + + const auto& info = infoIt->second; + auto blob = inputs[i]; + importInfer.SetBlob(info->name(), blob); + } + importInfer.Infer(); + + // Validate + auto expectedOutputs = CalculateRefs(); + auto actualOutputs = std::vector{}; + for (const auto &output : importedNetwork.GetOutputsInfo()) { + const auto &name = output.first; + actualOutputs.push_back(importInfer.GetBlob(name)); + } + IE_ASSERT(actualOutputs.size() == expectedOutputs.size()) + << "nGraph interpreter has " << expectedOutputs.size() << " outputs, while IE " << actualOutputs.size(); + Compare(expectedOutputs, actualOutputs); + } + +protected: + void SetUp() override { + std::vector inputShape; + InferenceEngine::Precision netPrecision; + std::tie(inputShape, netPrecision, targetDevice, exportConfiguration, importConfiguration) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + auto relu1 = std::make_shared(params[0]); + + size_t num_out_channels = 8; + size_t kernel_size = 8; + std::vector filter_weights = CommonTestUtils::generate_float_numbers(num_out_channels * inputShape[1] * kernel_size, + -0.2f, 0.2f); + auto conv = ngraph::builder::makeConvolution(relu1, ngPrc, { 1, kernel_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + ngraph::op::PadType::VALID, num_out_channels, true, filter_weights); + + auto relu2 = std::make_shared(conv); + ngraph::ResultVector results{std::make_shared(relu2)}; + function = std::make_shared(results, params, "ExportImportNetwork"); + } + +private: + std::map exportConfiguration; + std::map importConfiguration; +}; + +TEST_P(ImportActConvActTest, CompareWithRefImpl) { + Run(); +}; + +const std::vector> inputShape = { + {1, 1, 1, 240}, + {1, 1, 1, 160}, + {1, 2, 1, 80} +}; + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector> exportConfigs = { + { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"} + } +}; + +const std::vector> importConfigs = { + { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"} + } +}; + +INSTANTIATE_TEST_CASE_P(smoke_ImportActConvAct, ImportActConvActTest, + ::testing::Combine( + ::testing::ValuesIn(inputShape), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(exportConfigs), + ::testing::ValuesIn(importConfigs)), + ImportActConvActTest::getTestCaseName); + +} // namespace LayerTestsDefinitions + diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp index 84d63fd0882e00..947d27a9b336ae 100644 --- a/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp +++ 
b/inference-engine/tests/functional/plugin/gna/pass_tests/remove_permutations_NHWC_to_NCHW_pass.cpp @@ -74,15 +74,17 @@ class RemovePermutationsNHWCToNCHWPassTest : public testing::WithParamInterface< ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 3, 1, 2 })); size_t num_out_channels = 12; - size_t kernal_size = 8; - auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, { 1, kernal_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, - ngraph::op::PadType::VALID, num_out_channels); + size_t kernel_size = 8; + std::vector kernal_shape = (inputShape[1] == 1 ? std::vector{1, kernel_size} : std::vector{kernel_size, 1}); + auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + ngraph::op::PadType::VALID, num_out_channels); auto permute2 = std::make_shared(conv1, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); - size_t out_width = (inputShape[2] - kernal_size) + 1; - std::vector outFormShapes = { 1, out_width * num_out_channels }; + size_t out_width = (inputShape[2] - kernal_shape[1]) + 1; + size_t out_height = (inputShape[1] - kernal_shape[0]) + 1; + std::vector outFormShapes = { 1, out_width * out_height * num_out_channels }; auto pattern2 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes); auto reshape2 = std::make_shared(permute2, pattern2, false); @@ -122,7 +124,9 @@ class RemovePermutationsNHWCToNCHWPass4DOutputTest : public testing::WithParamIn auto permute1 = std::make_shared(params[0], ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 3, 1, 2 })); - auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, { 1, 8 }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, 12); + size_t kernal_size = 8; + std::vector kernal_shape = (inputShape[1] == 1 ? std::vector{1, kernal_size} : std::vector{kernal_size, 1}); + auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, 12); auto permute2 = std::make_shared(conv1, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); @@ -200,20 +204,23 @@ class RemovePermutationsWithPoolAndActTest : public testing::WithParamInterface< size_t num_out_channels = 12; size_t kernal_size = 8; + auto kernal_shape = (inputShape[1] == 1 ? std::vector{1, kernal_size} : std::vector{kernal_size, 1}); std::vector filter_weights = CommonTestUtils::generate_float_numbers(num_out_channels * inputShape[3] * kernal_size, -0.2f, 0.2f); - auto conv1 = ngraph::builder::makeConvolution(relu1, ngPrc, { 1, kernal_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + auto conv1 = ngraph::builder::makeConvolution(relu1, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, num_out_channels, false, filter_weights); - auto pool = ngraph::builder::makePooling(conv1, {1, 2}, {0, 0}, {0, 0}, {1, 2}, ngraph::op::RoundingType::FLOOR, + auto pool_kernal_shape = (inputShape[1] == 1 ? 
std::vector{1, 2} : std::vector{2, 1}); + auto pool = ngraph::builder::makePooling(conv1, pool_kernal_shape, {0, 0}, {0, 0}, pool_kernal_shape, ngraph::op::RoundingType::FLOOR, ngraph::op::PadType::VALID, false, ngraph::helpers::PoolingTypes::MAX); - size_t out_width = ((inputShape[2] - kernal_size) + 1) / 2; + size_t out_width = ((inputShape[2] - kernal_shape[1]) + 1) / pool_kernal_shape[1]; + size_t out_height = ((inputShape[1] - kernal_shape[0]) + 1) / pool_kernal_shape[0]; auto relu2 = std::make_shared(pool); auto permute2 = std::make_shared(relu2, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); - std::vector outFormShapes = { 1, out_width * num_out_channels }; + std::vector outFormShapes = { 1, out_width * out_height * num_out_channels }; auto pattern2 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes); auto reshape2 = std::make_shared(permute2, pattern2, false); @@ -283,22 +290,25 @@ class RemovePermutationsWithTwoConvTest : public testing::WithParamInterface kernal_shape = (inputShape[1] == 1 ? std::vector{1, kernal_size} : std::vector{kernal_size, 1}); std::vector filter_weights_1 = CommonTestUtils::generate_float_numbers(num_out_channels * inputShape[3] * kernal_size, 0.0f, 0.5f); - auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, { 1, kernal_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, num_out_channels, false, filter_weights_1); - size_t out_width = ((inputShape[2] - kernal_size) + 1); + size_t out_width = ((inputShape[2] - kernal_shape[1]) + 1); + size_t out_height = ((inputShape[1] - kernal_shape[0]) + 1); std::vector filter_weights_2 = CommonTestUtils::generate_float_numbers(num_out_channels * num_out_channels * kernal_size, -0.2f, 0.2f); - auto conv2 = ngraph::builder::makeConvolution(conv1, ngPrc, { 1, kernal_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + auto conv2 = ngraph::builder::makeConvolution(conv1, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, num_out_channels, false, filter_weights_2); - out_width = ((out_width - kernal_size) + 1); + out_width = ((out_width - kernal_shape[1]) + 1); + out_height = ((out_height - kernal_shape[0]) + 1); auto permute2 = std::make_shared(conv2, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); - std::vector outFormShapes = { 1, out_width * num_out_channels }; + std::vector outFormShapes = { 1, out_width * out_height * num_out_channels }; auto pattern2 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes); auto reshape2 = std::make_shared(permute2, pattern2, false); @@ -363,6 +373,7 @@ class RemovePermutationsWithEltwiseTest : public testing::WithParamInterface(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, inputShape); @@ -372,9 +383,10 @@ class RemovePermutationsWithEltwiseTest : public testing::WithParamInterface kernal_shape = (inputShape[1] == 1 ? 
std::vector{1, kernal_size} : std::vector{kernal_size, 1}); std::vector filter_weights_1 = CommonTestUtils::generate_float_numbers(num_out_channels * in_channels * kernal_size, -0.2f, 0.2f); - auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, { 1, kernal_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + auto conv1 = ngraph::builder::makeConvolution(permute1, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, num_out_channels, false, filter_weights_1); auto pattern2 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, inputShape); @@ -384,7 +396,7 @@ class RemovePermutationsWithEltwiseTest : public testing::WithParamInterface filter_weights_2 = CommonTestUtils::generate_float_numbers(num_out_channels * in_channels * kernal_size, -0.2f, 0.2f); - auto conv2 = ngraph::builder::makeConvolution(permute2, ngPrc, { 1, kernal_size }, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, + auto conv2 = ngraph::builder::makeConvolution(permute2, ngPrc, kernal_shape, { 1, 1 }, { 0, 0 }, { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, num_out_channels, false, filter_weights_2); auto add = std::make_shared(conv1, conv2); @@ -392,8 +404,9 @@ class RemovePermutationsWithEltwiseTest : public testing::WithParamInterface(add, ngraph::opset1::Constant::create(ngraph::element::i64, ngraph::Shape{ 4 }, { 0, 2, 3, 1 })); - size_t out_width = ((in_width - kernal_size) + 1); - std::vector outFormShapes = { 1, out_width * num_out_channels }; + size_t out_width = ((in_width - kernal_shape[1]) + 1); + size_t out_height = ((in_height - kernal_shape[0]) + 1); + std::vector outFormShapes = { 1, out_width * out_height * num_out_channels }; auto pattern3 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes); auto reshape3 = std::make_shared(permute3, pattern3, false); @@ -440,7 +453,13 @@ class RemovePermutationsWithEltwiseTest : public testing::WithParamInterface, std::vector>> basic = { {{1, 50}, {{}}}, {{1, 128}, {{}}}, {{1, 10 * 1024}, {{}}}, - {{64, 1}, {{}}}, {{8, 128}, {{}}}, - {{16, 128}, {{}}}, - {{18, 128}, {{}}}, - {{32, 512}, {{}}}, {{1, 4, 2, 256}, {{}}}, {{4, 4, 4, 4}, {{}}}, {{1, 16, 1, 128}, {{}}}, diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp index 8f99a0d6778f34..307b6a1271b2d8 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp @@ -22,7 +22,7 @@ class GnaConvolutionLayerTest : public ConvolutionLayerTest, GnaLayerTestCheck { } } - void SetUp() { + void SetUp() override { ConvolutionLayerTest::SetUp(); } }; diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp index 47852fb59c858d..989ee090d873fa 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp @@ -57,5 +57,13 @@ std::vector disabledTestPatterns() { R"(.*ConvolutionLayerTest.CompareWithRefs.*D=\(3.1\).*)", R"(.*ConstantResultSubgraphTest.*IS=\(2\.3\.4\.5\).*)", R"(.*ConstantResultSubgraphTest.*inPrc=(U8|I8|I32|U64|I64|BOOL).*)", + // TODO: Issue 
51528 + R"(.*CachingSupport.*_(u8|i16)_.*)", + // TODO: Issue 51527 + R"(.*CachingSupport.*_batch2_.*)", + // TODO: Issue 51526 + R"(.*CachingSupport.*ConvPoolRelu.*)", + // TODO: Issue 51525 + R"(.*CachingSupport.*KSOFunction.*)", }; } diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/const_conv_concat.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/const_conv_concat.cpp new file mode 100644 index 00000000000000..48224037b2ae82 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/const_conv_concat.cpp @@ -0,0 +1,56 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/test_constants.hpp" +#include "subgraph_tests/const_conv_concat.hpp" + +using namespace SubgraphTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector> configs = { + { + {"GNA_DEVICE_MODE", "GNA_SW_FP32"} + }, + { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"} + } +}; + +std::vector params = { + std::make_tuple( + std::vector{1, 64}, //InputShape + std::vector{1, 3}, //KernelShape + 1), //Stride + std::make_tuple(std::vector{1, 128}, std::vector{1, 5}, 1), + std::make_tuple(std::vector{1, 168}, std::vector{1, 3}, 2), + std::make_tuple(std::vector{1, 320}, std::vector{1, 8}, 4) +}; + +std::vector inputChannels = { + 1, + 4, + 8 +}; + +std::vector outputChannels = { + 64 +}; + +INSTANTIATE_TEST_CASE_P(smoke_ConstConvConcatTest, ConstConvConcatTest, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs), + ::testing::ValuesIn(params), + ::testing::ValuesIn(inputChannels), + ::testing::ValuesIn(outputChannels)), + ConstConvConcatTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp index e8b6761694408a..5c0b1afd882207 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp @@ -24,7 +24,7 @@ class GnaConvolutionReluSequenceTest : public ConvolutionReluSequenceTest, GnaLa } } - void SetUp() { + void SetUp() override { ConvolutionReluSequenceTest::SetUp(); } }; diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/eltwise_reshape_activation.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/eltwise_reshape_activation.cpp new file mode 100644 index 00000000000000..332da49fb38b41 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/eltwise_reshape_activation.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "subgraph_tests/eltwise_reshape_activation.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace SubgraphTestsDefinitions; +namespace { +const std::vector>> shapes = { + {{1, 64}, {64, 1}}, + {{8, 256}, {16, 128}}, + {{6, 384}, {18, 128}}, + {{8, 2048}, {32, 512}} +}; + +const std::vector netPrecisions = 
{ + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +std::vector> additional_config = { + { + {"GNA_DEVICE_MODE", "GNA_SW_FP32"} + }, + { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"} + } +}; + +INSTANTIATE_TEST_CASE_P(smoke_EltwiseReshapeActivationTest, EltwiseReshapeActivation, + ::testing::Combine( + ::testing::ValuesIn(shapes), + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(additional_config)), + EltwiseReshapeActivation::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/add_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/add_transformation.cpp index 56964d5628838d..8a33dacde43fea 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/add_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/add_transformation.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine::details; namespace { const std::vector netPrecisions = { ngraph::element::f32, - //ngraph::element::f16 + ngraph::element::f16 }; const std::vector params = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/clamp_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/clamp_transformation.cpp index bad59746178beb..078146f1819c10 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/clamp_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/clamp_transformation.cpp @@ -14,7 +14,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp index 5ecb4f4d773cd1..57cca3618123ae 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_transformation.cpp @@ -31,7 +31,12 @@ const std::vector testValues = { { { 256ul, ngraph::Shape({}), {0.f}, {2.55f}, {0.f}, {2.55f} }, { 256ul, ngraph::Shape({}), {-1.28f}, {1.27f}, {-1.28f}, {1.27f} } - } + }, + // FQ with unexpected quantizationLevels + { + { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} }, + { 16ul, ngraph::Shape({}), {0.f}, {15.f}, {0.f}, {1.5f} } + }, }; INSTANTIATE_TEST_CASE_P(smoke_LPT, ConcatTransformation, diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp index c2e16a810cd68d..cf789286f7f97f 100644 --- 
a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/concat_with_split_transformation.cpp @@ -45,8 +45,7 @@ const std::vector testValues = { } }; -// TODO: Split/VariadicSplit operations are not supported in ConcatTransformation -INSTANTIATE_TEST_CASE_P(DISABLED_smoke_LPT, ConcatWithSplitTransformation, +INSTANTIATE_TEST_CASE_P(smoke_LPT, ConcatWithSplitTransformation, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(ngraph::Shape({ 1, 6, 10, 10 })), diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp index 937aea184aaa89..43928a7ec1caee 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_qdq_transformation.cpp @@ -13,7 +13,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { @@ -72,7 +72,7 @@ const std::vector para { {-128.f}, ngraph::element::f32, {}, false, 1ul, ngraph::element::i8, true }, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "U8" }, @@ -126,7 +126,7 @@ const std::vector para {}, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "U8" }, @@ -177,7 +177,7 @@ const std::vector para { {128.f}, ngraph::element::f32, {}, false, 1ul, ngraph::element::i8, true }, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "U8" }, @@ -228,7 +228,7 @@ const std::vector para {}, { {0.2f}, ngraph::element::f32, {}, false } }, - "output_original", + "Convolution", "U8" }, }; diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp index 422641666fc0ad..ce0db6e1e6681d 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/convolution_transformation.cpp @@ -13,7 +13,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { @@ -27,30 +27,32 @@ const std::vector params false, {}, false, - "output", - "" + "Convolution", + "FP32" }, { {}, false, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output", - "" + "Convolution", + "FP32" }, { { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 25.5f } }, false, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, false, - "output_original", + "Convolution", "U8" }, { { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f 
}, { 255.f }, { -12.75f }, { 6.375f } }, true, { 255ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 254.f }, { -12.7f }, { 12.7f } }, - false + false, + "Convolution", + "U8" } }; diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/depth_to_space_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/depth_to_space_transformation.cpp index 4d51965d270c6a..2075bfe18d76d2 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/depth_to_space_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/depth_to_space_transformation.cpp @@ -14,7 +14,7 @@ using namespace ngraph::opset1; namespace { const std::vector precisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector modes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_avg_pool_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_avg_pool_transformation.cpp index 052f99a6bfb6af..45ca8e01a7d7c9 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_avg_pool_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_avg_pool_transformation.cpp @@ -13,7 +13,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - ngraph::element::f32 + ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_max_pool_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_max_pool_transformation.cpp index c270e701845ca8..56e2ecd29a9ac3 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_max_pool_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_max_pool_transformation.cpp @@ -13,7 +13,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - ngraph::element::f32 + ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_two_output_branches_with_convolution.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_two_output_branches_with_convolution.cpp index 7091a66e0d3143..dc48bcb9941561 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_two_output_branches_with_convolution.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_and_two_output_branches_with_convolution.cpp @@ -13,7 +13,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // 
ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_precision_selection_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_precision_selection_transformation.cpp index bc8960524670eb..f70b1cdde5fdad 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_precision_selection_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_precision_selection_transformation.cpp @@ -14,7 +14,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_transformation.cpp index 434e150d35e839..1a00abb7f037dd 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_transformation.cpp @@ -14,7 +14,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp index 90facce307b23e..963b6e6f28a7a3 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp @@ -12,8 +12,9 @@ using namespace LayerTestsDefinitions; using namespace ngraph::pass::low_precision; namespace { -const std::vector netPrecisions = { - InferenceEngine::Precision::FP32 +const std::vector netPrecisions = { + ngraph::element::f32, + ngraph::element::f16, }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp index 5ba8057fc3c2d1..db3665f6aaf317 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fully_connected_transformation.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine::details; namespace { const std::vector netPrecisions = { 
ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector shapes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_convert_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_convert_transformation.cpp index 67bdef0c6a8611..01013dd414758b 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_convert_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_convert_transformation.cpp @@ -9,7 +9,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - element::f32 + element::f32, + // element::f16 // TODO: temporarily commented due to failing in GPU Plugin on constant folding stage }; const std::vector< ngraph::Shape > inputAndQuantizationShapes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_fake_quantize_and_scale_shift_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_fake_quantize_and_scale_shift_transformation.cpp index 042ff2692fbdda..175141ef885d45 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_fake_quantize_and_scale_shift_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/fuse_fake_quantize_and_scale_shift_transformation.cpp @@ -14,6 +14,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/gemm_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/gemm_transformation.cpp index 50bdfdd913db11..ca0f60b35f4e91 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/gemm_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/gemm_transformation.cpp @@ -13,6 +13,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; const std::vector dimensions = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/group_convolution_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/group_convolution_transformation.cpp index a26dd86b404c6e..8c346ba667e75d 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/group_convolution_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/group_convolution_transformation.cpp @@ -12,7 +12,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git 
a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/interpolate_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/interpolate_transformation.cpp index 4c56b43458c437..c00f6bf695e56a 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/interpolate_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/interpolate_transformation.cpp @@ -9,7 +9,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - ngraph::element::f32 + ngraph::element::f32, + ngraph::element::f16 }; const std::vector> shapes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_transformation.cpp index f6aeca2e58d731..5afaefb92be6ec 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_transformation.cpp @@ -12,7 +12,7 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; std::vector testValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp index 10d547c83d68ef..bff6d09467bd9a 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_constant_transformation.cpp @@ -10,7 +10,10 @@ using namespace LayerTestsDefinitions; using namespace InferenceEngine::details; namespace { -const std::vector precisions = { ngraph::element::f32 }; +const std::vector precisions = { + ngraph::element::f32, + ngraph::element::f16 +}; //transpose_a = false, transpose_b = true std::vector testValues = { @@ -20,7 +23,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "result_result", + "FullyConnected", "FP32" }, { @@ -29,7 +32,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::i8, ngraph::Shape{ 2, 4 } }, {}, { ngraph::element::f32, {}, {0.1f} }, - "result_result", + "FullyConnected", "FP32" }, { @@ -38,7 +41,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "result_result", + "FullyConnected", "FP32" }, { @@ -47,7 +50,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::f32, ngraph::Shape{ 2, 4 } }, { 256ul, {{1}, {1}, {2, 1}, {2, 1}}, {-128.f}, {127.f}, {-128.f, -12.8f}, {127.f, 12.7f} }, { {}, {}, {} }, - "matMul", + "FullyConnected", "U8" }, { @@ -56,7 
+59,7 @@ std::vector testValues = { { std::vector(4 * 2, 2.f), ngraph::element::i8, ngraph::Shape{ 2, 4 } }, {}, { ngraph::element::f32, {}, {{0.1f, 0.01}, ngraph::element::f32, ngraph::Shape{ 2, 1 }} }, - "matMul", + "FullyConnected", "U8" }, { @@ -65,7 +68,7 @@ std::vector testValues = { { std::vector(4 * 4, 2.f), ngraph::element::f32, ngraph::Shape{ 4, 4 } }, { 256ul, {{1}, {1}, {1}, {1}}, {-128.f}, {127.f}, {-128.f}, {127.f} }, { {}, {}, {} }, - "result_result", + "FullyConnected", "FP32" }, { @@ -74,7 +77,7 @@ std::vector testValues = { { std::vector{1, 2, 3, 4, 5, 6}, ngraph::element::f32, ngraph::Shape{ 2, 3 } }, { 256ul, {{1}, {1}, {1}, {1}}, {-128.f}, {127.f}, {-12.8f}, {12.7f} }, { {}, {}, {} }, - "matMul", + "FullyConnected", "U8" }, { @@ -83,6 +86,8 @@ std::vector testValues = { { std::vector{1, 2, 3, 4, 5, 6}, ngraph::element::i8, ngraph::Shape{ 2, 3 } }, {}, { ngraph::element::f32, {}, {0.1f} }, + "FullyConnected", + "U8" } }; diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp index 0425979aa65eea..a3d2c4eac12975 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mat_mul_with_optimized_constant_fake_quantize_transformation.cpp @@ -12,6 +12,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; const std::vector params = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_to_group_convolution_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_to_group_convolution_transformation.cpp index 5ac2b3e779be89..2bb1abfb9ed980 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_to_group_convolution_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_to_group_convolution_transformation.cpp @@ -9,7 +9,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - element::f32 + element::f32, + element::f16 }; const std::vector< ngraph::Shape > inputShapes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_transformation.cpp index 74df7d38321987..b107cff023fd83 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_transformation.cpp @@ -12,7 +12,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - //ngraph::element::f16 + ngraph::element::f16 }; const std::vector params = { diff --git 
a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp index 870b9c52130c8e..dba8fb64ad7848 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/multiply_with_one_parent_transformation.cpp @@ -12,7 +12,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector values = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mvn_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mvn_transformation.cpp index 6355c957403ca9..7599e34484d56b 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mvn_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/mvn_transformation.cpp @@ -9,7 +9,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - element::f32 + element::f32, + element::f16 }; const std::vector inputAndQuantizationShapes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/normalize_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/normalize_transformation.cpp index d446ab186628bd..0fa6be4ada6a15 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/normalize_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/normalize_transformation.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { ngraph::element::f32, - //ngraph::element::f16 + ngraph::element::f16 }; const std::vector > inputAndQuantizationShapes = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat.cpp index 487a99a379d74d..1901cc4a219bb8 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat.cpp @@ -4,7 +4,7 @@ #include -#include "low_precision_transformations/output_layers_handling_in_transformations.hpp" +#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat.hpp" #include "common_test_utils/test_constants.hpp" using namespace LayerTestsDefinitions; @@ -19,11 +19,11 @@ const std::vector trasformationParamValues = { LayerTestsUtils::LayerTransformationParamsFactory::createParams() }; 
-INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations, +INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformationsForConcat, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })), ::testing::Values(CommonTestUtils::DEVICE_GPU), ::testing::ValuesIn(trasformationParamValues)), - OutputLayersHandlingInTransformations::getTestCaseName); + OutputLayersHandlingInTransformationsForConcat::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.cpp index 487a99a379d74d..cbf5ccb423c59d 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.cpp @@ -4,7 +4,7 @@ #include -#include "low_precision_transformations/output_layers_handling_in_transformations.hpp" +#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.hpp" #include "common_test_utils/test_constants.hpp" using namespace LayerTestsDefinitions; @@ -19,11 +19,11 @@ const std::vector trasformationParamValues = { LayerTestsUtils::LayerTransformationParamsFactory::createParams() }; -INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations, +INSTANTIATE_TEST_CASE_P(DISABLED_smoke_LPT, OutputLayersHandlingInTransformationsForConcatMultiChannel, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })), ::testing::Values(CommonTestUtils::DEVICE_GPU), ::testing::ValuesIn(trasformationParamValues)), - OutputLayersHandlingInTransformations::getTestCaseName); + OutputLayersHandlingInTransformationsForConcatMultiChannel::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/prelu_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/prelu_transformation.cpp index 06f2939bb31ccf..e65229a9f81d55 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/prelu_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/prelu_transformation.cpp @@ -12,7 +12,8 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { - ngraph::element::f32 + ngraph::element::f32, + ngraph::element::f16 }; std::vector testValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/pull_reshape_through_dequantization_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/pull_reshape_through_dequantization_transformation.cpp index 96e5773babcc1a..aecad435daafe5 100644 --- 
a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/pull_reshape_through_dequantization_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/pull_reshape_through_dequantization_transformation.cpp @@ -12,7 +12,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - ngraph::element::f16 + // ngraph::element::f16 // TODO: enable f16 test inference (change ngraph function + fp32 to fp16 replacements) }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/relu_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/relu_transformation.cpp index 9ea97a8bafa718..219a64f9c005cf 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/relu_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/relu_transformation.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine::details; namespace { const std::vector precisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; std::vector testValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp index 222e24bd553726..46ca94e9cb7e97 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp @@ -11,8 +11,8 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { - ngraph::element::f32 - // ngraph::element::f16 + ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/split_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/split_transformation.cpp index 393e40c15da5ef..9f9ce9f544b8cc 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/split_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/split_transformation.cpp @@ -15,7 +15,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/squeeze_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/squeeze_transformation.cpp index a966d6ba20c821..36512fd4e58258 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/squeeze_transformation.cpp +++ 
b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/squeeze_transformation.cpp @@ -13,6 +13,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/strided_slice_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/strided_slice_transformation.cpp index 0263ac39c79398..bc974c50570f9f 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/strided_slice_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/strided_slice_transformation.cpp @@ -14,7 +14,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/subtract_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/subtract_transformation.cpp index 387657ab28080c..4bdb067f4b9116 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/subtract_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/subtract_transformation.cpp @@ -13,6 +13,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_after_matmul_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_after_matmul_transformation.cpp index 7bf86ffe6cbf57..e3cdb1f93ff173 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_after_matmul_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_after_matmul_transformation.cpp @@ -13,6 +13,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_transformation.cpp index e1735a84f8cad4..e35f3cde45ab43 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/transpose_transformation.cpp @@ -12,7 +12,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector precisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector testValues = { diff --git 
a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/unsqueeze_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/unsqueeze_transformation.cpp index a996c8a655cf5e..9397d7482ebf74 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/unsqueeze_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/unsqueeze_transformation.cpp @@ -13,6 +13,7 @@ using namespace ngraph::pass::low_precision; namespace { const std::vector netPrecisions = { ngraph::element::f32, + ngraph::element::f16 }; diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/variadic_split_transformation.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/variadic_split_transformation.cpp index 5e7e3919c59347..6b83a4a0bf426d 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/variadic_split_transformation.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/variadic_split_transformation.cpp @@ -15,7 +15,7 @@ using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { ngraph::element::f32, - // ngraph::element::f16 + ngraph::element::f16 }; const std::vector trasformationParamValues = { diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_nd.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_nd.cpp new file mode 100644 index 00000000000000..be599acaef41ca --- /dev/null +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather_nd.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "single_layer_tests/gather_nd.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; +using namespace ngraph::opset5; + +namespace { + +const std::vector inputPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16, + InferenceEngine::Precision::I32, +}; + +const std::vector idxPrecisions = { + InferenceEngine::Precision::I32, + InferenceEngine::Precision::I64, +}; + +// set1 +const auto gatherNDArgsSubset1 = ::testing::Combine( + ::testing::ValuesIn(std::vector>( + { {2, 2}, {2, 3, 4} })), // Data shape + ::testing::ValuesIn(std::vector>( + { {2, 1}, {2, 1, 1} })), // Indices shape + ::testing::ValuesIn(std::vector({ 0, 1 })) // Batch dims +); + +INSTANTIATE_TEST_CASE_P(smoke_GatherND_set1, GatherNDLayerTest, + ::testing::Combine( + gatherNDArgsSubset1, + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(idxPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::Values({})), + GatherNDLayerTest::getTestCaseName); + +// set2 +const auto gatherNDArgsSubset2 = ::testing::Combine( + ::testing::ValuesIn(std::vector>( + { {15, 12, 20, 15, 2}, {15, 12, 18, 7, 17} })), // Data shape + ::testing::ValuesIn(std::vector>( + { {15, 12, 2}, {15, 12, 5, 9, 1, 3} })), // Indices shape + ::testing::ValuesIn(std::vector({ 1, 2 })) // Batch dims +); + +INSTANTIATE_TEST_CASE_P(smoke_GatherND_set2, GatherNDLayerTest, + ::testing::Combine( + gatherNDArgsSubset2, + 
::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(idxPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::Values({})), + GatherNDLayerTest::getTestCaseName); + +// set3 +const auto gatherNDArgsSubset3 = ::testing::Combine( + ::testing::ValuesIn(std::vector>( + { {4, 3, 2, 5, 5, 2}, {4, 3, 2, 5, 7, 2} })), // Data shape + ::testing::ValuesIn(std::vector>( + { {4, 3, 2, 5, 1}, {4, 3, 2, 5, 6, 2} })), // Indices shape + ::testing::ValuesIn(std::vector({ 3, 4 })) // Batch dims +); + +INSTANTIATE_TEST_CASE_P(smoke_GatherND_set3, GatherNDLayerTest, + ::testing::Combine( + gatherNDArgsSubset3, + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(idxPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::Values({})), + GatherNDLayerTest::getTestCaseName); + +} // namespace diff --git a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/core_integration.cpp b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/core_integration.cpp index 200e62cf37fa23..37cfad33c096e9 100644 --- a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/core_integration.cpp +++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/core_integration.cpp @@ -50,7 +50,7 @@ TEST_P(IEClassNetworkTestP_VPU, smoke_ImportNetworkNoThrowIfNoDeviceName) { if (!strm.str().empty() && deviceName.find(CommonTestUtils::DEVICE_FPGA) != std::string::npos) { SKIP_IF_NOT_IMPLEMENTED(executableNetwork = ie.ImportNetwork(strm)); } - if (nullptr != static_cast(executableNetwork)) { + if (executableNetwork) { ASSERT_NO_THROW(executableNetwork.CreateInferRequest()); } } diff --git a/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp b/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp index d9386bdbf39a46..76dc7c4cf8647b 100644 --- a/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/behavior/core_integration.hpp @@ -485,7 +485,7 @@ TEST_P(IEClassImportExportTestP, smoke_ImportNetworkNoThrowIfNoDeviceName) { if (!strm.str().empty()) { SKIP_IF_NOT_IMPLEMENTED(executableNetwork = ie.ImportNetwork(strm)); } - if (nullptr != static_cast(executableNetwork)) { + if (executableNetwork) { ASSERT_NO_THROW(executableNetwork.CreateInferRequest()); } } @@ -498,7 +498,7 @@ TEST_P(IEClassImportExportTestP, smoke_ImportNetworkNoThrowWithDeviceName) { ASSERT_NO_THROW(executableNetwork = ie.LoadNetwork(actualNetwork, deviceName)); SKIP_IF_NOT_IMPLEMENTED(executableNetwork.Export(strm)); SKIP_IF_NOT_IMPLEMENTED(executableNetwork = ie.ImportNetwork(strm, deviceName)); - if (nullptr != static_cast(executableNetwork)) { + if (executableNetwork) { ASSERT_NO_THROW(executableNetwork.CreateInferRequest()); } } @@ -519,7 +519,7 @@ TEST_P(IEClassImportExportTestP, smoke_ExportUsingFileNameImportFromStreamNoThro } ASSERT_EQ(0, remove(fileName.c_str())); } - if (nullptr != static_cast(executableNetwork)) { + if (executableNetwork) { ASSERT_NO_THROW(executableNetwork.CreateInferRequest()); } } diff --git a/inference-engine/tests/functional/plugin/shared/include/behavior/infer_request.hpp b/inference-engine/tests/functional/plugin/shared/include/behavior/infer_request.hpp index 40106ef228b714..20fceb40d7dd08 100644 --- a/inference-engine/tests/functional/plugin/shared/include/behavior/infer_request.hpp +++ 
b/inference-engine/tests/functional/plugin/shared/include/behavior/infer_request.hpp @@ -455,20 +455,15 @@ TEST_P(InferRequestTests, canRun3SyncRequestsConsistentlyFromThreads) { auto req1 = execNet.CreateInferRequest(); auto req2 = execNet.CreateInferRequest(); auto req3 = execNet.CreateInferRequest(); - InferenceEngine::ResponseDesc response1, response2, response3; - InferenceEngine::StatusCode sts1, sts2, sts3; - std::thread t1([&] { req1.Infer(); sts1 = req1.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); }); - std::thread t2([&] { req2.Infer(); sts2 = req2.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); }); - std::thread t3([&] { req3.Infer(); sts3 = req3.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY); }); - t1.join(); - t2.join(); - t3.join(); + auto f1 = std::async(std::launch::async, [&] { req1.Infer();}); + auto f2 = std::async(std::launch::async, [&] { req2.Infer();}); + auto f3 = std::async(std::launch::async, [&] { req3.Infer();}); - ASSERT_EQ(static_cast(InferenceEngine::StatusCode::OK), sts1) << response1.msg; - ASSERT_EQ(static_cast(InferenceEngine::StatusCode::OK), sts2) << response2.msg; - ASSERT_EQ(static_cast(InferenceEngine::StatusCode::OK), sts3) << response3.msg; + ASSERT_NO_THROW(f1.get()); + ASSERT_NO_THROW(f2.get()); + ASSERT_NO_THROW(f3.get()); } TEST_P(InferRequestTests, canRun3AsyncRequestsConsistentlyWithWait) { diff --git a/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.hpp b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.hpp index 25d56620a52c73..7978fc26da6765 100644 --- a/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.hpp @@ -49,8 +49,8 @@ inline std::ostream& operator<<(std::ostream& out, const FakeQuantizeWithNotOpti // ngraph::builder::subgraph::FakeQuantizeOnData typedef std::tuple< - InferenceEngine::Precision, - InferenceEngine::SizeVector, + ngraph::element::Type, + ngraph::Shape, std::string, ngraph::pass::low_precision::LayerTransformation::Params, FakeQuantizeWithNotOptimalTransformationTestValues> FakeQuantizeTransformationParams; diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/clamp.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/clamp.hpp new file mode 100644 index 00000000000000..dcbdb43eea097d --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/clamp.hpp @@ -0,0 +1,14 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/single_layer/clamp.hpp" + +namespace LayerTestsDefinitions { + +TEST_P(ClampLayerTest, CompareWithRefs) { + Run(); +} +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/const_conv_concat.hpp b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/const_conv_concat.hpp new file mode 100644 index 00000000000000..676db779faf511 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/const_conv_concat.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2021 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/subgraph/const_conv_concat.hpp" + +namespace SubgraphTestsDefinitions { + +TEST_P(ConstConvConcatTest, CompareWithRefImpl) { + LoadNetwork(); + GenerateInputs(); + Infer(); + // Create another copy of function for validation since some data will be changed by GNA plugin + SetUp(); + Validate(); +}; +} // namespace SubgraphTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/eltwise_reshape_activation.hpp b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/eltwise_reshape_activation.hpp new file mode 100644 index 00000000000000..17232b3efad217 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/subgraph_tests/eltwise_reshape_activation.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/subgraph/eltwise_reshape_activation.hpp" + +namespace SubgraphTestsDefinitions { + +TEST_P(EltwiseReshapeActivation, CompareWithRefs) { + Run(); +} + +} // namespace SubgraphTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/concat_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/concat_transformation.cpp index f6dc2452a22218..0b4d3bfb1d9cb3 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/concat_transformation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/concat_transformation.cpp @@ -72,9 +72,15 @@ void ConcatTransformation::validate() { const auto transformed = transformNGraph(params, getLowPrecisionTransformationsNGraph(params)); const auto output = transformed->get_output_op(0); - const auto scaleShift = output->get_input_node_shared_ptr(0); - const std::string typeName = scaleShift->get_type_name(); - ASSERT_EQ("ScaleShiftIE", typeName); + const auto previousLayer = output->get_input_node_shared_ptr(0); + const std::string typeName = previousLayer->get_type_name(); + + if (testValues.fqOnData1.quantizationLevel != 256ul || + testValues.fqOnData2.quantizationLevel != 256ul) { + ASSERT_EQ("Concat", typeName); + } else { + ASSERT_EQ("ScaleShiftIE", typeName); + } } TEST_P(ConcatTransformation, CompareWithRefImpl) { diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_qdq_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_qdq_transformation.cpp index 3f7d9e0ea31577..a55cc05d2846c4 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_qdq_transformation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_qdq_transformation.cpp @@ -59,7 +59,7 @@ void ConvolutionQDqTransformation::Run() { LayerTestsCommon::Run(); const auto params = std::get<4>(GetParam()); - const auto actualType = getRuntimePrecision(params.layerName); + const auto actualType = getRuntimePrecisionByType(params.layerName); EXPECT_EQ(actualType, params.expectedKernelType); } diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_transformation.cpp 
index b2dcd7205c5963..f6e0a544fde271 100755 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_transformation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/convolution_transformation.cpp @@ -58,8 +58,12 @@ void ConvolutionTransformation::Run() { LayerTestsCommon::Run(); const auto params = std::get<4>(GetParam()); - const auto actualType = getRuntimePrecision(params.layerName); - EXPECT_EQ(actualType, params.expectedKernelType); + const auto actualPrecision = getRuntimePrecisionByType(params.layerName); + auto expectedPrecision = params.expectedKernelType; + if (expectedPrecision == "FP32" && std::get<0>(GetParam()) == ngraph::element::f16) { + expectedPrecision = "FP16"; + } + EXPECT_EQ(actualPrecision, expectedPrecision); } void ConvolutionTransformation::validate() { diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp index 49823d5335ce0c..9667650a641ca6 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/fake_quantize_with_dq_not_optimal_transformation.cpp @@ -16,8 +16,8 @@ namespace LayerTestsDefinitions { std::string FakeQuantizeWithNotOptimalTransformation::getTestCaseName(testing::TestParamInfo obj) { - InferenceEngine::Precision netPrecision; - InferenceEngine::SizeVector inputShapes; + ngraph::element::Type netPrecision; + ngraph::Shape inputShapes; std::string targetDevice; ngraph::pass::low_precision::LayerTransformation::Params params; FakeQuantizeWithNotOptimalTransformationTestValues testValues; @@ -29,14 +29,14 @@ std::string FakeQuantizeWithNotOptimalTransformation::getTestCaseName(testing::T } void FakeQuantizeWithNotOptimalTransformation::SetUp() { - InferenceEngine::SizeVector inputShape; - InferenceEngine::Precision netPrecision; + ngraph::Shape inputShape; + ngraph::element::Type netPrecision; ngraph::pass::low_precision::LayerTransformation::Params params; FakeQuantizeWithNotOptimalTransformationTestValues testValues; std::tie(netPrecision, inputShape, targetDevice, params, testValues) = this->GetParam(); function = ngraph::builder::subgraph::FakeQuantizeAndConvolutionFunction::get( - FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision), + netPrecision, inputShape, testValues.fqOnData, testValues.convertOnData, @@ -52,7 +52,7 @@ void FakeQuantizeWithNotOptimalTransformation::Run() { LayerTestsCommon::Run(); const auto params = std::get<4>(GetParam()); - const auto actualType = getRuntimePrecision("output_original"); + const auto actualType = getRuntimePrecisionByType("Convolution"); EXPECT_EQ(actualType, params.expectedPrecision); } diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_constant_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_constant_transformation.cpp index ce3382a64dd84b..50f7c4b324130c 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_constant_transformation.cpp +++ 
b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/mat_mul_with_constant_transformation.cpp @@ -96,9 +96,12 @@ void MatMulWithConstantTransformation::Run() { LayerTestsCommon::Run(); const auto params = std::get<2>(GetParam()); - const auto actualType = getRuntimePrecision(params.layerName); - - EXPECT_EQ(actualType, params.expectedKernelType); + const auto actualPrecision = getRuntimePrecisionByType(params.layerName); + auto expectedPrecision = params.expectedKernelType; + if (expectedPrecision == "FP32" && std::get<0>(GetParam()) == ngraph::element::f16) { + expectedPrecision = "FP16"; + } + EXPECT_EQ(actualPrecision, expectedPrecision); } TEST_P(MatMulWithConstantTransformation, CompareWithRefImpl) { diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_transformation.cpp index 98528441a61d80..b2e9e9bdf597d8 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_transformation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/multiply_transformation.cpp @@ -83,10 +83,10 @@ void MultiplyTransformation::validate() { const auto mul = output->get_input_node_shared_ptr(0); const std::string typeName = mul->get_type_name(); ASSERT_EQ("Eltwise", typeName); - + const bool notTransformed = param.expectedPrecisions[0] == param.expectedPrecisions[1]; for (size_t i = 0; i < param.expectedPrecisions.size(); ++i) { const auto curPrecision = mul->get_input_element_type(i); - const auto expectedPrecision = param.expectedPrecisions[i]; + const auto expectedPrecision = notTransformed ? precision : param.expectedPrecisions[i]; ASSERT_EQ(curPrecision, expectedPrecision); } } diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/normalize_transformation.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/normalize_transformation.cpp index 8a894684a85265..0dfc98a8a82048 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/normalize_transformation.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/normalize_transformation.cpp @@ -82,7 +82,7 @@ void NormalizeL2Transformation::validate() { ASSERT_EQ("NormalizeIE", typeName); const auto inputPrecision = normalize->get_input_element_type(0); - const auto expectedPrecision = shift ? ngraph::element::f32 : ngraph::element::u8; + const auto expectedPrecision = shift ? 
precision : ngraph::element::u8; ASSERT_EQ(inputPrecision, expectedPrecision); } diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp index a6e20f7e1fe5af..c6d050de9047b0 100644 --- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp @@ -79,6 +79,7 @@ class LayerTestsCommon : public CommonTestUtils::TestsCommon { std::map& GetConfiguration(); std::string getRuntimePrecision(const std::string& layerName); + std::string getRuntimePrecisionByType(const std::string& layerType); template static void Compare(const T *expected, const T *actual, std::size_t size, T threshold) { diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/activation.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/activation.hpp index 3447a91664970f..bdf996cd141065 100644 --- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/activation.hpp +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/activation.hpp @@ -36,7 +36,6 @@ static std::map activationNames = {ngraph::helpers::ActivationTypes::Log, "Log"}, {ngraph::helpers::ActivationTypes::Sign, "Sign"}, {ngraph::helpers::ActivationTypes::Abs, "Abs"}, - {ngraph::helpers::ActivationTypes::Gelu, "Gelu"}, {ngraph::helpers::ActivationTypes::Clamp, "Clamp"}, {ngraph::helpers::ActivationTypes::Negative, "Negative"}, {ngraph::helpers::ActivationTypes::Acos, "Acos"}, @@ -70,7 +69,9 @@ static std::map activationNames = {ngraph::helpers::ActivationTypes::Swish, "Swish"}, {ngraph::helpers::ActivationTypes::HSigmoid, "HSigmoid"}, {ngraph::helpers::ActivationTypes::RoundHalfToEven, "RoundHalfToEven"}, - {ngraph::helpers::ActivationTypes::RoundHalfAwayFromZero, "RoundHalfAwayFromZero"} + {ngraph::helpers::ActivationTypes::RoundHalfAwayFromZero, "RoundHalfAwayFromZero"}, + {ngraph::helpers::ActivationTypes::GeluErf, "GeluErf"}, + {ngraph::helpers::ActivationTypes::GeluTanh, "GeluTanh"}, }; typedef std::tuple< diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/clamp.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/clamp.hpp new file mode 100644 index 00000000000000..bcf34636513df6 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/clamp.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +#include "shared_test_classes/base/layer_test_utils.hpp" + +namespace LayerTestsDefinitions { + +using clampParamsTuple = std::tuple< + InferenceEngine::SizeVector, // Input shape + std::pair, // Interval [min, max] + InferenceEngine::Precision, // Net precision + std::string>; // Device name + +class ClampLayerTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() 
override; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/reduce_ops.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/reduce_ops.hpp index 050a6528a7277c..ad8b12deb8599e 100644 --- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/reduce_ops.hpp +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/reduce_ops.hpp @@ -32,6 +32,7 @@ class ReduceOpsLayerTest : public testing::WithParamInterface, virtual public LayerTestsUtils::LayerTestsCommon { public: static std::string getTestCaseName(testing::TestParamInfo obj); + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override; protected: void SetUp() override; diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/const_conv_concat.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/const_conv_concat.hpp new file mode 100644 index 00000000000000..6c5984f2cba6f5 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/const_conv_concat.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" + +namespace SubgraphTestsDefinitions { + +typedef std::tuple< + std::vector, // Input Shapes + std::vector, // Kernel Shape + size_t // Stride +> convParams; + +typedef std::tuple< + InferenceEngine::Precision, // Network Precision + std::string, // Target Device + std::map, // Configuration + convParams, // Convolution Params + size_t, // Input Channels + size_t // Output Channels +> ConstConvConcatParams; + +class ConstConvConcatTest : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override; + +protected: + void SetUp() override; +}; + +} // namespace SubgraphTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/eltwise_reshape_activation.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/eltwise_reshape_activation.hpp new file mode 100644 index 00000000000000..5a20bfe40f0fd0 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/eltwise_reshape_activation.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "shared_test_classes/base/layer_test_utils.hpp" + +namespace SubgraphTestsDefinitions { + +using EltwiseReshapeActivationParams = typename std::tuple< + std::vector>, // input shape and shape after reshape + InferenceEngine::Precision, // precision + std::string, // device name + std::map // configuration +>; + +class EltwiseReshapeActivation : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + +protected: + 
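// --- Editor's note (illustrative sketch, not part of the patch) ---------------------
// ClampLayerTest declared above is a parameterized gtest fixture; concrete suites are
// created elsewhere with INSTANTIATE_TEST_SUITE_P (INSTANTIATE_TEST_CASE_P on older
// gtest). The shape, interval, precision and device below are example values chosen for
// illustration only:
#include <gtest/gtest.h>
#include "single_layer_tests/clamp.hpp"

namespace {
using LayerTestsDefinitions::ClampLayerTest;

INSTANTIATE_TEST_SUITE_P(smoke_Clamp_Example, ClampLayerTest,
    ::testing::Combine(
        ::testing::Values(InferenceEngine::SizeVector{1, 3, 16, 16}),  // input shape
        ::testing::Values(std::make_pair(-1.0f, 1.0f)),                // clamp interval [min, max]
        ::testing::Values(InferenceEngine::Precision::FP32),           // net precision
        ::testing::Values("CPU")),                                     // target device (example)
    ClampLayerTest::getTestCaseName);
}  // namespace
// ------------------------------------------------------------------------------------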
void SetUp() override; +}; + +} // namespace SubgraphTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp b/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp index 3924efc1624b8b..f9d5ff50fe0891 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp @@ -358,6 +358,30 @@ std::string LayerTestsCommon::getRuntimePrecision(const std::string& layerName) return ""; } +std::string LayerTestsCommon::getRuntimePrecisionByType(const std::string& layerType) { + const auto execGraph = executableNetwork.GetExecGraphInfo(); + const auto function = execGraph.getFunction(); + + for (const auto& op : function->get_ops()) { + const auto& rtInfo = op->get_rt_info(); + const auto& typeIt = rtInfo.find("layerType"); + + IE_ASSERT(typeIt != rtInfo.end()) << "Layer is not found for type: " << layerType; + + const auto type = ngraph::as_type_ptr>(typeIt->second)->get(); + if (type == layerType) { + const auto& it = rtInfo.find("runtimePrecision"); + + IE_ASSERT(it != rtInfo.end()) << "Runtime precision is not found for node: " << type; + + const auto rtPrecisionPtr = ngraph::as_type_ptr>(it->second); + return rtPrecisionPtr->get(); + } + } + + return ""; +} + void LayerTestsCommon::SetRefMode(RefMode mode) { refMode = mode; } diff --git a/inference-engine/tests/functional/shared_test_classes/src/read_ir/generate_inputs.cpp b/inference-engine/tests/functional/shared_test_classes/src/read_ir/generate_inputs.cpp index 28cc0ff696ec5f..c4fc81dd7b7b12 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/read_ir/generate_inputs.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/read_ir/generate_inputs.cpp @@ -404,68 +404,6 @@ InferenceEngine::Blob::Ptr generate(const std::shared_ptr FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 4, 2); } -namespace ReduceOps { - InferenceEngine::Blob::Ptr generate(const ngraph::AxisSet& axis_vec, - const InferenceEngine::InputInfo& info) { - IE_ASSERT(axis_vec.size() == 1); - - auto axis = *axis_vec.begin(); - auto td = info.getTensorDesc(); - auto dims = td.getDims(); - - // Slice of tensor through axis is {1, 0, 0, ....}, the mean value is 1/slice_size - auto raw_values = std::vector(dims[axis], 0); - raw_values[0] = 1; - - auto blob = make_blob_with_precision(td); - blob->allocate(); - CommonTestUtils::fill_data_with_broadcast(blob, axis, raw_values); - return blob; - } -} // namespace ReduceOps - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return 
ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, const InferenceEngine::InputInfo& info, size_t port) { @@ -531,18 +469,6 @@ InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - -InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, - const InferenceEngine::InputInfo& info, - size_t port) { - return ReduceOps::generate(node->get_reduction_axes(), info); -} - InferenceEngine::Blob::Ptr generate(const std::shared_ptr node, const InferenceEngine::InputInfo& info, size_t port) { diff --git a/inference-engine/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp b/inference-engine/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp index 6110c6b762904c..3f966b858f1839 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "common_test_utils/file_utils.hpp" #include "functional_test_utils/core_config.hpp" #include "shared_test_classes/read_ir/read_ir.hpp" @@ -14,8 +15,12 @@ std::string ReadIRTest::getTestCaseName(const testing::TestParamInfo 1) { + result << "PRC=" << *std::next(splittedFilename.rbegin()) << "_"; + } + result << "IR_name=" << splittedFilename.back() << "_"; + result << "TargetDevice=" << deviceName; return result.str(); } diff --git a/inference-engine/tests/functional/shared_test_classes/src/single_layer/clamp.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/clamp.cpp new file mode 100644 index 00000000000000..0fae07ccc745e1 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/clamp.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/single_layer/clamp.hpp" + +namespace LayerTestsDefinitions { + +std::string ClampLayerTest::getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::SizeVector inShape; + std::pair interval; + InferenceEngine::Precision netPrc; + std::string targetDevice; + + std::tie(inShape, interval, netPrc, targetDevice) = obj.param; + + std::ostringstream result; + result << "inShape=" << CommonTestUtils::vec2str(inShape) << "_"; + result << "min=" << interval.first << "_"; + result << "max=" << interval.second << "_"; + result << "netPrc=" << netPrc.name() << "_"; + result << "trgDev=" << targetDevice; + return result.str(); +} + +void ClampLayerTest::SetUp() { + InferenceEngine::SizeVector inShape; + std::pair interval; + InferenceEngine::Precision netPrc; + + std::tie(inShape, interval, netPrc, targetDevice) = this->GetParam(); + + auto ngNetPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrc); + auto input = std::make_shared(ngNetPrc, ngraph::Shape(inShape)); + auto clamp = std::make_shared(input, interval.first, interval.second); + function = std::make_shared(std::make_shared(clamp), 
ngraph::ParameterVector{input}); +} + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/src/single_layer/reduce_ops.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/reduce_ops.cpp index 35b04950dbcb17..c410fb0b39d5f4 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/single_layer/reduce_ops.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/reduce_ops.cpp @@ -32,9 +32,6 @@ std::string ReduceOpsLayerTest::getTestCaseName(testing::TestParamInfo(reduce)}; function = std::make_shared(results, params, "Reduce"); } +InferenceEngine::Blob::Ptr ReduceOpsLayerTest::GenerateInput(const InferenceEngine::InputInfo &info) const { + ngraph::helpers::ReductionType reductionType = std::get<3>(GetParam()); + InferenceEngine::Precision netPrecision = std::get<4>(GetParam()); + if (reductionType == ngraph::helpers::ReductionType::LogicalOr || + reductionType == ngraph::helpers::ReductionType::LogicalAnd) { + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 2, 0); + } else if (!netPrecision.is_float()) { + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 5, 0); + } + auto td = info.getTensorDesc(); + auto blob = make_blob_with_precision(td); + blob->allocate(); + CommonTestUtils::fill_data_random_float(blob, 5, 0, 1000); + return blob; +} InferenceEngine::Blob::Ptr ReduceOpsLayerWithSpecificInputTest::GenerateInput(const InferenceEngine::InputInfo &info) const { auto axis_vec = std::get<0>(GetParam()); diff --git a/inference-engine/tests/functional/shared_test_classes/src/subgraph/const_conv_concat.cpp b/inference-engine/tests/functional/shared_test_classes/src/subgraph/const_conv_concat.cpp new file mode 100644 index 00000000000000..06014dfb17a796 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/src/subgraph/const_conv_concat.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/subgraph/const_conv_concat.hpp" +#include "ngraph_functions/builders.hpp" + +namespace SubgraphTestsDefinitions { + +std::string ConstConvConcatTest::getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map configuration; + size_t inputChannels; + size_t outputChannels; + convParams convolutionParams; + std::vector inputShape; + std::vector kernelShape; + size_t stride; + std::tie(netPrecision, targetDevice, configuration, convolutionParams, inputChannels, outputChannels) = obj.param; + std::tie(inputShape, kernelShape, stride) = convolutionParams; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "KS=" << CommonTestUtils::vec2str(kernelShape) << "_"; + result << "S=" << stride << "_"; + result << "IC=" << inputChannels << "_"; + result << "OC=" << outputChannels << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice; + for (auto const& configItem : configuration) { + result << "_configItem=" << configItem.first << "_" << configItem.second; + } + return result.str(); +} + +InferenceEngine::Blob::Ptr ConstConvConcatTest::GenerateInput(const InferenceEngine::InputInfo& info) const { + InferenceEngine::Blob::Ptr blob = make_blob_with_precision(info.getTensorDesc()); + blob->allocate(); + + auto* rawBlobDataPtr = blob->buffer().as(); + std::vector values = 
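// --- Editor's note (illustrative sketch, not part of the patch) ---------------------
// ConstConvConcatTest::GenerateInput (continued just below) fills the input blob with
// small random values in [-0.2, 0.2] so the GNA-friendly dynamic range is exercised.
// A self-contained sketch of what a generate_float_numbers-style helper does (the
// helper name and seeding are assumptions, not the actual CommonTestUtils code):
#include <random>
#include <vector>

static std::vector<float> generateFloatNumbers(std::size_t count, float min, float max) {
    std::mt19937 gen(42);                                   // fixed seed keeps the test reproducible
    std::uniform_real_distribution<float> dist(min, max);
    std::vector<float> values(count);
    for (auto& v : values)
        v = dist(gen);
    return values;
}
// Example: auto values = generateFloatNumbers(blobSize, -0.2f, 0.2f);
// ------------------------------------------------------------------------------------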
CommonTestUtils::generate_float_numbers(blob->size(), -0.2f, 0.2f); + for (size_t i = 0; i < blob->size(); i++) { + rawBlobDataPtr[i] = values[i]; + } + return blob; +} + +void ConstConvConcatTest::SetUp() { + InferenceEngine::Precision netPrecision; + std::map tempConfig; + convParams convolutionParams; + size_t inputChannels; + size_t outputChannels; + std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam(); + configuration.insert(tempConfig.begin(), tempConfig.end()); + + std::vector inputShape; + std::vector kernelShape; + size_t stride; + std::tie(inputShape, kernelShape, stride) = convolutionParams; + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, { inputShape }); + + std::vector convInputShape = {inputShape[0], inputChannels, 1, inputShape[1] / inputChannels}; + auto reshapePattern1 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape); + auto reshape1 = std::make_shared(params[0], reshapePattern1, false); + + auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1], + 0.0f, 0.1f); + auto conv = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 }, + { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights); + + auto widthAfterConv = (convInputShape[3] - kernelShape[1]) / stride + 1; + std::vector outFormShapes = {1, outputChannels * widthAfterConv }; + + auto const_values = CommonTestUtils::generate_float_numbers(outputChannels * widthAfterConv, -0.2f, 0.2f); + auto constant = ngraph::builder::makeConstant(ngPrc, {1, outputChannels, 1, widthAfterConv}, const_values); + auto concat = ngraph::builder::makeConcat({constant, conv}, 3); + + auto reshapePattern2 = std::make_shared(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, + std::vector{1, 2 * outputChannels * widthAfterConv }); + auto reshape2 = std::make_shared(concat, reshapePattern2, false); + + function = std::make_shared(reshape2, params, "ConstConvConcatTest"); +} +} // namespace SubgraphTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/src/subgraph/eltwise_reshape_activation.cpp b/inference-engine/tests/functional/shared_test_classes/src/subgraph/eltwise_reshape_activation.cpp new file mode 100644 index 00000000000000..74b8e76bf16977 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/src/subgraph/eltwise_reshape_activation.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "ngraph_functions/builders.hpp" +#include "shared_test_classes/subgraph/eltwise_reshape_activation.hpp" + +namespace SubgraphTestsDefinitions { + +using namespace CommonTestUtils; +using namespace InferenceEngine; + +std::string EltwiseReshapeActivation::getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + std::vector> shapes; + std::string targetDevice; + std::map configuration; + std::tie(shapes, netPrecision, targetDevice, configuration) = obj.param; + + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(shapes[0]) << "_"; + result << "AS=" << CommonTestUtils::vec2str(shapes[1]) << "_"; + result << "PRC=" << netPrecision.name() << "_"; + result << "dev=" << targetDevice; + for (auto const& configItem : configuration) { + result << 
"_configItem=" << configItem.first << "_" << configItem.second; + } + return result.str(); +} + +void EltwiseReshapeActivation::SetUp() { + InferenceEngine::Precision netPrecision; + std::vector> shapes; + std::map config; + std::tie(shapes, netPrecision, targetDevice, config) = this->GetParam(); + configuration.insert(config.begin(), config.end()); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto input = ngraph::builder::makeParams(ngPrc, { shapes[0], shapes[0] }); + auto eltw = ngraph::builder::makeEltwise(input[0], input[1], ngraph::helpers::EltwiseTypes::ADD); + + auto reshape_pattern1 = std::make_shared(ngraph::element::i64, ngraph::Shape{shapes[1].size()}, shapes[1]); + auto reshape1 = std::make_shared(eltw, reshape_pattern1, false); + + auto relu = ngraph::builder::makeActivation(reshape1, ngPrc, ngraph::helpers::ActivationTypes::Relu); + + auto reshape_pattern2 = std::make_shared(ngraph::element::i64, ngraph::Shape{shapes[0].size()}, shapes[0]); + auto reshape2 = std::make_shared(relu, reshape_pattern2, false); + + function = std::make_shared(reshape2, input, "EltwiseReshapeActivation"); +} +} // namespace SubgraphTestsDefinitions diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py index 414709fdfd1b08..6c93a383477966 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/merge_xmls.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import xml.etree.ElementTree as ET -from jinja2 import Environment, FileSystemLoader import argparse import os from datetime import datetime @@ -93,6 +92,8 @@ def merge_xml(input_folder_paths: list, output_folder_paths: str): summary.set("timestamp", timestamp) logger.info(f" Processing is finished") + if not os.path.exists(output_folder_paths): + os.mkdir(output_folder_paths) out_file_path = os.path.join(output_folder_paths, "report.xml") with open(out_file_path, "w") as xml_file: xml_file.write(ET.tostring(summary).decode('utf8')) diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py index 6d8298fbe7dd13..dd7f093f2a1ba3 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/summarize.py @@ -91,6 +91,7 @@ 'Sin-0', 'SoftPlus-4', 'Softmax-1', + 'Split-1', 'StridedSlice-1', 'Substract-1', 'Swish-4', diff --git a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp index a24be876758994..43395428f7251e 100644 --- a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp +++ b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp @@ -20,7 +20,7 @@ class MockInferencePluginInternal2 : public InferenceEngine::InferencePluginInte public: MOCK_METHOD2(LoadExeNetworkImpl, std::shared_ptr( const InferenceEngine::CNNNetwork &, const std::map &)); - MOCK_METHOD2(LoadNetwork, 
ExecutableNetwork( + MOCK_METHOD2(LoadNetwork, std::shared_ptr( const InferenceEngine::CNNNetwork &, const std::map &)); MOCK_METHOD1(AddExtension, void(InferenceEngine::IExtensionPtr ext_ptr)); @@ -36,7 +36,7 @@ class MockInferencePluginInternal : public InferenceEngine::InferencePluginInter using InferenceEngine::InferencePluginInternal::ImportNetwork; - ExecutableNetwork ImportNetworkImpl(std::istream& stream, const std::map &) { + ExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& stream, const std::map &) { std::getline(stream, importedString); return {}; } diff --git a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp index 53a4c1f79c1bdc..73208737707264 100644 --- a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp +++ b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp @@ -13,9 +13,9 @@ class MockIInferencePlugin : public InferenceEngine::IInferencePlugin { public: MOCK_METHOD1(AddExtension, void(InferenceEngine::IExtensionPtr)); - MOCK_METHOD2(LoadNetwork, InferenceEngine::ExecutableNetwork( + MOCK_METHOD2(LoadNetwork, std::shared_ptr( const CNNNetwork&, const std::map&)); - MOCK_METHOD2(ImportNetwork, InferenceEngine::ExecutableNetwork( + MOCK_METHOD2(ImportNetwork, std::shared_ptr( const std::string&, const std::map&)); MOCK_METHOD1(SetConfig, void(const std::map &)); @@ -30,12 +30,12 @@ class MockIInferencePlugin : public InferenceEngine::IInferencePlugin { MOCK_METHOD1(CreateContext, InferenceEngine::RemoteContext::Ptr(const InferenceEngine::ParamMap&)); MOCK_METHOD1(GetDefaultContext, InferenceEngine::RemoteContext::Ptr(const InferenceEngine::ParamMap&)); - MOCK_METHOD3(LoadNetwork, InferenceEngine::ExecutableNetwork( + MOCK_METHOD3(LoadNetwork, std::shared_ptr( const InferenceEngine::CNNNetwork&, const std::map&, InferenceEngine::RemoteContext::Ptr)); - MOCK_METHOD2(ImportNetwork, InferenceEngine::ExecutableNetwork( + MOCK_METHOD2(ImportNetwork, std::shared_ptr( std::istream&, const std::map&)); - MOCK_METHOD3(ImportNetwork, InferenceEngine::ExecutableNetwork( + MOCK_METHOD3(ImportNetwork, std::shared_ptr( std::istream&, const InferenceEngine::RemoteContext::Ptr&, const std::map&)); }; diff --git a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.cpp b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.cpp index ebba2977370232..d1e903e18b763f 100644 --- a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.cpp +++ b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.cpp @@ -30,7 +30,7 @@ Parameter MockPlugin::GetMetric(const std::string& name, const std::map MockPlugin::LoadNetwork(const CNNNetwork &network, const std::map &config) { if (_target) { @@ -40,7 +40,7 @@ MockPlugin::LoadNetwork(const CNNNetwork &network, } } -ExecutableNetwork +std::shared_ptr MockPlugin::LoadNetwork(const CNNNetwork& network, const std::map& config, RemoteContext::Ptr context) { if (_target) { @@ -56,22 +56,22 @@ MockPlugin::LoadExeNetworkImpl(const CNNNetwork& network, return {}; } -InferenceEngine::ExecutableNetwork +InferenceEngine::ExecutableNetworkInternal::Ptr MockPlugin::ImportNetworkImpl(std::istream& networkModel, const std::map& config) { if (_target) { - 
return _target->ImportNetwork(networkModel, config); + return std::static_pointer_cast(_target->ImportNetwork(networkModel, config)); } else { IE_THROW(NotImplemented); } } -InferenceEngine::ExecutableNetwork +InferenceEngine::ExecutableNetworkInternal::Ptr MockPlugin::ImportNetworkImpl(std::istream& networkModel, const InferenceEngine::RemoteContext::Ptr& context, const std::map& config) { if (_target) { - return _target->ImportNetwork(networkModel, context, config); + return std::static_pointer_cast(_target->ImportNetwork(networkModel, context, config)); } else { IE_THROW(NotImplemented); } diff --git a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.hpp b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.hpp index 73254a4e51d204..8b8a0beba5a362 100644 --- a/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.hpp +++ b/inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_engine/mock_plugin.hpp @@ -18,23 +18,25 @@ class MockPlugin : public InferenceEngine::InferencePluginInternal { void SetConfig(const std::map& config) override; - InferenceEngine::ExecutableNetwork + std::shared_ptr LoadNetwork(const InferenceEngine::CNNNetwork &network, const std::map &config) override; - InferenceEngine::ExecutableNetwork + std::shared_ptr LoadNetwork(const InferenceEngine::CNNNetwork& network, const std::map& config, InferenceEngine::RemoteContext::Ptr context) override; - InferenceEngine::ExecutableNetworkInternal::Ptr + std::shared_ptr LoadExeNetworkImpl(const InferenceEngine::CNNNetwork& network, const std::map& config) override; - InferenceEngine::ExecutableNetwork ImportNetworkImpl(std::istream& networkModel, + std::shared_ptr + ImportNetworkImpl(std::istream& networkModel, const std::map& config) override; - InferenceEngine::ExecutableNetwork ImportNetworkImpl(std::istream& networkModel, + std::shared_ptr + ImportNetworkImpl(std::istream& networkModel, const InferenceEngine::RemoteContext::Ptr& context, const std::map& config) override; diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/common/dequantization_operations.hpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/common/dequantization_operations.hpp index 4c9f1a66f17942..db9702ab631022 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/common/dequantization_operations.hpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/common/dequantization_operations.hpp @@ -109,6 +109,7 @@ class DequantizationOperations { bool operator==(const DequantizationOperations& value) const noexcept { return equal(value); } + void setPrecision(const ngraph::element::Type& type) noexcept; Convert convert; Subtract subtract; diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/concat_function.hpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/concat_function.hpp index e69fd113be56b5..95da3db91efbad 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/concat_function.hpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/concat_function.hpp @@ -114,7 +114,8 @@ class ConcatFunction { const DequantizationOperations::Convert& convert2, const DequantizationOperations& dequantization2, const 
ngraph::element::Type precisionAfterOperation, - const DequantizationOperations& dequantizationAfter); + const DequantizationOperations& dequantizationAfter, + const std::int64_t& axis); static std::shared_ptr getReferenceWithNeighbors( const ngraph::element::Type precision, diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/common/dequantization_operations.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/common/dequantization_operations.cpp index eccd81c08aa7f1..e5da339d912281 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/common/dequantization_operations.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/common/dequantization_operations.cpp @@ -185,6 +185,14 @@ DequantizationOperations::DequantizationOperations( multiply(multiply) {} +void DequantizationOperations::setPrecision(const ngraph::element::Type& type) noexcept { + convert.outPrecision = type; + subtract.constantPrecision = type; + subtract.outPrecision = type; + multiply.constantPrecision = type; + multiply.outPrecision = type; +} + bool DequantizationOperations::empty() const noexcept { return convert.empty() && subtract.empty() && multiply.empty(); } diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/concat_function.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/concat_function.cpp index 2d9bb24453a948..8b251a4d9be959 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/concat_function.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/concat_function.cpp @@ -752,7 +752,8 @@ std::shared_ptr ConcatFunction::get( const DequantizationOperations::Convert& convert2, const DequantizationOperations& dequantization2, const ngraph::element::Type precisionAfterOperation, - const DequantizationOperations& dequantizationAfter) { + const DequantizationOperations& dequantizationAfter, + const std::int64_t& axis) { const auto input1 = std::make_shared(inputPrecision, inputShape); input1->set_friendly_name("input1"); @@ -775,7 +776,7 @@ std::shared_ptr ConcatFunction::get( parent2 = makeDequantization(parent2, dequantization2); } - const std::shared_ptr concat = std::make_shared(ngraph::OutputVector{ parent1, parent2 }, 1); + const std::shared_ptr concat = std::make_shared(ngraph::OutputVector{ parent1, parent2 }, axis); auto& rtInfo = concat->get_rt_info(); rtInfo["Variant::std::string"] = std::make_shared>("concat"); @@ -989,6 +990,13 @@ std::shared_ptr ConcatFunction::getReferenceWithSplitedInterme input2->set_friendly_name("input2"); const auto fakeQuantize2 = makeFakeQuantizeTypeRelaxed(input2, precision, fqOnData2); + replace_node( + fakeQuantize2->get_input_node_shared_ptr(3), + ngraph::pass::low_precision::NetworkHelper::toScalarIfPossible(fakeQuantize2->get_input_node_shared_ptr(3))); + replace_node( + fakeQuantize2->get_input_node_shared_ptr(4), + ngraph::pass::low_precision::NetworkHelper::toScalarIfPossible(fakeQuantize2->get_input_node_shared_ptr(4))); + fakeQuantize2->set_friendly_name("fakeQuantize2"); low_precision::NetworkHelper::setOutDataPrecisionForTypeRelaxed(fakeQuantize2, precisionAfterOperation); const auto deqBefore2 = makeDequantization(fakeQuantize2, dequantizationBefore1); diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/convolution_function.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/convolution_function.cpp index 5159d3faa04cf8..36091dd532302f 100644 --- 
a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/convolution_function.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/convolution_function.cpp @@ -147,7 +147,7 @@ std::shared_ptr ConvolutionFunction::getOriginalWithIncorrectW fakeQuantizeOnWeights.outputLowValues, fakeQuantizeOnWeights.outputHighValues); const auto subtract = isCorrect ? nullptr : std::make_shared(fqOnWeights, - std::make_shared(ngraph::element::f32, Shape{ 1, 1, 1, 1 }, 3.0f)); + std::make_shared(precision, Shape{ 1, 1, 1, 1 }, 3.0f)); const auto convolution = std::make_shared( fakeQuantizeOnData.empty() ? input : fqOnData, diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/fuse_convert_function.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/fuse_convert_function.cpp index 8d8e1b2ab6756a..a191bfdd09b01d 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/fuse_convert_function.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/fuse_convert_function.cpp @@ -54,8 +54,9 @@ std::shared_ptr FuseConvertFunction::getWithFQ( ngraph::Shape(inputShape)); parent = input1; } - - const std::shared_ptr dequantizationOp = makeDequantization(parent, dequantization); + auto deqStructure = dequantization; + deqStructure.multiply.outPrecision = inputPrecision; + const std::shared_ptr dequantizationOp = makeDequantization(parent, deqStructure); std::shared_ptr input2 = std::make_shared( inputPrecision, @@ -68,7 +69,7 @@ std::shared_ptr FuseConvertFunction::getWithFQ( // just some non-transparent layer const auto power = std::make_shared( fakeQuantizeOnActivations, - std::make_shared(element::f32, Shape{}, std::vector{2.f})); + std::make_shared(inputPrecision, Shape{}, std::vector{2.f})); const auto add = std::make_shared( dequantizationOp, diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/group_convolution_function.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/group_convolution_function.cpp index 2d7297baa193ed..4d3b6153f08c56 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/group_convolution_function.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/group_convolution_function.cpp @@ -70,7 +70,7 @@ std::shared_ptr createWeightsOriginal( ngraph::opset1::Constant::create( element::i64, Shape{ 5 }, - std::vector({ groupCount, outputChannelsCount / groupCount, inputChannelsPerGroup, 7, 7 })), + std::vector({ groupCount, outputChannelsCount / groupCount, inputChannelsPerGroup, kernelSize, kernelSize })), true); } @@ -146,7 +146,7 @@ std::shared_ptr GroupConvolutionFunction::getOriginal( // TODO: pass as argument //const size_t groupCount = 3ul; const size_t outputChannelsCount = outputShape[1]; - const size_t kernelSize = 7ul; + const size_t kernelSize = 5ul; const size_t inputChannelsCount = inputShape[1]; std::vector weightsValues = { 1.f }; diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/mat_mul_function.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/mat_mul_function.cpp index 82f54f048a8892..457a9a868c6b52 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/mat_mul_function.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/mat_mul_function.cpp @@ -313,12 +313,15 @@ std::shared_ptr MatMulFunction::getOriginal( const auto dequantizationOnData = makeFakeQuantize(input, precision, fqOnData); const std::shared_ptr 
weightsConst = std::make_shared( - weights.outPrecision, + weights.outPrecision.is_real() ? precision : weights.outPrecision, weights.shape, weights.values); const std::shared_ptr fakeQuantize = fqOnWeights.empty() ? nullptr : makeFakeQuantize(weightsConst, precision, fqOnWeights); - const auto dequantizationOnWeights = makeDequantization(fakeQuantize == nullptr ? weightsConst : fakeQuantize, deqOnWeights); + + auto deqStructure = deqOnWeights; + deqStructure.setPrecision(precision); + const auto dequantizationOnWeights = makeDequantization(fakeQuantize == nullptr ? weightsConst : fakeQuantize, deqStructure); const std::shared_ptr matMul = std::make_shared( dequantizationOnData, diff --git a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/transformations_after_split_function.cpp b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/transformations_after_split_function.cpp index e7ccd1f65989aa..69dafb9422e892 100644 --- a/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/transformations_after_split_function.cpp +++ b/inference-engine/tests/ngraph_helpers/lpt_ngraph_functions/src/transformations_after_split_function.cpp @@ -41,7 +41,12 @@ std::shared_ptr TransformationsAfterSplitFunction::get(const std::stri std::shared_ptr TransformationsAfterSplitFunction::getLayerByTransformationName( const std::string transformationName, const Output parent) { - if (transformationName == "AddTransformation") { + if (transformationName == "AddTransformationWithoutConcat") { + const auto dequantization = makeDequantization(parent, { {}, {}, { 3.f } }); + const auto addConstant = opset1::Constant::create(element::u8, Shape{}, { 128.f }); + return std::make_shared(dequantization, addConstant); + } + if (transformationName == "AddTransformationWithConcat") { const auto dequantization = makeDequantization(parent, { {element::f32}, {}, { 0.1f } }); const auto addConstant = opset1::Constant::create(element::f32, Shape{}, { 128.f }); return std::make_shared(dequantization, addConstant); diff --git a/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/pass/convert_prc.hpp b/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/pass/convert_prc.hpp index fe630b104e5a8e..26516a6617e7eb 100644 --- a/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/pass/convert_prc.hpp +++ b/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/pass/convert_prc.hpp @@ -9,6 +9,7 @@ #include #include +#include namespace ngraph { namespace pass { @@ -61,12 +62,36 @@ class ConvertParametersPrecision : public MatcherPass { } }; +template +class ConvertConvertLayerOutputPrecision : public MatcherPass { +public: + ConvertConvertLayerOutputPrecision() { + auto convert = ngraph::pattern::wrap_type(); + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto convert = std::dynamic_pointer_cast(m.get_match_root()); + if (!convert) { + return false; + } + + if (convert->get_convert_element_type() == ngraph::element::Type(from)) { + convert->set_convert_element_type(to); + return true; + } + return false; + }; + + auto m = std::make_shared(convert, "ConvertConvertLayerPrecision"); + register_matcher(m, callback); + } +}; + template class ConvertPrecision : public ngraph::pass::GraphRewrite { public: ConvertPrecision() { add_matcher>(); add_matcher>(); + add_matcher>(); } }; } // namespace pass diff --git 
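// --- Editor's note (illustrative sketch, not part of the patch) ---------------------
// The ConvertConvertLayerOutputPrecision matcher added above is registered inside the
// templated ngraph::pass::ConvertPrecision<from, to> GraphRewrite, so explicit Convert
// nodes targeting `from` are retargeted to `to` alongside constants and parameters.
// A sketch of how such a rewrite is typically applied to a test function (assuming the
// template takes ngraph::element::Type_t values, as in the header above):
#include <ngraph/pass/manager.hpp>
#include "ngraph_functions/pass/convert_prc.hpp"

void convertF16FunctionToF32(std::shared_ptr<ngraph::Function> function) {
    ngraph::pass::Manager manager;
    manager.register_pass<ngraph::pass::ConvertPrecision<ngraph::element::Type_t::f16,
                                                          ngraph::element::Type_t::f32>>();
    manager.run_passes(function);   // parameters, constants and Convert outputs now use f32
}
// ------------------------------------------------------------------------------------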
a/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp b/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp index f5c1dc79653d72..32aa7ab3e97d0f 100644 --- a/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp +++ b/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp @@ -120,7 +120,9 @@ enum ActivationTypes { Swish, HSigmoid, RoundHalfToEven, - RoundHalfAwayFromZero + RoundHalfAwayFromZero, + GeluErf, + GeluTanh }; enum EltwiseTypes { diff --git a/inference-engine/tests/ngraph_helpers/ngraph_functions/src/activation.cpp b/inference-engine/tests/ngraph_helpers/ngraph_functions/src/activation.cpp index 7737b299b48cec..97dea94963aad8 100644 --- a/inference-engine/tests/ngraph_helpers/ngraph_functions/src/activation.cpp +++ b/inference-engine/tests/ngraph_helpers/ngraph_functions/src/activation.cpp @@ -39,7 +39,7 @@ std::shared_ptr makeActivation(const ngraph::Output &in, case ngraph::helpers::ActivationTypes::Abs: return std::make_shared(in); case ngraph::helpers::ActivationTypes::Gelu: - return std::make_shared(in); + return std::make_shared(in); case ngraph::helpers::ActivationTypes::Clamp: return std::make_shared(in, constantsValue[0], constantsValue[1]); case ngraph::helpers::ActivationTypes::Negative: @@ -107,6 +107,10 @@ std::shared_ptr makeActivation(const ngraph::Output &in, return std::make_shared(in, ngraph::op::v5::Round::RoundMode::HALF_TO_EVEN); case ngraph::helpers::ActivationTypes::RoundHalfAwayFromZero: return std::make_shared(in, ngraph::op::v5::Round::RoundMode::HALF_AWAY_FROM_ZERO); + case ngraph::helpers::ActivationTypes::GeluErf: + return std::make_shared(in, ngraph::op::GeluApproximationMode::ERF); + case ngraph::helpers::ActivationTypes::GeluTanh: + return std::make_shared(in, ngraph::op::GeluApproximationMode::TANH); default: throw std::runtime_error("Can't create layer for this activation type"); } diff --git a/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_memory_state_internal_test.cpp b/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_memory_state_internal_test.cpp index 217d624c79e516..a998f9b758f0f3 100644 --- a/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_memory_state_internal_test.cpp +++ b/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_memory_state_internal_test.cpp @@ -12,6 +12,8 @@ #include "unit_test_utils/mocks/cpp_interfaces/interface/mock_ivariable_state_internal.hpp" #include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp" #include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iasync_infer_request_internal.hpp" +#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp" +#include "ie_plugin_cpp.hpp" using namespace ::testing; using namespace std; @@ -31,6 +33,20 @@ class VariableStateTests : public ::testing::Test { shared_ptr mockInferRequestInternal; shared_ptr mockVariableStateInternal; + struct TestPluginInternal : public MockIInferencePlugin { + TestPluginInternal(const std::shared_ptr& mockIExeNet_) : mockIExeNet{mockIExeNet_} {} + std::shared_ptr LoadNetwork(const CNNNetwork&, const std::map&) override { + return mockIExeNet; + } + QueryNetworkResult QueryNetwork(const CNNNetwork&, const std::map&) const override {return {};} + std::shared_ptr mockIExeNet; + }; + struct TestPlugin : public 
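// --- Editor's note (illustrative sketch, not part of the patch) ---------------------
// The new GeluErf / GeluTanh activation kinds above map to the two approximation modes
// of the Gelu-7 operation:
//   ERF  : Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
//   TANH : Gelu(x) ~ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// In makeActivation they are built roughly like this (the op type is assumed to be
// ngraph::op::v7::Gelu, the opset version that carries the approximation-mode attribute):
auto geluErf  = std::make_shared<ngraph::op::v7::Gelu>(in, ngraph::op::GeluApproximationMode::ERF);
auto geluTanh = std::make_shared<ngraph::op::v7::Gelu>(in, ngraph::op::GeluApproximationMode::TANH);
// ------------------------------------------------------------------------------------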
InferenceEngine::InferencePlugin { + TestPlugin(std::shared_ptr mockIExeNet) : + InferenceEngine::InferencePlugin(InferenceEngine::details::SOPointer{ + new TestPluginInternal{mockIExeNet}}) {} + }; + virtual void SetUp() { mockExeNetworkInternal = make_shared(); mockInferRequestInternal = make_shared(); @@ -40,11 +56,11 @@ class VariableStateTests : public ::testing::Test { TEST_F(VariableStateTests, ExecutableNetworkCanConvertOneVariableStateFromCppToAPI) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn(1); toReturn[0] = mockVariableStateInternal; - EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(2).WillRepeatedly(Return(toReturn)); + EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); auto state = net.QueryState(); ASSERT_EQ(state.size(), 1); @@ -53,7 +69,7 @@ TEST_F(VariableStateTests, ExecutableNetworkCanConvertOneVariableStateFromCppToA TEST_F(VariableStateTests, ExecutableNetworkCanConvertZeroVariableStateFromCppToAPI) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).WillOnce(Return(toReturn)); @@ -65,12 +81,12 @@ TEST_F(VariableStateTests, ExecutableNetworkCanConvertZeroVariableStateFromCppTo TEST_F(VariableStateTests, ExecutableNetworkCanConvert2VariableStatesFromCPPtoAPI) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); toReturn.push_back(mockVariableStateInternal); - EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(3).WillRepeatedly(Return(toReturn)); + EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); auto state = net.QueryState(); ASSERT_EQ(state.size(), 2); @@ -79,11 +95,11 @@ TEST_F(VariableStateTests, ExecutableNetworkCanConvert2VariableStatesFromCPPtoAP TEST_F(VariableStateTests, VariableStatePropagatesReset) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); - EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(2).WillRepeatedly(Return(toReturn)); + EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); EXPECT_CALL(*mockVariableStateInternal.get(), Reset()).Times(1); auto state = net.QueryState(); @@ -93,11 +109,11 @@ TEST_F(VariableStateTests, VariableStatePropagatesReset) { TEST_F(VariableStateTests, VariableStatePropagatesExceptionsFromReset) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); - EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(2).WillRepeatedly(Return(toReturn)); + EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); EXPECT_CALL(*mockVariableStateInternal.get(), Reset()).WillOnce(Throw(std::logic_error("some error"))); auto state = net.QueryState(); @@ -107,11 
+123,11 @@ TEST_F(VariableStateTests, VariableStatePropagatesExceptionsFromReset) { TEST_F(VariableStateTests, VariableStatePropagatesGetName) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); - EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(2).WillRepeatedly(Return(toReturn)); + EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); EXPECT_CALL(*mockVariableStateInternal.get(), GetName()).WillOnce(Return("someName")); auto state = net.QueryState(); @@ -121,61 +137,54 @@ TEST_F(VariableStateTests, VariableStatePropagatesGetName) { TEST_F(VariableStateTests, VariableStatePropagatesGetNameWithZeroLen) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); EXPECT_CALL(*mockVariableStateInternal.get(), GetName()).WillOnce(Return("someName")); - IVariableState::Ptr pState; - - static_cast(net)->QueryState(pState, 0, nullptr); - char *name = reinterpret_cast(1); - EXPECT_NO_THROW(pState->GetName(name, 0, nullptr)); + auto pState = net.QueryState().front(); + EXPECT_NO_THROW(pState.GetName()); IE_SUPPRESS_DEPRECATED_END } TEST_F(VariableStateTests, VariableStatePropagatesGetNameWithLenOfOne) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); EXPECT_CALL(*mockVariableStateInternal.get(), GetName()).WillOnce(Return("someName")); - IVariableState::Ptr pState; - - static_cast(net)->QueryState(pState, 0, nullptr); - char name[1]; - EXPECT_NO_THROW(pState->GetName(name, 1, nullptr)); - EXPECT_STREQ(name, ""); + auto pState = net.QueryState().front(); + std::string name; + EXPECT_NO_THROW(name = pState.GetName()); + EXPECT_EQ(name, "someName"); IE_SUPPRESS_DEPRECATED_END } TEST_F(VariableStateTests, VariableStatePropagatesGetNameWithLenOfTwo) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; toReturn.push_back(mockVariableStateInternal); EXPECT_CALL(*mockExeNetworkInternal.get(), QueryState()).Times(1).WillRepeatedly(Return(toReturn)); EXPECT_CALL(*mockVariableStateInternal.get(), GetName()).WillOnce(Return("someName")); - IVariableState::Ptr pState; - - static_cast(net)->QueryState(pState, 0, nullptr); - char name[2]; - EXPECT_NO_THROW(pState->GetName(name, 2, nullptr)); - EXPECT_STREQ(name, "s"); + auto pState = net.QueryState().front(); + std::string name; + EXPECT_NO_THROW(name = pState.GetName()); + EXPECT_EQ(name, "someName"); IE_SUPPRESS_DEPRECATED_END } TEST_F(VariableStateTests, VariableStateCanPropagateSetState) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; Blob::Ptr saver; toReturn.push_back(mockVariableStateInternal); @@ -195,7 +204,7 @@ 
TEST_F(VariableStateTests, VariableStateCanPropagateSetState) { TEST_F(VariableStateTests, VariableStateCanPropagateGetLastState) { IE_SUPPRESS_DEPRECATED_START - auto net = make_executable_network(mockExeNetworkInternal); + auto net = TestPlugin{mockExeNetworkInternal}.LoadNetwork({}, {}); std::vector toReturn; float data[] = {123, 124, 125}; diff --git a/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_plugin_test.cpp b/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_plugin_test.cpp index a4ad1a82a51342..f76806b2193dd6 100644 --- a/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_plugin_test.cpp +++ b/inference-engine/tests/unit/inference_engine/cpp_interfaces/ie_plugin_test.cpp @@ -8,6 +8,9 @@ #include #include +#include +#include + #include "unit_test_utils/mocks/mock_not_empty_icnn_network.hpp" #include "unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp" #include "unit_test_utils/mocks/cpp_interfaces/impl/mock_executable_thread_safe_default.hpp" @@ -48,7 +51,7 @@ class InferenceEnginePluginInternalTest : public ::testing::Test { } void getInferRequestWithMockImplInside(IInferRequest::Ptr &request) { - ExecutableNetwork exeNetwork; + IExecutableNetworkInternal::Ptr exeNetwork; InputsDataMap inputsInfo; mockNotEmptyNet->getInputsInfo(inputsInfo); OutputsDataMap outputsInfo; @@ -58,7 +61,7 @@ class InferenceEnginePluginInternalTest : public ::testing::Test { EXPECT_CALL(*mock_plugin_impl.get(), LoadExeNetworkImpl(_, _)).WillOnce(Return(mockExeNetworkTS)); EXPECT_CALL(*mockExeNetworkTS.get(), CreateInferRequestImpl(_, _)).WillOnce(Return(mockInferRequestInternal)); ASSERT_NO_THROW(exeNetwork = plugin->LoadNetwork(InferenceEngine::CNNNetwork(mockNotEmptyNet), {})); - ASSERT_NO_THROW(request = exeNetwork.CreateInferRequest()); + ASSERT_NO_THROW(request = exeNetwork->CreateInferRequest()); } }; diff --git a/inference-engine/tests/unit/inference_engine/ie_compilation_context_test.cpp b/inference-engine/tests/unit/inference_engine/ie_compilation_context_test.cpp index c3d428dbf8b240..a52ce386a7aee5 100644 --- a/inference-engine/tests/unit/inference_engine/ie_compilation_context_test.cpp +++ b/inference-engine/tests/unit/inference_engine/ie_compilation_context_test.cpp @@ -349,6 +349,33 @@ TEST(NetworkContext_CNNNetwork, HashWithDifferentMeanValues) { NetworkCompilationContext::computeHash(net3, {})); } +// Verify all internal hash calculations are thread-safe (like ngraph::function serialization) +TEST(NetworkContext_CNNNetwork, HashOfSameMultiThreading) { + auto net1 = createNetwork(); + auto net2 = createNetwork(); + std::atomic_bool fail{false}; + const auto TEST_DURATION_MS = 1000; + auto start = high_resolution_clock::now(); + int t1Count = 0, t2Count = 0; + auto threadFun = [&](int& count) { + do { + count++; + auto hash1 = NetworkCompilationContext::computeHash(net1, {}); + auto hash2 = NetworkCompilationContext::computeHash(net2, {}); + if (hash1 != hash2) { + fail = true; + break; + } + } while (!fail && duration_cast(high_resolution_clock::now() - start).count() < TEST_DURATION_MS); + }; + std::thread t1(threadFun, std::ref(t1Count)); + std::thread t2(threadFun, std::ref(t2Count)); + t1.join(); + t2.join(); + std::cout << "Hash threading test finished. 
Total runs = " << t1Count + t2Count << std::endl; + ASSERT_FALSE(fail); +} + //////////////////////////////////////////// TEST(NetworkContext_ModelName, HashOfSame) { diff --git a/inference-engine/tests/unit/inference_engine/ie_executable_network_test.cpp b/inference-engine/tests/unit/inference_engine/ie_executable_network_test.cpp index d2f39bc4a4d2f8..d8bd5abce575e1 100644 --- a/inference-engine/tests/unit/inference_engine/ie_executable_network_test.cpp +++ b/inference-engine/tests/unit/inference_engine/ie_executable_network_test.cpp @@ -8,12 +8,16 @@ #include #include "cpp/ie_executable_network.hpp" +#include "ie_iexecutable_network.hpp" +#include "ie_plugin_cpp.hpp" #include "unit_test_utils/mocks/mock_iexecutable_network.hpp" #include "unit_test_utils/mocks/mock_iinfer_request.hpp" #include "unit_test_utils/mocks/mock_ie_ivariable_state.hpp" #include "unit_test_utils/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp" #include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp" +#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_ivariable_state_internal.hpp" +#include "unit_test_utils/mocks/cpp_interfaces/interface/mock_iinference_plugin.hpp" using testing::_; using testing::Throw; @@ -31,128 +35,91 @@ using testing::SetArgReferee; // 8. RemoteContext::Ptr GetContext() -TEST(ExecutableNetworkConstructorTests, ThrowsIfConstructFromNullptr) { - // TODO issue: 26390; ExecutableNetwork's constructor shouldn't be available - EXPECT_NO_THROW(InferenceEngine::ExecutableNetwork exeNet{}); - - EXPECT_THROW(InferenceEngine::ExecutableNetwork exeNet{nullptr}, InferenceEngine::Exception); -} - -TEST(ExecutableNetworkConstructorTests, CanConstruct) { - std::shared_ptr mockIExeNet_p = std::make_shared(); - InferenceEngine::ExecutableNetwork exeNet{mockIExeNet_p}; -} - -TEST(ExecutableNetworkDestructorTests, Destruct) { - std::shared_ptr mockIExeNet_p = std::make_shared(); - { - InferenceEngine::ExecutableNetwork exeNet{mockIExeNet_p}; - } - // Call of destructor should decrease counter of shared_ptr - ASSERT_EQ(mockIExeNet_p.use_count(), 1); -} - class ExecutableNetworkTests : public ::testing::Test { protected: - std::shared_ptr mockIExeNet_p; - std::unique_ptr exeNetwork; + std::shared_ptr mockIExeNet; + InferenceEngine::ExecutableNetwork exeNetwork; + + struct TestPluginInternal : public MockIInferencePlugin { + TestPluginInternal(const std::shared_ptr& mockIExeNet_) : mockIExeNet{mockIExeNet_} {} + std::shared_ptr LoadNetwork(const CNNNetwork&, const std::map&) override { + return mockIExeNet; + } + QueryNetworkResult QueryNetwork(const CNNNetwork&, const std::map&) const override { + IE_THROW(NotImplemented); + } + std::shared_ptr mockIExeNet; + }; + struct TestPlugin : public InferenceEngine::InferencePlugin { + TestPlugin(std::shared_ptr mockIExeNet) : + InferenceEngine::InferencePlugin{InferenceEngine::details::SOPointer{ + new TestPluginInternal{mockIExeNet}}} {} + }; virtual void TearDown() { - mockIExeNet_p.reset(); - exeNetwork.reset(); + mockIExeNet.reset(); + exeNetwork = {}; } virtual void SetUp() { - mockIExeNet_p = std::make_shared(); - ASSERT_EQ(exeNetwork, nullptr); - exeNetwork = std::unique_ptr( - new InferenceEngine::ExecutableNetwork(mockIExeNet_p)); - ASSERT_NE(exeNetwork, nullptr); + mockIExeNet = std::make_shared(); + exeNetwork = TestPlugin{mockIExeNet}.LoadNetwork({}, {}); } }; TEST_F(ExecutableNetworkTests, GetOutputsInfoThrowsIfReturnErr) { - EXPECT_CALL(*mockIExeNet_p.get(), GetOutputsInfo(_, _)) + 
EXPECT_CALL(*mockIExeNet.get(), GetOutputsInfo()) .Times(1) - .WillOnce(Return(InferenceEngine::GENERAL_ERROR)); + .WillOnce(Throw(InferenceEngine::GeneralError{""})); - ASSERT_THROW(exeNetwork->GetOutputsInfo(), InferenceEngine::Exception); + ASSERT_THROW(exeNetwork.GetOutputsInfo(), InferenceEngine::Exception); } TEST_F(ExecutableNetworkTests, GetOutputsInfo) { - EXPECT_CALL(*mockIExeNet_p.get(), GetOutputsInfo(_, _)) - .Times(1) - .WillOnce(Return(InferenceEngine::OK)); - InferenceEngine::ConstOutputsDataMap data; - ASSERT_NO_THROW(data = exeNetwork->GetOutputsInfo()); + EXPECT_CALL(*mockIExeNet.get(), GetOutputsInfo()).Times(1).WillRepeatedly(Return(InferenceEngine::ConstOutputsDataMap{})); + + ASSERT_NO_THROW(data = exeNetwork.GetOutputsInfo()); ASSERT_EQ(data, InferenceEngine::ConstOutputsDataMap{}); } TEST_F(ExecutableNetworkTests, GetInputsInfoThrowsIfReturnErr) { - EXPECT_CALL(*mockIExeNet_p.get(), GetInputsInfo(_, _)) + EXPECT_CALL(*mockIExeNet.get(), GetInputsInfo()) .Times(1) - .WillOnce(Return(InferenceEngine::GENERAL_ERROR)); + .WillOnce(Throw(InferenceEngine::GeneralError{""})); - ASSERT_THROW(exeNetwork->GetInputsInfo(), InferenceEngine::Exception); + ASSERT_THROW(exeNetwork.GetInputsInfo(), InferenceEngine::Exception); } TEST_F(ExecutableNetworkTests, GetInputsInfo) { - EXPECT_CALL(*mockIExeNet_p.get(), GetInputsInfo(_, _)) - .Times(1) - .WillOnce(Return(InferenceEngine::OK)); + EXPECT_CALL(*mockIExeNet.get(), GetInputsInfo()).Times(1).WillRepeatedly(Return(InferenceEngine::ConstInputsDataMap{})); InferenceEngine::ConstInputsDataMap info; - ASSERT_NO_THROW(info = exeNetwork->GetInputsInfo()); + ASSERT_NO_THROW(info = exeNetwork.GetInputsInfo()); ASSERT_EQ(info, InferenceEngine::ConstInputsDataMap{}); } TEST_F(ExecutableNetworkTests, resetThrowsIfResetToNullptr) { - InferenceEngine::IExecutableNetwork::Ptr mockIExeNet_p_2{}; - ASSERT_THROW(exeNetwork->reset(mockIExeNet_p_2), InferenceEngine::Exception); -} - -TEST_F(ExecutableNetworkTests, reset) { - InferenceEngine::IExecutableNetwork::Ptr mockIExeNet_p_2 = std::make_shared(); - - exeNetwork->reset(mockIExeNet_p_2); - - InferenceEngine::IExecutableNetwork::Ptr exeNet_p = *exeNetwork; // use of IExecutableNetwork::Ptr& - EXPECT_NE(exeNet_p, mockIExeNet_p); - EXPECT_EQ(exeNet_p, mockIExeNet_p_2); -} - -TEST_F(ExecutableNetworkTests, OperatorAmpersand) { - InferenceEngine::IExecutableNetwork::Ptr exeNet_p = *exeNetwork; // use of IExecutableNetwork::Ptr& - ASSERT_EQ(exeNet_p, mockIExeNet_p); + InferenceEngine::IExecutableNetwork::Ptr mockIExeNet_2{}; + ASSERT_THROW(exeNetwork.reset(mockIExeNet_2), InferenceEngine::Exception); } IE_SUPPRESS_DEPRECATED_START TEST_F(ExecutableNetworkTests, QueryStateThrowsIfReturnErr) { - EXPECT_CALL(*mockIExeNet_p.get(), QueryState(_, _, _)) + EXPECT_CALL(*mockIExeNet.get(), QueryState()) .Times(1) - .WillOnce(Return(InferenceEngine::GENERAL_ERROR)); - EXPECT_THROW(exeNetwork->QueryState(), InferenceEngine::Exception); -} - -TEST_F(ExecutableNetworkTests, QueryStateIfReturnOutOfBounds) { - EXPECT_CALL(*mockIExeNet_p.get(), QueryState(_, _, _)) - .Times(1) - .WillOnce(Return(InferenceEngine::OUT_OF_BOUNDS)); - std::vector MemState_; - EXPECT_NO_THROW(MemState_ = exeNetwork->QueryState()); - EXPECT_EQ(MemState_.size(), 0); + .WillOnce(Throw(InferenceEngine::GeneralError{""})); + EXPECT_THROW(exeNetwork.QueryState(), InferenceEngine::Exception); } TEST_F(ExecutableNetworkTests, QueryState) { - std::shared_ptr mockIMemState_p = std::make_shared(); - EXPECT_CALL(*mockIExeNet_p.get(), QueryState(_, 
_, _)) - .Times(2) - .WillOnce(DoAll(SetArgReferee<0>(mockIMemState_p), Return(InferenceEngine::OK))) - .WillOnce(Return(InferenceEngine::OUT_OF_BOUNDS)); + auto mockIMemState_p = std::make_shared(); + EXPECT_CALL(*mockIExeNet.get(), QueryState()) + .Times(1) + .WillOnce(Return(std::vector>(1, mockIMemState_p))); std::vector MemState_v; - EXPECT_NO_THROW(MemState_v = exeNetwork->QueryState()); + EXPECT_NO_THROW(MemState_v = exeNetwork.QueryState()); EXPECT_EQ(MemState_v.size(), 1); } IE_SUPPRESS_DEPRECATED_END @@ -173,42 +140,39 @@ class ExecutableNetworkWithIInferReqTests : public ExecutableNetworkTests { }; TEST_F(ExecutableNetworkWithIInferReqTests, CanCreateInferRequest) { - EXPECT_CALL(*mockIExeNet_p.get(), CreateInferRequest(_, _)) - .WillOnce(DoAll(SetArgReferee<0>(mockIInferReq_p), Return(InferenceEngine::OK))); + EXPECT_CALL(*mockIExeNet.get(), CreateInferRequest()).WillOnce(Return(mockIInferReq_p)); InferRequest actualInferReq; - ASSERT_NO_THROW(actualInferReq = exeNetwork->CreateInferRequest()); + ASSERT_NO_THROW(actualInferReq = exeNetwork.CreateInferRequest()); ASSERT_EQ(mockIInferReq_p, static_cast(actualInferReq)); } TEST_F(ExecutableNetworkWithIInferReqTests, CreateInferRequestThrowsIfReturnNotOK) { - EXPECT_CALL(*mockIExeNet_p.get(), CreateInferRequest(_, _)).WillOnce(Return(InferenceEngine::GENERAL_ERROR)); - ASSERT_THROW(exeNetwork->CreateInferRequest(), InferenceEngine::Exception); + EXPECT_CALL(*mockIExeNet.get(), CreateInferRequest()).WillOnce(Throw(InferenceEngine::GeneralError{""})); + ASSERT_THROW(exeNetwork.CreateInferRequest(), InferenceEngine::Exception); } TEST_F(ExecutableNetworkWithIInferReqTests, CreateInferRequestThrowsIfSetRequestToNullptr) { - EXPECT_CALL(*mockIExeNet_p.get(), CreateInferRequest(_, _)) - .WillOnce(DoAll(SetArgReferee<0>(nullptr), Return(InferenceEngine::OK))); - ASSERT_THROW(exeNetwork->CreateInferRequest(), InferenceEngine::Exception); + EXPECT_CALL(*mockIExeNet.get(), CreateInferRequest()) + .WillOnce(Return(std::shared_ptr{})); + ASSERT_THROW(exeNetwork.CreateInferRequest(), InferenceEngine::Exception); } // CreateInferRequestPtr TEST_F(ExecutableNetworkWithIInferReqTests, CanCreateInferRequestPtr) { - EXPECT_CALL(*mockIExeNet_p.get(), CreateInferRequest(_, _)) - .WillOnce(DoAll(SetArgReferee<0>(mockIInferReq_p), Return(InferenceEngine::OK))); + EXPECT_CALL(*mockIExeNet.get(), CreateInferRequest()).WillOnce(Return(mockIInferReq_p)); InferRequest::Ptr actualInferReq; - ASSERT_NO_THROW(actualInferReq = exeNetwork->CreateInferRequestPtr()); + ASSERT_NO_THROW(actualInferReq = exeNetwork.CreateInferRequestPtr()); ASSERT_EQ(mockIInferReq_p, static_cast(*actualInferReq.get())); } TEST_F(ExecutableNetworkWithIInferReqTests, CreateInferRequestPtrThrowsIfReturnNotOK) { - EXPECT_CALL(*mockIExeNet_p.get(), CreateInferRequest(_, _)).WillOnce(Return(InferenceEngine::GENERAL_ERROR)); - ASSERT_THROW(exeNetwork->CreateInferRequestPtr(), InferenceEngine::Exception); + EXPECT_CALL(*mockIExeNet.get(), CreateInferRequest()).WillOnce(Throw(InferenceEngine::GeneralError{""})); + ASSERT_THROW(exeNetwork.CreateInferRequestPtr(), InferenceEngine::Exception); } TEST_F(ExecutableNetworkWithIInferReqTests, CreateInferRequestPtrThrowsIfSetRequestToNullptr) { - EXPECT_CALL(*mockIExeNet_p.get(), CreateInferRequest(_, _)) - .WillOnce(DoAll(SetArgReferee<0>(nullptr), Return(InferenceEngine::OK))); - ASSERT_THROW(exeNetwork->CreateInferRequestPtr(), InferenceEngine::Exception); + EXPECT_CALL(*mockIExeNet.get(), CreateInferRequest()).WillOnce(Return(std::shared_ptr{})); 
+ ASSERT_THROW(exeNetwork.CreateInferRequestPtr(), InferenceEngine::Exception); } IE_SUPPRESS_DEPRECATED_START diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.hpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.hpp index 3ca276f6105a7d..abf911537fd076 100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.hpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/helpers/myriad_load_network_case.hpp @@ -11,7 +11,7 @@ #include using namespace InferenceEngine; -using ExeNetworkPtr = InferenceEngine::IExecutableNetwork::Ptr; +using ExeNetwork = InferenceEngine::ExecutableNetwork; //------------------------------------------------------------------------------ // class MyriadLoadNetworkTestCase diff --git a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_load_network_tests.cpp b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_load_network_tests.cpp index cedf09cce9b908..f59644d3ac283e 100644 --- a/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_load_network_tests.cpp +++ b/inference-engine/tests_deprecated/behavior/vpu/myriad_tests/vpu_load_network_tests.cpp @@ -22,7 +22,7 @@ TEST_F(MyriadLoadNetworkTestCase, smoke_SimpleLoading) { {KEY_DEVICE_ID, device_to_load}, }; - ASSERT_NO_THROW(ExeNetworkPtr exe_network = + ASSERT_NO_THROW(ExeNetwork exe_network = ie->LoadNetwork(cnnNetwork, "MYRIAD", config)); ASSERT_TRUE(!IsDeviceAvailable(device_to_load)); @@ -37,12 +37,12 @@ TEST_F(MyriadLoadNetworkTestCase, smoke_LoadingAtTheSameDevice) { {KEY_DEVICE_ID, device_to_load}, }; - ASSERT_NO_THROW(ExeNetworkPtr exe_network = + ASSERT_NO_THROW(ExeNetwork exe_network = ie->LoadNetwork(cnnNetwork, "MYRIAD", config)); ASSERT_TRUE(!IsDeviceAvailable(device_to_load)); - ASSERT_NO_THROW(ExeNetworkPtr exe_network = + ASSERT_NO_THROW(ExeNetwork exe_network = ie->LoadNetwork(cnnNetwork, "MYRIAD", config)); } @@ -52,7 +52,7 @@ TEST_F(MyriadLoadNetworkTestCase, smoke_ThrowsExeptionWhenNameIsInvalid) { {KEY_DEVICE_ID, device_to_load}, }; - ASSERT_ANY_THROW(ExeNetworkPtr exe_network = + ASSERT_ANY_THROW(ExeNetwork exe_network = ie->LoadNetwork(cnnNetwork, "MYRIAD", config)); } @@ -76,6 +76,6 @@ TEST_F(MyriadLoadNetworkTestCase, smoke_ThrowsExeptionWhenPlatformConflictWithPr {KEY_VPU_MYRIAD_PLATFORM, wrong_platform}, }; - ASSERT_ANY_THROW(ExeNetworkPtr exe_network = + ASSERT_ANY_THROW(ExeNetwork exe_network = ie->LoadNetwork(cnnNetwork, "MYRIAD", config)); } diff --git a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp index 5993baae55354a..bc25cfa969ae3d 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp +++ b/inference-engine/tests_deprecated/fluid_preproc/common/fluid_tests.cpp @@ -193,7 +193,7 @@ InferenceEngine::Blob::Ptr img2Blob(cv::Mat &img, InferenceEngine::Layout layout const size_t height = img.size().height; const size_t width = img.size().width; - CV_Assert(cv::DataType::depth == img.depth()); + CV_Assert(cv::DataType::depth == img.depth() || (PRC == Precision::FP16 && img.depth() == CV_16F)); SizeVector dims = {1, channels, height, width}; Blob::Ptr resultBlob = make_shared_blob(TensorDesc(PRC, dims, layout));; @@ -237,7 +237,8 @@ void Blob2Img(const InferenceEngine::Blob::Ptr& blobP, cv::Mat& img, InferenceEn const size_t height = img.size().height; const size_t width 
= img.size().width; - CV_Assert(cv::DataType::depth == img.depth()); + //IE and OpenCV use different data types for FP16 representation, so need to check for it explicitly + CV_Assert(cv::DataType::depth == img.depth() || ((img.depth() == CV_16F) && (PRC == Precision::FP16))); data_t* blobData = blobP->buffer().as(); @@ -438,11 +439,20 @@ TEST_P(SplitTestGAPI, AccuracyTest) cv::Size sz = std::get<2>(params); double tolerance = std::get<3>(params); - int srcType = CV_MAKE_TYPE(depth, planes); + auto make_src_type = [planes](int d){ + return CV_MAKE_TYPE(d, planes); + }; + int srcType = make_src_type(depth); int dstType = CV_MAKE_TYPE(depth, 1); cv::Mat in_mat(sz, srcType); - cv::randn(in_mat, cv::Scalar::all(127), cv::Scalar::all(40.f)); + bool const is_fp16 = (depth == CV_16F); + cv::Mat rnd_mat = is_fp16 ? cv::Mat(sz, make_src_type(CV_32F)) : in_mat; + cv::randn(rnd_mat, cv::Scalar::all(127), cv::Scalar::all(40.f)); + + if (is_fp16) { + rnd_mat.convertTo(in_mat, depth); + } std::vector out_mats_gapi(planes, cv::Mat::zeros(sz, dstType)); std::vector out_mats_ocv (planes, cv::Mat::zeros(sz, dstType)); @@ -520,12 +530,21 @@ TEST_P(MergeTestGAPI, AccuracyTest) cv::Size sz = std::get<2>(params); double tolerance = std::get<3>(params); - int srcType = CV_MAKE_TYPE(depth, 1); + auto make_src_type = [](int d){ + return CV_MAKE_TYPE(d, 1); + }; + int srcType = make_src_type(depth); int dstType = CV_MAKE_TYPE(depth, planes); std::vector in_mats(planes, cv::Mat(sz, srcType)); for (int p = 0; p < planes; p++) { - cv::randn(in_mats[p], cv::Scalar::all(127), cv::Scalar::all(40.f)); + bool const is_fp16 = (depth == CV_16F); + cv::Mat rnd_mat = is_fp16 ? cv::Mat(sz, make_src_type(CV_32F)) : in_mats[p]; + cv::randn(rnd_mat, cv::Scalar::all(127), cv::Scalar::all(40.f)); + + if (is_fp16) { + rnd_mat.convertTo(in_mats[p], depth); + } } cv::Mat out_mat_ocv = cv::Mat::zeros(sz, dstType); @@ -754,7 +773,8 @@ TEST_P(ColorConvertTestIE, AccuracyTest) cv::Scalar mean = cv::Scalar::all(127); cv::Scalar stddev = cv::Scalar::all(40.f); - cv::randn(in_mat1, mean, stddev); + if (depth != CV_16F) + cv::randn(in_mat1, mean, stddev); cv::Mat out_mat(size, out_type); cv::Mat out_mat_ocv(size, out_type); @@ -771,7 +791,7 @@ TEST_P(ColorConvertTestIE, AccuracyTest) size_t out_channels = out_mat.channels(); CV_Assert(3 == out_channels || 4 == out_channels); - CV_Assert(CV_8U == depth || CV_32F == depth); + CV_Assert(CV_8U == depth || CV_32F == depth || depth == CV_16S || depth == CV_16F); ASSERT_TRUE(in_mat1.isContinuous() && out_mat.isContinuous()); @@ -780,8 +800,21 @@ TEST_P(ColorConvertTestIE, AccuracyTest) InferenceEngine::SizeVector in_sv = { 1, in_channels, in_height, in_width }; InferenceEngine::SizeVector out_sv = { 1, out_channels, out_height, out_width }; + auto depth_to_precision = [](int depth) -> Precision::ePrecision { + switch (depth) + { + case CV_8U: return Precision::U8; + case CV_16S: return Precision::I16; + case CV_16F: return Precision::FP16; + case CV_32F: return Precision::FP32; + default: + throw std::logic_error("Unsupported configuration"); + } + return Precision::UNSPECIFIED; + }; + // HWC blob: channels are interleaved - Precision precision = CV_8U == depth ? 
Precision::U8 : Precision::FP32; + Precision precision = depth_to_precision(depth); Blob::Ptr in_blob, out_blob; switch (precision) @@ -796,6 +829,18 @@ TEST_P(ColorConvertTestIE, AccuracyTest) out_blob = img2Blob(out_mat, out_layout); break; + case Precision::I16: + in_blob = img2Blob(in_mat1, in_layout); + out_blob = img2Blob(out_mat, out_layout); + break; + + case Precision::FP16: + in_blob = img2Blob(in_mat1, in_layout); + out_blob = img2Blob(out_mat, out_layout); + + break; + + default: FAIL() << "Unsupported configuration"; } @@ -813,6 +858,8 @@ TEST_P(ColorConvertTestIE, AccuracyTest) { case Precision::U8: Blob2Img (out_blob, out_mat, out_layout); break; case Precision::FP32: Blob2Img(out_blob, out_mat, out_layout); break; + case Precision::I16: Blob2Img (out_blob, out_mat, out_layout); break; + case Precision::FP16: Blob2Img (out_blob, out_mat, out_layout); break; default: FAIL() << "Unsupported configuration"; } diff --git a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp index babaa43f1a8cbc..d71cb8b3c1e50e 100644 --- a/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp +++ b/inference-engine/tests_deprecated/fluid_preproc/cpu/fluid_tests_cpu.cpp @@ -132,7 +132,7 @@ INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI, INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestGAPI, Combine(Values(2, 3, 4), - Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F, CV_32S), + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_16F, CV_32F, CV_32S), Values(TEST_SIZES), Values(0))); @@ -144,7 +144,7 @@ INSTANTIATE_TEST_CASE_P(ChanToPlaneTestFluid, ChanToPlaneTestGAPI, INSTANTIATE_TEST_CASE_P(MergeTestFluid, MergeTestGAPI, Combine(Values(2, 3, 4), - Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F, CV_32S), + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_16F, CV_32F, CV_32S), Values(TEST_SIZES), Values(0))); @@ -269,7 +269,7 @@ INSTANTIATE_TEST_CASE_P(ColorConvertYUV420Fluid, ColorConvertYUV420TestIE, Values(0))); INSTANTIATE_TEST_CASE_P(Reorder_HWC2CHW, ColorConvertTestIE, - Combine(Values(CV_8U, CV_32F), + Combine(Values(CV_8U, CV_32F, CV_16S, CV_16F), Values(InferenceEngine::ColorFormat::BGR), Values(InferenceEngine::NHWC), Values(InferenceEngine::NCHW), @@ -277,7 +277,7 @@ INSTANTIATE_TEST_CASE_P(Reorder_HWC2CHW, ColorConvertTestIE, Values(0))); INSTANTIATE_TEST_CASE_P(Reorder_CHW2HWC, ColorConvertTestIE, - Combine(Values(CV_8U, CV_32F), + Combine(Values(CV_8U, CV_32F, CV_16S, CV_16F), Values(InferenceEngine::ColorFormat::BGR), Values(InferenceEngine::NCHW), Values(InferenceEngine::NHWC), diff --git a/inference-engine/tests_deprecated/functional/ie_tests/src/custom_matcher.cpp b/inference-engine/tests_deprecated/functional/ie_tests/src/custom_matcher.cpp index fce8d1c153a49e..b6b4a169302600 100644 --- a/inference-engine/tests_deprecated/functional/ie_tests/src/custom_matcher.cpp +++ b/inference-engine/tests_deprecated/functional/ie_tests/src/custom_matcher.cpp @@ -6,6 +6,7 @@ #include #include #include "custom_matcher.hpp" +#include "ie_iexecutable_network.hpp" using namespace InferenceEngine; @@ -103,7 +104,7 @@ void Regression::Matchers::CustomMatcher::matchCustom() { ASSERT_NO_FATAL_FAILURE(executableApi = createExecutableNetworkFromIR()); } - if (executableApi.operator IExecutableNetwork::Ptr &() != nullptr) { + if (executableApi) { for (int i=0; i != config._nrequests; i++ ) { inferRequests.push_back(executableApi.CreateInferRequest()); } @@ -116,7 +117,7 @@ void 
Regression::Matchers::CustomMatcher::matchCustom() { } auto make_unified_endpoints = [&] () { - if (executableApi.operator IExecutableNetwork::Ptr &() != nullptr) { + if (executableApi) { return std::make_pair(executableApi.GetInputsInfo(), executableApi.GetOutputsInfo()); } auto inputs2 = network.getInputsInfo(); diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp deleted file mode 100644 index 6d3769e1932342..00000000000000 --- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "test_graph.hpp" - -#include "single_layer_common.hpp" -#include "tests_common.hpp" - -#include -#include - -using namespace ::testing; -using namespace std; -using namespace mkldnn; - -struct crop_test_params { - InferenceEngine::SizeVector in; - std::vector axis; - std::vector offsets; - std::vector dims; - - size_t num_prim_desc; - - MKLDNNPlugin::impl_desc_type selectedType; - - std::vector> comp; -}; - - - -template -void ref_crop(InferenceEngine::TBlob &src, InferenceEngine::TBlob &dst, crop_test_params prm) { - data_t *dst_ptr = dst.data(); - - std::vector offsets(4, 0); - for (size_t i = 0; i < prm.offsets.size(); i++) { - offsets[prm.axis[i]] = prm.offsets[i]; - } - int OFFSET_N = offsets.at(0); - int OFFSET_C = offsets.at(1); - int OFFSET_H = offsets.at(2); - int OFFSET_W = offsets.at(3); - - auto dst_dims = dst.getTensorDesc().getDims(); - int dst_ndims = static_cast(dst_dims.size()); - const int ON = (dst_ndims > 0) ? dst_dims[0] : 1; - const int OC = (dst_ndims > 1) ? dst_dims[1] : 1; - const int OH = (dst_ndims > 2) ? dst_dims[2] : 1; - const int OW = (dst_ndims > 3) ? dst_dims[3] : 1; - - auto src_dims = src.getTensorDesc().getDims(); - int src_ndims = static_cast(src_dims.size()); - const int _IN = (src_ndims > 0) ? src_dims[0] : 1; - const int IC = (src_ndims > 1) ? src_dims[1] : 1; - const int IH = (src_ndims > 2) ? src_dims[2] : 1; - const int IW = (src_ndims > 3) ? 
src_dims[3] : 1; - - auto dst_off = [=](int n, int c, int h, int w) { - return (n * OW * OH * OC + c * OW * OH + h * OW + w); - }; - auto src_off = [=](int n, int c, int h, int w) { - return (n * IW * IH * IC + c * IW * IH + h * IW + w); - }; - - ASSERT_GE(_IN - OFFSET_N, ON); - ASSERT_GE(IC - OFFSET_C, OC); - ASSERT_GE(IH - OFFSET_H, OH); - ASSERT_GE(IW - OFFSET_W, OW); - - data_t* src_ptr = src.data(); - for (int n = 0; n < ON; ++n) { - for (int c = 0; c < OC; ++c) { - for (int h = 0; h < OH; ++h) { - for (int w = 0; w < OW; ++w) { - dst_ptr[dst_off(n, c, h, w)] = src_ptr[src_off(n + OFFSET_N, c + OFFSET_C, - h + OFFSET_H, w + OFFSET_W)]; - } - } - } - } -} - -class MKLDNNGraphCropTests: public TestsCommon, - public WithParamInterface { - std::string model_t = R"V0G0N( - - - - - - _IN_ - - - - - - - - _IN_ - - - - - _OUT_ - - - - - - - - -)V0G0N"; - -protected: - std::string getModel(crop_test_params p) { - std::string model = model_t; - std::string in_shape, out_shape; - - std::string axis, offset, dim; - InferenceEngine::SizeVector outDims = p.in; - for (size_t i = 0; i < p.offsets.size(); i++) { - if (!axis.empty()) - axis += ","; - axis += std::to_string(p.axis[i]); - if (!offset.empty()) - offset += ","; - offset += std::to_string(p.offsets[i]); - if (!dim.empty()) - dim += ","; - dim += std::to_string(p.dims[i]); - outDims[p.axis[i]] = p.dims[i]; - } - - for (size_t i = 0; i < p.in.size(); i++) { - in_shape += ""; - in_shape += std::to_string(p.in[i]) + "\n"; - } - REPLACE_WITH_STR(model, "_IN_", in_shape); - - for (size_t i = 0; i < outDims.size(); i++) { - out_shape += ""; - out_shape += std::to_string(outDims[i]) + "\n"; - } - REPLACE_WITH_STR(model, "_OUT_", out_shape); - - REPLACE_WITH_STR(model, "_AXC_", axis); - REPLACE_WITH_STR(model, "_OFC_", offset); - REPLACE_WITH_STR(model, "_DIMC_", dim); - return model; - } - - virtual void TearDown() { - } - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - crop_test_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - - InferenceEngine::Core core; - InferenceEngine::CNNNetwork network; - ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr())); - - MKLDNNGraphTestClass graph; - graph.CreateGraph(network); - - auto& nodes = graph.getNodes(); - for (int i = 0; i < nodes.size(); i++) { - if (nodes[i]->getType() == MKLDNNPlugin::Crop) { - ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size()); - for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) { - p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j)); - } - ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor()); - ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType()); - } - } - - InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob({InferenceEngine::Precision::FP32, p.in, InferenceEngine::TensorDesc::getLayoutByDims(p.in) }); - src->allocate(); - fill_data(src->buffer(), src->size()); - - InferenceEngine::TBlob* srcPtr = dynamic_cast*>(src.get()); - - if (srcPtr == nullptr) - FAIL() << "Cannot cast blob to TBlob."; - - InferenceEngine::BlobMap srcs; - srcs.insert(std::pair("in1", src)); - - InferenceEngine::OutputsDataMap out; - out = network.getOutputsInfo(); - InferenceEngine::BlobMap outputBlobs; - - std::pair item = *out.begin(); - - InferenceEngine::TBlob::Ptr output; - output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); - output->allocate(); - outputBlobs[item.first] = 
output; - - graph.Infer(srcs, outputBlobs); - - InferenceEngine::TBlob dst_ref(item.second->getTensorDesc()); - dst_ref.allocate(); - - ref_crop(*srcPtr, dst_ref, p); - - compare(*output, dst_ref); - } catch (const InferenceEngine::Exception &e) { - FAIL() << e.what(); - } - } -}; - -TEST_P(MKLDNNGraphCropTests, TestCrop) {} - - -INSTANTIATE_TEST_CASE_P( - TestCrop, MKLDNNGraphCropTests, - ::testing::Values( - crop_test_params{{1, 5, 32, 32}, {1, 2, 3}, {2, 5, 4}, {2, 23, 23}, 1, MKLDNNPlugin::impl_desc_type::unknown, { - [](MKLDNNPlugin::PrimitiveDescInfo impl) { - ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType()); - ASSERT_EQ(1, impl.getConfig().inConfs.size()); - ASSERT_EQ(1, impl.getConfig().outConfs.size()); - ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout()); - ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); - }}}, - crop_test_params{{3, 8, 32, 32}, {0, 1, 2, 3}, {1, 0, 20, 20}, {2, 8, 5, 5}, 2, MKLDNNPlugin::impl_desc_type::unknown, { - [](MKLDNNPlugin::PrimitiveDescInfo impl) { - ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType()); - ASSERT_EQ(1, impl.getConfig().inConfs.size()); - ASSERT_EQ(1, impl.getConfig().outConfs.size()); - ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout()); - ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); - }} }, - crop_test_params{{1, 5, 32, 32}, {3}, {10}, {20}, 1, MKLDNNPlugin::impl_desc_type::unknown }, - crop_test_params{{1, 5, 32, 20}, {2, 3}, {30, 10}, {2, 10}, 1, MKLDNNPlugin::impl_desc_type::unknown }, - crop_test_params{ { 32, 32 },{ 1 },{ 10 },{ 20 }, 1, MKLDNNPlugin::impl_desc_type::unknown }, - crop_test_params{ { 32, 20 },{ 0, 1 },{ 30, 10 },{ 2, 10 }, 1, MKLDNNPlugin::impl_desc_type::unknown })); - -class MKLDNNGraphDynBatchCropTests: public MKLDNNGraphCropTests { -protected: - - virtual void SetUp() { - try { - TestsCommon::SetUp(); - crop_test_params p = ::testing::WithParamInterface::GetParam(); - std::string model = getModel(p); - size_t MB = p.in[0]; - if (MB < 2) - MB = 2; - - InferenceEngine::Core core; - InferenceEngine::CNNNetwork network; - ASSERT_NO_THROW(network = core.ReadNetwork(model, InferenceEngine::Blob::CPtr())); - network.setBatchSize(MB); - - MKLDNNGraphTestClass graph; - graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}}); - graph.CreateGraph(network); - - InferenceEngine::SizeVector dims_src = p.in; - dims_src[0] = MB; - InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob({InferenceEngine::Precision::FP32, dims_src, InferenceEngine::TensorDesc::getLayoutByDims(dims_src) }); - InferenceEngine::TBlob* srcPtr = dynamic_cast*>(src.get()); - if (srcPtr == nullptr) - FAIL() << "Cannot cast blob to TBlob."; - - src->allocate(); - fill_data(src->buffer(), src->size()); - - InferenceEngine::BlobMap srcs; - srcs.insert(std::pair("in1", src)); - - InferenceEngine::OutputsDataMap out; - out = network.getOutputsInfo(); - InferenceEngine::BlobMap outputBlobs; - - std::pair item = *out.begin(); - - InferenceEngine::TBlob::Ptr output; - output = InferenceEngine::make_shared_blob(item.second->getTensorDesc()); - output->allocate(); - outputBlobs[item.first] = output; - - auto checkCrop = [](const MKLDNNPlugin::MKLDNNNodePtr& node) { - return node->getType() == MKLDNNPlugin::Crop; - }; - - graph.checkDynBatch(srcs, outputBlobs, MB, 
MB, checkCrop); - graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkCrop); - } catch (const InferenceEngine::Exception &e) { - FAIL() << e.what(); - } - } -}; - -TEST_P(MKLDNNGraphDynBatchCropTests, TestsDynBatchCrop) {} - -INSTANTIATE_TEST_CASE_P( - TestsDynBatchCrop, MKLDNNGraphDynBatchCropTests, - ::testing::Values( - crop_test_params{{1, 5, 32, 32}, {1, 2, 3}, {2, 5, 4}, {2, 23, 23}, 1, MKLDNNPlugin::impl_desc_type::unknown, { - [](MKLDNNPlugin::PrimitiveDescInfo impl) { - ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType()); - ASSERT_EQ(1, impl.getConfig().inConfs.size()); - ASSERT_EQ(1, impl.getConfig().outConfs.size()); - ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout()); - ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout()); - }}}, - crop_test_params{{1, 5, 32, 32}, {3}, {10}, {20}, 1, MKLDNNPlugin::impl_desc_type::unknown }, - crop_test_params{{1, 5, 32, 20}, {2, 3}, {30, 10}, {2, 10}, 1, MKLDNNPlugin::impl_desc_type::unknown })); diff --git a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp index e75902ff09ffb7..a6a3b737cde53b 100644 --- a/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp +++ b/inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp @@ -24,10 +24,8 @@ struct TestExecutableNetworkBase : public InferenceEngine::ExecutableNetworkBase using InferenceEngine::ExecutableNetworkBase::_impl; }; -static MKLDNNPlugin::MKLDNNGraph& getGraph(InferenceEngine::IExecutableNetwork::Ptr execNetwork) { - return reinterpret_cast( - reinterpret_cast( - execNetwork.get())->_impl.get())->getGraph(); +static MKLDNNPlugin::MKLDNNGraph& getGraph(InferenceEngine::IExecutableNetworkInternal::Ptr execNetwork) { + return static_cast(execNetwork.get())->getGraph(); } class MKLDNNGraphLeaksTests: public ::testing::Test { @@ -253,7 +251,7 @@ TEST_F(MKLDNNGraphLeaksTests, MKLDNN_not_release_outputs_fp32) { ASSERT_NE(1, network.getOutputsInfo().size()); std::shared_ptr score_engine(new MKLDNNPlugin::Engine()); - InferenceEngine::ExecutableNetwork exeNetwork1; + InferenceEngine::IExecutableNetworkInternal::Ptr exeNetwork1; ASSERT_NO_THROW(exeNetwork1 = score_engine->LoadNetwork(network, {})); size_t modified_outputs_size = getGraph(exeNetwork1).GetOutputNodes().size(); @@ -262,7 +260,7 @@ TEST_F(MKLDNNGraphLeaksTests, MKLDNN_not_release_outputs_fp32) { ASSERT_NO_THROW(network2 = core.ReadNetwork(model, weights_ptr)); ASSERT_EQ(1, network2.getOutputsInfo().size()); - InferenceEngine::ExecutableNetwork exeNetwork2; + InferenceEngine::IExecutableNetworkInternal::Ptr exeNetwork2; ASSERT_NO_THROW(exeNetwork2 = score_engine->LoadNetwork(network2, {})); size_t original_outputs_size = getGraph(exeNetwork2).GetOutputNodes().size(); diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt index 76965d11e37dd1..f028ebdd8ab5d2 100644 --- a/inference-engine/thirdparty/CMakeLists.txt +++ b/inference-engine/thirdparty/CMakeLists.txt @@ -39,7 +39,7 @@ if (ENABLE_CLDNN) else() set(CLDNN__ARCHITECTURE_TARGET "Linux64" CACHE STRING "" FORCE) endif() - + set(CLDNN_THREADING "${THREADING}" CACHE STRING "" FORCE) add_subdirectory(clDNN) # disable CLDNN docs build diff --git a/inference-engine/thirdparty/clDNN/CMakeLists.txt 
b/inference-engine/thirdparty/clDNN/CMakeLists.txt index 1378acb4b99e71..c9cc74e38f7b82 100644 --- a/inference-engine/thirdparty/clDNN/CMakeLists.txt +++ b/inference-engine/thirdparty/clDNN/CMakeLists.txt @@ -59,6 +59,14 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON) # ====================================================================================================== # ====================================== HELPER CONSTANT VARIABLES ===================================== # ====================================================================================================== +# ====================================================================================================== +if("${CLDNN_THREADING}" MATCHES "SEQ") + add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_SEQ) +elseif("${CLDNN_THREADING}" MATCHES "TBB") + add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_TBB) +else() + add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_THREADPOOL) +endif() # Path which points to main directory of project. set(CLDNN__MAIN_DIR "${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/inference-engine/thirdparty/clDNN/api/cldnn.hpp b/inference-engine/thirdparty/clDNN/api/cldnn.hpp index 96460b7f4386c8..4b531ab050bb45 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn.hpp +++ b/inference-engine/thirdparty/clDNN/api/cldnn.hpp @@ -123,6 +123,7 @@ #include #include #include +#include namespace cldnn { diff --git a/inference-engine/thirdparty/clDNN/api/engine.hpp b/inference-engine/thirdparty/clDNN/api/engine.hpp index 0da27f2d0835fd..c65d89df60196d 100644 --- a/inference-engine/thirdparty/clDNN/api/engine.hpp +++ b/inference-engine/thirdparty/clDNN/api/engine.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace cldnn { @@ -61,6 +62,7 @@ struct engine_configuration { ///< (switched off for older drivers then NEO). uint16_t n_streams; ///< Number of queues executed in parallel const std::string kernels_cache_path; ///< Path to compiled kernels cache + uint16_t n_threads; ///< Number of threads const std::string tuning_cache_path; ///< Path to tuning kernel cache /// @brief Constructs engine configuration with specified options. @@ -83,6 +85,7 @@ struct engine_configuration { bool memory_pool = true, uint16_t n_streams = 1, const std::string& kernels_cache_path = "", + uint16_t n_threads = std::max(static_cast(std::thread::hardware_concurrency()), static_cast(1)), const std::string& tuning_cache_path = "cache.json") : enable_profiling(profiling) , meaningful_kernels_names(decorate_kernel_names) @@ -97,6 +100,7 @@ struct engine_configuration { , enable_memory_pool(memory_pool) , n_streams(n_streams) , kernels_cache_path(kernels_cache_path) + , n_threads(n_threads) , tuning_cache_path(tuning_cache_path) { if (n_streams == 0) { throw std::invalid_argument("Invalid streams count set in engine config"); diff --git a/inference-engine/thirdparty/clDNN/api/gather_nd.hpp b/inference-engine/thirdparty/clDNN/api/gather_nd.hpp new file mode 100644 index 00000000000000..ebf3953cf7fd65 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/api/gather_nd.hpp @@ -0,0 +1,57 @@ +/* +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "primitive.hpp" + +namespace cldnn { +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief +/// @details +struct gather_nd : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(gather_nd) + + /// @brief Constructs gather_nd primitive. + /// @param id This primitive id. + /// @param data Input data primitive id. + /// @param indices Input indexes primitive id. + /// @param indices_rank Rank of indices. + /// @param batch_dims batch_dims as an attribute of GatherND. Optional. + gather_nd(const primitive_id& id, + const primitive_id& data, + const primitive_id& indices, + const uint8_t indices_rank, + const uint8_t batch_dims = 0, + const padding& output_padding = padding()) + : primitive_base(id, {data, indices}, output_padding), indices_rank(indices_rank), batch_dims(batch_dims) {} + + /// @brief GatherND indices_rank + uint8_t indices_rank; + + /// @brief GatherND batch_dims + uint8_t batch_dims; +}; +/// @} +/// @} +/// @} +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h index 0c07f352f5120d..abea5a1117df07 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h @@ -47,6 +47,7 @@ enum class KernelType { CONTRACT, ONE_HOT, GATHER, + GATHER_ND, SCATTER_UPDATE, SCATTER_ND_UPDATE, SCATTER_ELEMENTS_UPDATE, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp index 99fcee558c8646..cef13342302795 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp @@ -81,7 +81,7 @@ KernelsData BatchToSpaceKernelBase::GetCommonKernelsData(const Params& params, c auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/deformable_convolution_kernel_bfyx_interp.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/deformable_convolution_kernel_bfyx_interp.cpp index 776b9a7f8e5e21..e337b9b68e7d22 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/deformable_convolution_kernel_bfyx_interp.cpp +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/deformable_convolution_kernel_bfyx_interp.cpp @@ -83,7 +83,7 @@ KernelsData DeformableConvolutionKernel_bfyx_interp::GetKernelsData(const Params CommonDispatchData dispatchData = SetDefault(newParams); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp index 7461c206a890dd..0fc8b0cbb1da5f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp @@ -92,7 +92,7 @@ KernelsData CumSumKernelBase::GetCommonKernelsData(const Params& params, auto dispatchData = SetDefault(newParams); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams, dispatchData); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_partial_sum.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_partial_sum.cpp index 319c9bb0224f65..085ce54e7a0944 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_partial_sum.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_partial_sum.cpp @@ -53,7 +53,7 @@ KernelsData CumSumKernelPartialSum::GetMultiStageKernelsData(const Params& param // Final auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams, dispatchData.stage_final); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[1]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp index 5b7d2f308feb06..75c686d4604db4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp @@ -63,7 +63,7 @@ KernelsData DepthToSpaceKernelBase::GetCommonKernelsData(const Params& params, c auto dispatchData = SetDefault(newParams); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp index e3150b22f77dd9..6c5f8f3ec8f428 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp @@ -32,6 +32,13 @@ ParamsKey EltwiseKernel_b_fs_yx_fsv16::GetSupportedKey() const { } static inline size_t GetBlockSize(const eltwise_params& params) { + // Set blocksize 1 when broadcasting X dim + for (size_t i = 0; i < params.inputs.size(); i++) { + if (params.inputs[i].X().v == 1 && params.inputs[i].LogicalSize() != 1) { + return 1; + } + } + size_t optimal_bs_values[] = {8, 4, 2, 1}; for (auto bs : optimal_bs_values) { @@ -43,6 +50,23 @@ static inline size_t GetBlockSize(const eltwise_params& params) { return 1; } +static inline bool OpHasFeatureBroadcast(const eltwise_params& params, const size_t op_num) { + const auto &ew = params.operations[op_num]; + + for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { + const auto &input = ew.inputs[input_idx]; + if (input.mode == EltwiseInputMode::INPUT_BUFFER) { + if (params.inputs[input_idx].LogicalSize() != 1 + && params.inputs[input_idx].Feature().v == 1 + && params.output.Feature().v != 1) { + return true; + } + } + } + + return false; +} + JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_params& params, bool /*useVload8*/) const { JitConstants jit = {}; std::string vload_decls; @@ -52,13 +76,13 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_par for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { const auto &input = ew.inputs[input_idx]; const std::string name = "INPUT_" + op_num_str + "_" + std::to_string(input_idx); - std::string idx_order = "INPUT" + std::to_string(input.index) + "_IDX_ORDER"; switch (input.mode) { case EltwiseInputMode::SCALAR: jit.AddConstant(MakeJitConstant(name, input.scalar)); break; case EltwiseInputMode::INPUT_BUFFER: + { if (params.inputs[input.index].LogicalSize() == params.output.Feature().v && params.inputs[input.index].LogicalSize() == params.inputs[input.index].Feature().v) { jit.AddConstant(MakeJitConstant(name, @@ -69,14 +93,37 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_par "input" + std::to_string(input.index) + "[0]")); } else { - std::string block_read_str = "BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, " + - "BLOCK_SIZE, " + - "input" + std::to_string(input.index) + ", " + - "INPUT" + std::to_string(input.index) + "_GET_INDEX(b, f_block*16, y, x))"; - jit.AddConstant(MakeJitConstant(name, - "TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, BLOCK_SIZE), " + block_read_str + ")")); + const std::string idx_order = "INPUT" + std::to_string(input.index) + "_IDX_ORDER"; + jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x")); + + bool feature_broadcasting = (params.inputs[input_idx].Feature().v == 1 && params.output.Feature().v != 1); + + const std::string block_read_str = "TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, BLOCK_SIZE), BLOCK_READN(INPUT" + + std::to_string(input.index) + "_TYPE, BLOCK_SIZE, " + + "input" + std::to_string(input.index) + ", " + + "GET_INDEX(INPUT, " + std::to_string(input.index) + ", " + idx_order + ")))"; + if (feature_broadcasting) { + const std::string broadcast_name = "DO_FEATURE_BROADCAST" + std::to_string(op_num); + std::string 
sub_group_broadcast; + if (GetBlockSize(params) == 1) { + sub_group_broadcast = "\\\n\ttmp_b" + std::to_string(op_num) + + " = sub_group_broadcast(tmp_b" + std::to_string(op_num) + ", 0);"; + } else { + sub_group_broadcast = "\\\n\tunroll_for (uint i = 0; i < BLOCK_SIZE; ++i) tmp_b" + std::to_string(op_num) + + "[i] = sub_group_broadcast(tmp_b" + std::to_string(op_num) + "[i], 0);"; + } + + std::string broadcast_value = "\\\n\tMAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, BLOCK_SIZE) tmp_b" + std::to_string(op_num) + + " = " + block_read_str + ";" + sub_group_broadcast; + + jit.AddConstant(MakeJitConstant(broadcast_name, broadcast_value)); + jit.AddConstant(MakeJitConstant(name, "tmp_b" + std::to_string(op_num))); + } else { + jit.AddConstant(MakeJitConstant(name, block_read_str)); + } } break; + } case EltwiseInputMode::OUTPUT_BUFFER: jit.AddConstant(MakeJitConstant(name, "output[off]")); break; @@ -107,13 +154,15 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params& jit.AddConstant(MakeJitConstant("BLOCKS_COUNT", CeilDiv(params.output.X().v, blockSize))); jit.Merge(MakeInputDeclsJitConstants(params, useVload8)); - jit.Merge(MakeIndexJitConstants(params, useVload8)); jit.Merge(MakeLoadJitConstants(params, useVload8)); jit.Merge(GetOperationsJitConstants(params, useVload8, blockSize)); std::string do_eltwise; auto& operations = params.operations; for (size_t op_num = 0; op_num < operations.size(); op_num++) { + if (OpHasFeatureBroadcast(params, op_num)) { + do_eltwise += "\\\n\tDO_FEATURE_BROADCAST" + std::to_string(op_num) + ";"; + } do_eltwise += "\\\n\tOPERATION" + std::to_string(op_num) + ";"; } @@ -144,6 +193,8 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params& jit.Merge(MakeFusedOpsJitConstants(params, {conf})); } + jit.AddConstant(MakeJitConstant("ELTWISE_BROADCAST", params.broadcast)); + return jit; } @@ -155,17 +206,11 @@ bool EltwiseKernel_b_fs_yx_fsv16::Validate(const Params& params, const optional_ const auto& ewParams = static_cast(params); const auto& output = ewParams.output; - const auto count = output.PhysicalSize(); - - if (count % 8 != 0) - return false; for (size_t i = 0; i < ewParams.inputs.size(); i++) { - // Allow the same input sizes OR per-channel operation - if ((ewParams.inputs[i].LogicalSize() != output.LogicalSize()) && - (ewParams.inputs[i].LogicalSize() != output.Feature().v || ewParams.inputs[i].Feature().v != output.Feature().v) && - (ewParams.inputs[i].LogicalSize() != 1)) + if (ewParams.inputs[i].GetLayout() != DataLayout::b_fs_yx_fsv16 && GetBlockSize(ewParams) != 1) { return false; + } } auto input0 = ewParams.inputs[0]; @@ -239,7 +284,7 @@ KernelsData EltwiseKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, co auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); DispatchData dispatchData = SetDefault(newParams); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp index ed333d47fc49b3..abf22356be801d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp @@ -587,7 +587,7 @@ 
KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); DispatchData dispatchData = SetDefault(newParams); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_b_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_b_yx_fsv32.cpp index c38010987cbee3..4e8657a06a1c03 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_b_yx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_b_yx_fsv32.cpp @@ -73,7 +73,7 @@ KernelsData EltwiseKernel_fs_b_yx_fsv32::GetKernelsData(const Params& params, co KernelData kd = KernelData::Default(params); eltwise_params& newParams = *static_cast(kd.params.get()); - std::string jit; + std::pair jit; auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.cpp index 6536cd3061399e..288453139ccd8b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.cpp @@ -70,7 +70,7 @@ KernelsData EltwiseKernel_mixed_byxf_and_fs_b_yx_fsv32::GetKernelsData(const Par KernelData kd = KernelData::Default(params); eltwise_params& newParams = *static_cast(kd.params.get()); - std::string jit; + std::pair jit; auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp index 295ac25fd83f94..8b9925bbf190a0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_vload8.cpp @@ -88,7 +88,7 @@ KernelsData EltwiseKernel_vload8::GetKernelsData(const Params& params, const opt KernelData kd = KernelData::Default(params); eltwise_params& newParams = *static_cast(kd.params.get()); - std::string jit; + std::pair jit; auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp index 4328d436d21982..1d80c4af4d1db0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp @@ -52,7 +52,7 @@ KernelsData EmbeddingBagKernelRef::GetKernelsData(const Params& params, const op auto dispatchData = 
SetDefault(newParams); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp index bb7138c701a91e..2a902dd0f8773a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp @@ -87,7 +87,7 @@ KernelsData FullyConnectedKernelBase::GetCommonKernelsData(const Params ¶ms, const DispatchData dispatchData = SetDefault(newParams, autoTuneIndex); auto cldnn_jit = GetJitConstants(newParams, dispatchData); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); uint32_t fused_deps_total = 0; for (auto& fused_dep : newParams.fused_ops) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp index da46595290925d..f5d113a75f1b27 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp @@ -198,7 +198,7 @@ KernelsData GatherKernelRef::GetKernelsData(const Params& params, const optional auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_ref.cpp new file mode 100644 index 00000000000000..3da6f44337bd3c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_ref.cpp @@ -0,0 +1,210 @@ +/* +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "gather_nd_kernel_ref.h" +#include "kernel_selector_utils.h" +#include +#include + +namespace kernel_selector { + +ParamsKey GatherNDKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableInputLayout(DataLayout::bfzyx); + k.EnableOutputLayout(DataLayout::bfzyx); + k.EnableInputLayout(DataLayout::bfwzyx); + k.EnableOutputLayout(DataLayout::bfwzyx); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDifferentTypes(); + return k; +} + +static inline std::string GetOrderString(std::vector& order) { + std::string order_str = order[0]; + for (size_t i = 1; i < order.size(); i++) + order_str += ", " + order[i]; + + return order_str; +} + +static inline std::vector GetDefaultOrder(size_t size) { + std::vector default_order; + if (size <= 4) { + default_order = { "b", "f", "y", "x" }; + } else if (size == 5) { + default_order = { "b", "f", "z", "y", "x" }; + } else if (size == 6) { + default_order = { "b", "f", "w", "z", "y", "x" }; + } + + return default_order; +} + +CommonDispatchData GatherNDKernelRef::SetDefault(const gather_nd_params& params, const optional_params&) const { + CommonDispatchData dispatchData; + + auto indices_dims = params.inputs[1].LogicalDims(); + + if (indices_dims.size() > 1) { + std::reverse(indices_dims.begin(), indices_dims.end()); + } + + indices_dims[params.indices_rank - 1] = 1; // set last dim of indices to 1 + + switch (params.inputs[1].GetLayout()) { + case DataLayout::bfyx: + dispatchData.gws = { indices_dims[3], indices_dims[2], indices_dims[1] * indices_dims[0] }; + break; + + case DataLayout::bfzyx: + dispatchData.gws = { indices_dims[4] * indices_dims[3], indices_dims[2], indices_dims[1] * indices_dims[0] }; + break; + + case DataLayout::bfwzyx: + dispatchData.gws = { indices_dims[5] * indices_dims[4], indices_dims[3] * indices_dims[2], indices_dims[1] * indices_dims[0] }; + break; + + default: + throw std::invalid_argument("Unsupported data layout for scatter elements update primitive"); + break; + } + + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + + return dispatchData; +} + +static size_t GetIndicesLastDim(const gather_nd_params& params) { + // get indices dims + auto indices_dims = params.inputs[1].LogicalDims(); + + if (indices_dims.size() > 1) { + std::reverse(indices_dims.begin(), indices_dims.end()); + } + + auto indices_last_dim = indices_dims[params.indices_rank - 1]; + + return indices_last_dim; +} + +static size_t GetSliceSize(const gather_nd_params& params) { + // get input dims + auto input_dims = params.inputs[0].LogicalDims(); + + if (input_dims.size() > 1) { + std::reverse(input_dims.begin(), input_dims.end()); + } + + // get last dim of indices + auto indices_last_dim = GetIndicesLastDim(params); + + // calculate slize size which is used in kernel to copy + size_t wi_slice_size = 1; + for (size_t i = params.batch_dims + indices_last_dim; i < input_dims.size(); i++) { + wi_slice_size *= input_dims[i]; + } + + return wi_slice_size; +} + +JitConstants GatherNDKernelRef::GetJitConstants(const gather_nd_params& params) const { + JitConstants 
jit = MakeBaseParamsJitConstants(params); + + jit.AddConstant(MakeJitConstant("INDICES_RANK", params.indices_rank)); + jit.AddConstant(MakeJitConstant("BATCH_DIMS", params.batch_dims)); + jit.AddConstant(MakeJitConstant("WI_SLICE_SIZE", GetSliceSize(params))); + jit.AddConstant(MakeJitConstant("INDICES_LAST_DIM", GetIndicesLastDim(params))); + + if (!params.fused_ops.empty()) { + FusedOpsConfiguration conf = { "", GetDefaultOrder(params.output.GetDims().size()), "val", params.inputs[0].GetDType() }; + jit.Merge(MakeFusedOpsJitConstants(params, { conf })); + } + + return jit; +} + +bool GatherNDKernelRef::Validate(const Params& p, const optional_params& o) const { + if (p.GetType() != KernelType:: GATHER_ND || o.GetType() != KernelType::GATHER_ND) { + return false; + } + + const gather_nd_params& params = static_cast(p); + auto input_dims = params.inputs[0].LogicalDims(); + auto indices_dims = params.inputs[1].LogicalDims(); + auto indices_rank = params.indices_rank; + auto batch_dims = params.batch_dims; + + std::reverse(input_dims.begin(), input_dims.end()); + std::reverse(indices_dims.begin(), indices_dims.end()); + + if (indices_rank < 1) { + return false; + } + + if (batch_dims + indices_dims[indices_rank - 1] > input_dims.size()) { + return false; + } + + if (batch_dims >= std::min(input_dims.size(), static_cast(indices_rank))) { + return false; + } + + for (uint8_t i = 0; i < batch_dims; i++) { + if (input_dims[i] != indices_dims[i]) { + return false; + } + } + + for (auto& fused_op : params.fused_ops) { + if (!IsFusedPrimitiveSupported(fused_op)) + return false; + } + + return true; +} + +KernelsData GatherNDKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + if (!Validate(params, options)) { + return {}; + } + + KernelData kd = KernelData::Default(params); + gather_nd_params& newParams = *static_cast(kd.params.get()); + + auto dispatchData = SetDefault(newParams, options); + auto cldnn_jit = GetJitConstants(newParams); + + auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2, GetFusedPrimitiveInputsCount(params)); + + return { kd }; +} + +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_ref.h new file mode 100644 index 00000000000000..82fa0913413f34 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_ref.h @@ -0,0 +1,60 @@ +/* +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#pragma once + +#include "kernel_base_opencl.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// gather_nd_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct gather_nd_params : public base_params { + gather_nd_params() : base_params(KernelType::GATHER_ND), indices_rank(0), batch_dims(0) {} + + uint8_t indices_rank; + + uint8_t batch_dims; + + virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// gather_nd_optional_params +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct gather_nd_optional_params : optional_params { + gather_nd_optional_params() : optional_params(KernelType::GATHER_ND) {} +}; + +class GatherNDKernelRef : public KernelBaseOpenCL { +public: + GatherNDKernelRef() : KernelBaseOpenCL("gather_nd_ref") {} + virtual ~GatherNDKernelRef() {} + virtual JitConstants GetJitConstants(const gather_nd_params& params) const; + virtual CommonDispatchData SetDefault(const gather_nd_params& params, const optional_params&) const; + KernelsData GetKernelsData(const Params& params, const optional_params& options) const; + ParamsKey GetSupportedKey() const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION, + FusedOpType::ELTWISE }; + } + +protected: + bool Validate(const Params& p, const optional_params& o) const override; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_selector.cpp new file mode 100644 index 00000000000000..82c938d08c9871 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_selector.cpp @@ -0,0 +1,27 @@ +/* +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +#include "gather_nd_kernel_selector.h" +#include "gather_nd_kernel_ref.h" + +namespace kernel_selector { + +gather_nd_kernel_selector::gather_nd_kernel_selector() { Attach(); } + +KernelsData gather_nd_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::GATHER_ND); +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_selector.h new file mode 100644 index 00000000000000..e4d2ca1edb4149 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_nd_kernel_selector.h @@ -0,0 +1,35 @@ +/* +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +*/ + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class gather_nd_kernel_selector : public kernel_selector_base { +public: + static gather_nd_kernel_selector& Instance() { + static gather_nd_kernel_selector instance_; + return instance_; + } + + gather_nd_kernel_selector(); + + virtual ~gather_nd_kernel_selector() {} + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_base.cpp index 7c2ad325ded808..bda85cfc41554a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_base.cpp @@ -37,7 +37,7 @@ KernelsData PermuteKernelBase::GetKernelsData(const Params& params, const option auto cldnn_jit = GetJitConstants(newParams, dispatchData); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + std::pair jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 1, GetFusedPrimitiveInputsCount(params)); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp index 77b90e587142a0..457e90ba97e0fa 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp @@ -64,7 +64,7 @@ KernelsData QuantizeKernelBase::GetKernelsData(const Params& params, const optio auto 
dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams, dispatchData); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp index 0255ceb190c27f..cf227d83504e4d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp @@ -200,7 +200,7 @@ KernelsData ReorderKernelBase::GetCommonKernelsData(const reorder_weights_params auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; @@ -224,7 +224,7 @@ KernelsData ReorderKernelBase::GetCommonKernelsData(const reorder_params& params auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp index 75c876e11bd2a2..97fee3f405c1c8 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp @@ -175,6 +175,22 @@ JitConstants ResampleKernelBase::GetJitConstants(const resample_params& params) MakeJitConstant("CUBE_COEFF", params.cube_coeff), }); + if (params.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) { + if (axesUsed[0] == 1) jit.AddConstant(MakeJitConstant("AXES_USED_B", 1)); + if (axesUsed[1] == 1) jit.AddConstant(MakeJitConstant("AXES_USED_F", 1)); + if (axesUsed[2] == 1) jit.AddConstant(MakeJitConstant("AXES_USED_Z", 1)); + if (axesUsed[3] == 1) jit.AddConstant(MakeJitConstant("AXES_USED_Y", 1)); + if (axesUsed[4] == 1) jit.AddConstant(MakeJitConstant("AXES_USED_X", 1)); + + jit.AddConstants({ + MakeJitConstant("PADDED_B", b_size_padded), + MakeJitConstant("PADDED_F", f_size_padded), + MakeJitConstant("PADDED_X", x_size_padded), + MakeJitConstant("PADDED_Y", y_size_padded), + MakeJitConstant("PADDED_Z", z_size_padded), + }); + } + size_t feature_block_size = GetFeatureBlockSize(params); if (params.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) { @@ -207,7 +223,7 @@ KernelsData ResampleKernelBase::GetCommonKernelsData(const Params& params, const auto dispatchData = SetDefault(newParams); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, 
entry_point, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp index 9efc0fcc2b3db8..7773696b3141a7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp @@ -37,6 +37,7 @@ ParamsKey ResampleKernelOpt::GetSupportedKey() const { k.EnableReampleType(ResampleType::BILINEAR_INTERP); k.EnableReampleType(ResampleType::NEAREST_NEIGHBOR); k.EnableReampleType(ResampleType::LINEAR_ONNX); + k.EnableReampleType(ResampleType::CAFFE_BILINEAR_INTERP); k.EnableSubGroup(); k.EnableSubGroupShort(); return k; @@ -46,13 +47,21 @@ ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_sele DispatchData dispatchData; const auto& out = arg.output; - dispatchData.gws[0] = CeilDiv(out.X().v, GetOptimalBlockSize(arg)) * out.Y().v; - dispatchData.gws[1] = Align(out.Feature().v, sub_group_size); - dispatchData.gws[2] = arg.output.Batch().v; + if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) { + dispatchData.gws[0] = out.X().v * out.Y().v; + dispatchData.gws[1] = CeilDiv(out.Feature().v, GetFeatureBlockSize(arg)); + dispatchData.gws[2] = arg.output.Batch().v; - dispatchData.lws[0] = 1; - dispatchData.lws[1] = sub_group_size; - dispatchData.lws[2] = 1; + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo); + } else { + dispatchData.gws[0] = CeilDiv(out.X().v, GetOptimalBlockSize(arg)) * out.Y().v; + dispatchData.gws[1] = Align(out.Feature().v, sub_group_size); + dispatchData.gws[2] = arg.output.Batch().v; + + dispatchData.lws[0] = 1; + dispatchData.lws[1] = sub_group_size; + dispatchData.lws[2] = 1; + } return dispatchData; } @@ -98,10 +107,26 @@ JitConstants ResampleKernelOpt::GetJitConstants(const resample_params ¶ms) c jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size)); if (!params.fused_ops.empty()) { - std::vector idx_order = {"b", "feature_block", "y", "(x + out_x)"}; - FusedOpsConfiguration conf = {"", idx_order, "res", GetAccumulatorType(params), vec_size, LoadType::LT_ALIGNED_READ}; - conf.SetVectorAxis(Tensor::DataChannelName::FEATURE); - jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + if (params.resampleType != ResampleType::CAFFE_BILINEAR_INTERP) { + std::vector idx_order = {"b", "feature_block", "y", "(x + out_x)"}; + FusedOpsConfiguration conf = {"", idx_order, "res", GetAccumulatorType(params), vec_size, LoadType::LT_ALIGNED_READ}; + conf.SetVectorAxis(Tensor::DataChannelName::FEATURE); + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } else { + std::vector idx_order; + idx_order = {"batch", "OF_ID", "oy", "ox"}; + + FusedOpsConfiguration conf = {"", idx_order, "res", GetAccumulatorType(params), 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + } + + if (params.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) { + if (GetFeatureBlockSize(params) == 8) { + jit.AddConstant(MakeJitConstant("VEC_BLOCK_SIZE", 8)); + } else { + jit.AddConstant(MakeJitConstant("VEC_BLOCK_SIZE", 16)); + } } return jit; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp index 011c581b55ebbd..0a76fed62c7a47 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reshape/reshape_kernel_ref.cpp @@ -39,7 +39,7 @@ KernelsData ReshapeKernelRef::GetKernelsData(const Params& params, const optiona auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = MakeBaseParamsJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); const auto& in = newParams.inputs[0]; auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp index 9090a36911c06f..da20e77c031c01 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp @@ -60,7 +60,7 @@ KernelsData ReverseSequenceKernelRef::GetKernelsData(const Params& params, const auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp index 3bec06a2e34702..d3c4b93a837950 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp @@ -150,7 +150,7 @@ KernelsData ScatterElementsUpdateKernelRef::GetKernelsData(const Params& params, if (i == 1) { cldnn_jit.AddConstant(MakeJitConstant("IS_SECOND_ITER", "true")); } - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); clKernelData& kernel = kd.kernels[i]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_nd_update_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_nd_update_kernel_ref.cpp index f63662909e4016..6c13dd74a0cc11 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_nd_update_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_nd_update_kernel_ref.cpp @@ -59,6 +59,7 @@ ScatterNDUpdateKernelRef::SetDefault(const scatter_nd_update_params& params, con if (!is_second) { const auto& scope = params.output; + dispatchData.indicesLastDim = 1; dispatchData.gws = { scope.X().v * scope.Y().v, scope.Z().v * scope.W().v, scope.Feature().v * scope.Batch().v }; } else { auto indices_rank = params.indices_rank; @@ -168,7 +169,7 @@ KernelsData ScatterNDUpdateKernelRef::GetKernelsData(const Params& params, const 
cldnn_jit.AddConstant(MakeJitConstant("INDICES_LAST_DIM", dispatchData.indicesLastDim)); cldnn_jit.AddConstant(MakeJitConstant("INPUT_BLOCK_ND", GetInputBlockND(newParams))); } - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + std::pair jit = CreateJit(kernelName, cldnn_jit, entry_point); clKernelData& kernel = kd.kernels[i]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_update_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_update_kernel_ref.cpp index 90e40d274fbe90..4ccb8ca55a875b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_update_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_update_kernel_ref.cpp @@ -274,7 +274,7 @@ KernelsData ScatterUpdateKernelRef::GetKernelsData(const Params& params, const o if (i == 1) { cldnn_jit.AddConstant(MakeJitConstant("IS_SECOND_ITER", "true")); } - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); clKernelData& kernel = kd.kernels[i - start_with_iteration]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp index e63fb5e04e919d..b116ae10fac9b9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/select/select_kernel_base.cpp @@ -119,7 +119,7 @@ KernelsData SelectKernelBase::GetCommonKernelsData(const Params& params, const o auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); DispatchData dispatchData = SetDefault(newParams); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp index dea0167155e318..3f234fa89ed2d3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp @@ -91,7 +91,7 @@ KernelsData ShuffleChannelsKernelRef::GetKernelsData(const Params& params, const auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp index 039210668aa597..830b3bfd9cdc4d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp @@ -81,7 +81,7 @@ KernelsData SpaceToBatchKernelBase::GetCommonKernelsData(const Params& params, c auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp index 2c3a95df3c6ac7..14316395776138 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp @@ -93,7 +93,7 @@ KernelsData SpaceToDepthKernelRef::GetKernelsData(const Params& params, const op auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp index d20aab92c4c0e3..495292b4f45f3f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp @@ -157,7 +157,7 @@ KernelsData StridedSliceKernelRef::GetKernelsData(const Params& params, const op auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp index 32a07751603c2e..d82835b5bc193c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp @@ -57,7 +57,7 @@ KernelsData TileKernelRef::GetKernelsData(const Params& params, const optional_p auto dispatchData = SetDefault(newParams, options); auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); auto cldnn_jit = GetJitConstants(newParams); - std::string jit = CreateJit(kernelName, cldnn_jit, entry_point); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); auto& kernel = kd.kernels[0]; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl index 
71e9622db3ef22..95aee13d41650f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl @@ -7,6 +7,7 @@ #include "include/data_types.cl" #define FEATURE_SLICE_SIZE 16 +#define unroll_for __attribute__((opencl_unroll_hint())) for #define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE) #define TO_TYPE(type, val) CAT(convert_, type)(val) @@ -19,6 +20,12 @@ #define WRITE_FUNC(ptr, offset, val) DT_OUTPUT_BLOCK_WRITE(ptr, offset, val) #endif +#if ELTWISE_BROADCAST + #define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX_SAFE)(idx_order) +#else + #define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX)(idx_order) +#endif + __attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE))) KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS __global OUTPUT_TYPE* output @@ -84,3 +91,11 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS } } + +#undef FEATURE_SLICE_SIZE +#undef unroll_for +#undef OUTPUT_TYPE_BLOCK +#undef TO_TYPE +#undef READ_FUNC +#undef WRITE_FUNC +#undef GET_INDEX diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_nd_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_nd_ref.cl new file mode 100644 index 00000000000000..91cb7d9be773e9 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_nd_ref.cl @@ -0,0 +1,231 @@ +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/fetch.cl" + +#define GET_UPDATES_INDEX(prefix, idx_order) CAT(prefix, _GET_INDEX)(idx_order) +#define GET_OUTPUT_INDEX(out_order) OUTPUT_GET_INDEX(out_order) + +#if INPUT0_DIMS == 4 + #define IN_ORDER in_b,in_f,in_y,in_x +#elif INPUT0_DIMS == 5 + #define IN_ORDER in_b,in_f,in_z,in_y,in_x +#else + #define IN_ORDER in_b,in_f,in_w,in_z,in_y,in_x +#endif + +#if INPUT1_DIMS == 4 + #define IDX_ORDER idx_b,idx_f,idx_y,idx_x +#elif INPUT1_DIMS == 5 + #define IDX_ORDER idx_b,idx_f,idx_z,idx_y,idx_x +#else + #define IDX_ORDER idx_b,idx_f,idx_w,idx_z,idx_y,idx_x +#endif + +#if OUTPUT_DIMS == 4 + #define OUT_ORDER out_b,out_f,out_y,out_x +#elif OUTPUT_DIMS == 5 + #define OUT_ORDER out_b,out_f,out_z,out_y,out_x +#else + #define OUT_ORDER out_b,out_f,out_w,out_z,out_y,out_x +#endif + +#define INDICES_MAX_DIM 6 + +KERNEL(gather_nd_ref)(const __global INPUT0_TYPE* data, + const __global INPUT1_TYPE* indices, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + + const uint dim0 = get_global_id(0); + const uint dim1 = get_global_id(1); + const uint dim2 = get_global_id(2); + + // Calculate indice index + const uint F_NUM = (INDICES_RANK == 2) ? 
1 : INPUT1_FEATURE_NUM; + const uint idx_f = dim2 % F_NUM; + const uint idx_b = dim2 / F_NUM; + + #if INPUT1_DIMS == 4 + const uint idx_x = dim0; + const uint idx_y = dim1; + const uint idx_z = 0; + const uint idx_w = 0; + + const uint idx_arr[INPUT1_DIMS*2] = {idx_b, idx_f, idx_y, idx_x, 0, 0, 0, 0}; + const uint idx_dim[INPUT1_DIMS] = {INPUT1_BATCH_NUM, INPUT1_FEATURE_NUM, INPUT1_SIZE_Y, INPUT1_SIZE_X}; + #elif INPUT1_DIMS == 5 + const uint X_NUM = (INDICES_RANK == 5) ? 1 : INPUT1_SIZE_X; + + const uint idx_x = dim0 % X_NUM; + const uint idx_y = dim0 / X_NUM; + const uint idx_z = dim1; + const uint idx_w = 0; + + const uint idx_arr[INPUT1_DIMS*2] = {idx_b, idx_f, idx_z, idx_y, idx_x, 0, 0, 0, 0, 0}; + const uint idx_dim[INPUT1_DIMS] = {INPUT1_BATCH_NUM, INPUT1_FEATURE_NUM, INPUT1_SIZE_Z, INPUT1_SIZE_Y, INPUT1_SIZE_X}; + #else + const uint X_NUM = (INDICES_RANK == 6) ? 1 : INPUT1_SIZE_X; + const uint Z_NUM = (INDICES_RANK == 4) ? 1 : INPUT1_SIZE_Z; + + const uint idx_x = dim0 % X_NUM; + const uint idx_y = dim0 / X_NUM; + const uint idx_z = dim1 % Z_NUM; + const uint idx_w = dim1 / Z_NUM; + + const uint idx_arr[INPUT1_DIMS*2] = {idx_b, idx_f, idx_w, idx_z, idx_y, idx_x, 0, 0, 0, 0, 0, 0}; + const uint idx_dim[INPUT1_DIMS] = {INPUT1_BATCH_NUM, INPUT1_FEATURE_NUM, INPUT1_SIZE_W, INPUT1_SIZE_Z, INPUT1_SIZE_Y, INPUT1_SIZE_X}; + #endif + + const int idx = GET_UPDATES_INDEX(INPUT1, IDX_ORDER); + + // Calculate data index + uint indices_val[INDICES_MAX_DIM + BATCH_DIMS]; + for (int i = 0; i < INDICES_MAX_DIM + BATCH_DIMS; i++) { + indices_val[i] = 0; + } + + for (int i = 0; i < BATCH_DIMS; i++) { + indices_val[i] = idx_arr[i]; + } + + for (int i = 0; i < INDICES_LAST_DIM; i++) { + indices_val[i + BATCH_DIMS] = indices[idx+i]; + } + + #if INPUT0_DIMS == 4 + const uint in_x = indices_val[3]; + const uint in_y = indices_val[2]; + #elif INPUT0_DIMS == 5 + const uint in_x = indices_val[4]; + const uint in_y = indices_val[3]; + const uint in_z = indices_val[2]; + #else + const uint in_x = indices_val[5]; + const uint in_y = indices_val[4]; + const uint in_z = indices_val[3]; + const uint in_w = indices_val[2]; + #endif + const uint in_f = indices_val[1]; + const uint in_b = indices_val[0]; + + const uint data_idx = GET_UPDATES_INDEX(INPUT0, IN_ORDER); + + // Calculate output index + #if BATCH_DIMS <= 1 + const uint out_x = idx_x; + const uint out_y = idx_y; + const uint out_z = idx_z; + const uint out_w = idx_w; + const uint out_f = idx_f; + const uint out_b = idx_b; + #else + uint pitch_acc = 1; + uint output_batch_size = 0; + for (int i = BATCH_DIMS - 1; i >= 0; i--) { + output_batch_size += (idx_arr[i] * pitch_acc); + pitch_acc *= idx_dim[i]; + } + + #if OUTPUT_DIMS == 4 + const uint out_x = idx_arr[BATCH_DIMS+2]; + const uint out_y = idx_arr[BATCH_DIMS+1]; + #elif OUTPUT_DIMS == 5 + const uint out_x = idx_arr[BATCH_DIMS+3]; + const uint out_y = idx_arr[BATCH_DIMS+2]; + const uint out_z = idx_arr[BATCH_DIMS+1]; + #else + const uint out_x = idx_arr[BATCH_DIMS+4]; + const uint out_y = idx_arr[BATCH_DIMS+3]; + const uint out_z = idx_arr[BATCH_DIMS+2]; + const uint out_w = idx_arr[BATCH_DIMS+1]; + #endif + const uint out_f = idx_arr[BATCH_DIMS+0]; + const uint out_b = output_batch_size; + #endif + + const uint output_idx = GET_OUTPUT_INDEX(OUT_ORDER); + + // Copy data to output as slice size + #if HAS_FUSED_OPS + #if OUTPUT_DIMS == 4 + const uint y_pitch = OUTPUT_SIZE_X; + const uint f_pitch = y_pitch * OUTPUT_SIZE_Y; + #elif OUTPUT_DIMS == 5 + const uint y_pitch = OUTPUT_SIZE_X; + const uint z_pitch = 
y_pitch * OUTPUT_SIZE_Y; + const uint f_pitch = z_pitch * OUTPUT_SIZE_Z; + #else + const uint y_pitch = OUTPUT_SIZE_X; + const uint z_pitch = y_pitch * OUTPUT_SIZE_Y; + const uint w_pitch = z_pitch * OUTPUT_SIZE_Z; + const uint f_pitch = w_pitch * OUTPUT_SIZE_W; + #endif + const uint b_pitch = f_pitch * OUTPUT_FEATURE_NUM; + #endif + + for (int i = 0; i < WI_SLICE_SIZE; i++) { + uint dst_idx = output_idx + i; + INPUT0_TYPE val = data[data_idx + i]; + + #if HAS_FUSED_OPS + const uint b_remain = dst_idx % b_pitch; + const uint f_remain = b_remain % f_pitch; + #if OUTPUT_DIMS == 4 + const uint y_remain = f_remain % y_pitch; + + const uint y = f_remain / y_pitch; + #elif OUTPUT_DIMS == 5 + const uint z_remain = f_remain % z_pitch; + const uint y_remain = z_remain % y_pitch; + + const uint z = f_remain / z_pitch; + const uint y = z_remain / y_pitch; + #else + const uint w_remain = f_remain % w_pitch; + const uint z_remain = w_remain % z_pitch; + const uint y_remain = z_remain % y_pitch; + + const uint w = f_remain / w_pitch; + const uint z = w_remain / z_pitch; + const uint y = z_remain / y_pitch; + #endif + const uint b = dst_idx / b_pitch; + const uint f = b_remain / f_pitch; + const uint x = y_remain; + + #if FUSED_OPS_CAN_USE_PRELOAD + FUSED_OPS_PRELOAD; + FUSED_OPS_CALC; + #else + FUSED_OPS; + #endif + + output[dst_idx] = FUSED_OPS_RESULT; + #else + output[dst_idx] = ACTIVATION(val, ACTIVATION_PARAMS); + #endif + } +} + +#undef INDICES_MAX_DIM +#undef GET_UPDATES_INDEX +#undef GET_OUTPUT_INDEX +#undef OUT_ORDER +#undef IDX_ORDER +#undef IN_ORDER diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_opt.cl index 829656d0ef5c54..be431bb6c37668 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_opt.cl @@ -2,12 +2,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "include/common.cl" +#include "include/fetch.cl" #include "include/data_types.cl" -#include "include/include_all.cl" #define unroll_for __attribute__((opencl_unroll_hint)) for +#if ANTIALIAS == 1 + #define TRIANGLE_COEFF(a, x) ( (a) * ACCUMULATOR_MAX_FUNC(ACCUMULATOR_VAL_ZERO, ACCUMULATOR_VAL_ONE - ACCUMULATOR_ABS_FUNC((a) * (x)))) +#else + #define TRIANGLE_COEFF(a, x) (ACCUMULATOR_MAX_FUNC(ACCUMULATOR_VAL_ZERO, ACCUMULATOR_VAL_ONE - ACCUMULATOR_ABS_FUNC(x))) +#endif + #define READ_FUNC(ptr, offset) BLOCK_READN(INPUT0_TYPE, VEC_SIZE, ptr, offset) #define WRITE_FUNC(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, VEC_SIZE, ptr, offset, val) @@ -18,6 +23,25 @@ #define OUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE) #define TO_OUT_VEC_TYPE(x) CAT(convert_, OUT_VEC_TYPE)(x) + +inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x) +{ +#if INPUT0_DIMS < 5 + return INPUT0_GET_INDEX(b, f, y, x); +#else +#error [clDNN resample_ref.cl]: input format - not supported +#endif +} + +inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x) +{ +#if OUTPUT_DIMS < 5 + return OUTPUT_GET_INDEX(b, f, y, x); +#else +#error [clDNN resample_ref.cl]: output format - not supported +#endif +} + inline float FUNC(get_original_coordinate)(float num, float scale, int length_resized, int length_original) { #if defined(COORD_TRANS_MODE_HALF_PIXEL) @@ -35,6 +59,171 @@ inline float FUNC(get_original_coordinate)(float num, float scale, int length_re #endif } +#ifdef SAMPLE_TYPE_CAFFE_INTERP +KERNEL 
(resample_opt)(__global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) +{ + const int in_size[4] = { INPUT0_BATCH_NUM, INPUT0_FEATURE_NUM, INPUT0_SIZE_Y, INPUT0_SIZE_X }; + const int out_size[4] = { OUTPUT_BATCH_NUM, OUTPUT_FEATURE_NUM, OUTPUT_SIZE_Y, OUTPUT_SIZE_X }; + + const int ox = (int)get_global_id(0) % OUTPUT_SIZE_X; + const int oy = (int)get_global_id(0) / OUTPUT_SIZE_X; + const int feature_block_num = get_global_id(1); + const int feature = feature_block_num * FEATURE_BLOCK_SIZE; + +#if OUTPUT_DIMS <= 4 + const int batch = get_global_id(2); +#else +#error [clDNN resample_ref.cl]: Unsupported data dimension +#endif + + ACCUMULATOR_TYPE i_b = AXES_USED[0] ? FUNC_CALL(get_original_coordinate)(batch, SCALES[0], out_size[0], PADDED_B) : batch; + ACCUMULATOR_TYPE i_f = AXES_USED[1] ? FUNC_CALL(get_original_coordinate)(feature, SCALES[1], out_size[1], PADDED_F) : feature; + ACCUMULATOR_TYPE i_y = AXES_USED[3] ? FUNC_CALL(get_original_coordinate)(oy, SCALES[3], out_size[2], PADDED_Y) : oy; + ACCUMULATOR_TYPE i_x = AXES_USED[4] ? FUNC_CALL(get_original_coordinate)(ox, SCALES[4], out_size[3], PADDED_X) : ox; + +#if PADDING_USED == 1 + i_b -= PADS_BEGIN[0]; + i_f -= PADS_BEGIN[1]; + i_y -= PADS_BEGIN[3]; + i_x -= PADS_BEGIN[4]; +#endif + + const int ib_r = (int)i_b; + const int if_r = (int)i_f; + const int iy_r = (int)i_y; + const int ix_r = (int)i_x; + +#if ANTIALIAS == 1 + const ACCUMULATOR_TYPE ab = 1.0f / SCALES[0]; + const ACCUMULATOR_TYPE af = 1.0f / SCALES[1]; + const ACCUMULATOR_TYPE ay = 1.0f / SCALES[3]; + const ACCUMULATOR_TYPE ax = 1.0f / SCALES[4]; + + const int rb = (SCALES[0] < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ab); + const int rf = (SCALES[1] < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / af); + const int ry = (SCALES[3] < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ay); + const int rx = (SCALES[4] < 1.0f) ? 2 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ax); +#else + const ACCUMULATOR_TYPE ab = 1.0f; + const ACCUMULATOR_TYPE af = 1.0f; + const ACCUMULATOR_TYPE ay = 1.0f; + const ACCUMULATOR_TYPE ax = 1.0f; + + const int rb = (SCALES[0] < 1.0f) ? 1 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ab); + const int rf = (SCALES[1] < 1.0f) ? 1 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / af); + const int ry = (SCALES[3] < 1.0f) ? 1 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ay); + const int rx = (SCALES[4] < 1.0f) ? 
1 : (int)ceil(TO_ACCUMULATOR_TYPE(KERNEL_W) / ax); +#endif + + int const b_init = max(-PADS_BEGIN[0], ib_r - rb); + int const f_init = max(-PADS_BEGIN[1], if_r - rf); + int const y_init = max(-PADS_BEGIN[3], iy_r - ry); + int const x_init = max(-PADS_BEGIN[4], ix_r - rx); + + int const b_max = min(PADS_END[0] + INPUT0_BATCH_NUM, ib_r + rb + 1); + int const f_max = min(PADS_END[1] + INPUT0_FEATURE_NUM, if_r + rf + 1); + int const y_max = min(PADS_END[3] + INPUT0_SIZE_Y, iy_r + ry + 1); + int const x_max = min(PADS_END[4] + INPUT0_SIZE_X, ix_r + rx + 1); + + const int fp_max = FEATURE_BLOCK_SIZE; + + ACCUMULATOR_TYPE wb = ACCUMULATOR_VAL_ZERO; + ACCUMULATOR_TYPE wf = ACCUMULATOR_VAL_ZERO; + ACCUMULATOR_TYPE wy = ACCUMULATOR_VAL_ZERO; + ACCUMULATOR_TYPE wx = ACCUMULATOR_VAL_ZERO; + ACCUMULATOR_TYPE w = ACCUMULATOR_VAL_ZERO; + + for (int fp = 0; fp < fp_max; fp+=VEC_BLOCK_SIZE) { + MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, VEC_BLOCK_SIZE) sum = ACCUMULATOR_VAL_ZERO; + ACCUMULATOR_TYPE wsum = ACCUMULATOR_VAL_ZERO; + + for (int b = b_init; b < b_max; b++) { + wb = TRIANGLE_COEFF(ab, i_b - b); + + for (int f = f_init; f < f_max; f++) { + wf = wb * TRIANGLE_COEFF(af, i_f - f); + + if (wf != 0) { + for (int y = y_init; y < y_max; y++) { + wy = wf * TRIANGLE_COEFF(ay, i_y - y); + + if (wy != 0) { + for (int x = x_init; x < x_max; x++) { + wx = TRIANGLE_COEFF(ax, i_x - x); + w = wx * wy; + +#if PADDING_USED == 1 + bool isOutOfBounds = b < 0 || f < 0 || y < 0 || x < 0 || + b >= in_size[0] || f >= in_size[1] || + y >= in_size[2] || x >= in_size[3]; +#endif + if (w != 0) { + wsum += w; + +#if PADDING_USED == 1 + if (!isOutOfBounds) +#endif + { +#if VEC_BLOCK_SIZE == 8 + MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_BLOCK_SIZE) input_vec = vload8(0, &input[FUNC_CALL(get_input_index)(b, f+fp, y, x)]); + sum = fma(convert_float8(input_vec), (float8)w, sum); +#else + MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_BLOCK_SIZE) input_vec = vload16(0, &input[FUNC_CALL(get_input_index)(b, f+fp, y, x)]); + sum = fma(convert_float16(input_vec), (float16)w, sum); +#endif + } + } // w != 0; + } // for (int x = x_init; x < x_max; x++) + } + } // for (int y = y_init; y < y_max; y++) + } + } // for (int f = f_init; f < f_max; f++) + } // for (int b = b_init; b < b_max; b++) + + MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_BLOCK_SIZE) out; + ACCUMULATOR_TYPE res; + + if (wsum == 0) { + res = ACCUMULATOR_VAL_ZERO; + for (int f = 0; f < VEC_BLOCK_SIZE; f++) { +#if HAS_FUSED_OPS + #define OF_ID (feature+fp+f) + FUSED_OPS; + out[f] = FUSED_OPS_RESULT; + #undef OF_ID +#else + out[f] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +#endif + } + } else { + for (int f = 0; f < VEC_BLOCK_SIZE; f++) { + res = sum[f] / wsum; +#if HAS_FUSED_OPS + #define OF_ID (feature+fp+f) + FUSED_OPS; + out[f] = FUSED_OPS_RESULT; + #undef OF_ID +#else + out[f] = ACTIVATION(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS); +#endif + } + } + +#if VEC_BLOCK_SIZE == 8 + vstore8(out, 0, &output[FUNC_CALL(get_output_index)(batch, feature+fp, oy, ox)]); +#else + vstore16(out, 0, &output[FUNC_CALL(get_output_index)(batch, feature+fp, oy, ox)]); +#endif + } // fp +} +#endif // SAMPLE_TYPE_CAFFE_INTERP + +#ifndef SAMPLE_TYPE_CAFFE_INTERP __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) KERNEL (resample_opt)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output @@ -140,7 +329,9 @@ KERNEL (resample_opt)(__global INPUT0_TYPE* input, WRITE_FUNC(output, OUTPUT_GET_INDEX(b, feature_block, y, (x + out_x)), out); } } +#endif // !SAMPLE_TYPE_CAFFE_INTERP #undef unroll_for +#undef TRIANGLE_COEFF 
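For reference, the CAFFE_BILINEAR_INTERP branch added above weights neighbouring samples with a tent (triangle) filter and normalises by the accumulated weight, falling back to zero when no weight was gathered. Below is a scalar, one-dimensional C++ sketch of that weighting under assumed simplifications (fixed radius, no padding handling); triangle_coeff and resample_1d are illustrative names, not functions from the kernel.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// With antialiasing the tent is scaled by a (typically 1/scale); without it, a plain tent is used.
inline float triangle_coeff(float a, float x, bool antialias) {
    return antialias ? a * std::max(0.f, 1.f - std::fabs(a * x))
                     : std::max(0.f, 1.f - std::fabs(x));
}

// 1-D illustration of the inner accumulation: weighted sum over neighbours,
// normalised by the accumulated weight; zero total weight yields zero output.
float resample_1d(const std::vector<float>& in, float i_x, int radius, bool antialias, float a) {
    float sum = 0.f, wsum = 0.f;
    const int ix = static_cast<int>(i_x);
    for (int x = std::max(0, ix - radius); x < std::min<int>(in.size(), ix + radius + 1); ++x) {
        const float w = triangle_coeff(a, i_x - x, antialias);
        if (w != 0.f) { wsum += w; sum += w * in[x]; }
    }
    return wsum == 0.f ? 0.f : sum / wsum;
}

int main() {
    std::vector<float> row = {0.f, 1.f, 2.f, 3.f};
    std::printf("%f\n", resample_1d(row, 1.5f, 1, /*antialias=*/false, 1.f));
}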
#undef READ_FUNC #undef WRITE_FUNC diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_ref.cl index 536abd56b7f759..9fa19b5376879d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_ref.cl @@ -2,9 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "include/common.cl" +#include "include/fetch.cl" #include "include/data_types.cl" -#include "include/include_all.cl" inline uint FUNC(get_input_index)(uint b, uint f, uint z, uint y, uint x) { @@ -373,11 +372,6 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input, const int batch = (int)get_global_id(2) % OUTPUT_BATCH_NUM; const int oz = (int)get_global_id(2) / OUTPUT_BATCH_NUM; #endif - const int PADDED_B = in_size[0] + PADS_BEGIN[0] + PADS_END[0]; - const int PADDED_F = in_size[1] + PADS_BEGIN[1] + PADS_END[1]; - const int PADDED_Z = in_size[2] + PADS_BEGIN[2] + PADS_END[2]; - const int PADDED_Y = in_size[3] + PADS_BEGIN[3] + PADS_END[3]; - const int PADDED_X = in_size[4] + PADS_BEGIN[4] + PADS_END[4]; ACCUMULATOR_TYPE i_b = AXES_USED[0] ? FUNC_CALL(get_original_coordinate)(batch, SCALES[0], out_size[0], PADDED_B) : batch; ACCUMULATOR_TYPE i_f = AXES_USED[1] ? FUNC_CALL(get_original_coordinate)(feature, SCALES[1], out_size[1], PADDED_F) : feature; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp index 92103f5bffe65f..392d0ed2bc2bb2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp @@ -79,10 +79,11 @@ std::string KernelBaseOpenCL::GetEntryPoint(const std::string& templateName, return kernelID; } -std::string KernelBaseOpenCL::CreateJit(const std::string& template_name, +std::pair KernelBaseOpenCL::CreateJit(const std::string& template_name, const JitConstants& constants, const std::string& kernel_id) const { class CodeBuilder code; + std::string undefs; code.add_line("\n//====================================================") .add_line("// Kernel template: " + template_name + " ") .add_line("// Kernel name: " + kernel_id) @@ -90,13 +91,21 @@ std::string KernelBaseOpenCL::CreateJit(const std::string& template_name, .decoration_macro("FUNC", "", kernel_id) .decoration_macro("FUNC_CALL", "", kernel_id); + undefs += "#undef KERNEL\n"; + undefs += "#undef FUNC\n"; + undefs += "#undef FUNC_CALL\n"; + for (auto& definition : constants.GetDefinitions()) { code.value_macro(definition.first, definition.second); + undefs += "#ifdef " + definition.first.substr(0, definition.first.find('(')) + "\n"; + undefs += "#undef " + definition.first.substr(0, definition.first.find('(')) + "\n"; + undefs += "#endif\n"; } std::string jit = code.str(); + std::pair jit_undefs(jit, undefs); - return jit; + return jit_undefs; } Arguments KernelBaseOpenCL::GetArgsDesc(uint32_t num_of_input, @@ -127,7 +136,7 @@ Arguments KernelBaseOpenCL::GetArgsDesc(uint32_t num_of_input, } std::shared_ptr KernelBaseOpenCL::GetKernelString(const std::string& name, - const std::string& jit, + const std::pair& jit, const std::string& entry_point, const EngineInfo& engine_info, const std::string& exe_mode) const { @@ -137,7 +146,8 @@ std::shared_ptr 
KernelBaseOpenCL::GetKernelString(const std::strin if (codes.size()) { kernel_string->str = codes[0]; - kernel_string->jit = jit; + kernel_string->jit = jit.first; + kernel_string->undefs = jit.second; kernel_string->options = exe_mode + " -cl-mad-enable"; if (engine_info.bOptHintsSupport) kernel_string->options += " -DOPT_HINS_SUPPORTED=1"; @@ -164,7 +174,7 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel, const CommonDispatchData& dispatchData, const EngineInfo& engine_info, const std::string& kernelMapName, - const std::string& jit, + const std::pair& jit, const std::string& entryPoint, const std::string& exeMode, bool weights, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.h index aad1b9e691b8d1..1e5505b2017956 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.h @@ -19,7 +19,7 @@ class KernelBaseOpenCL : public KernelBase { protected: virtual bool Validate(const Params&, const optional_params&) const { return true; } - std::string CreateJit(const std::string& template_name, + std::pair CreateJit(const std::string& template_name, const JitConstants& constants, const std::string& kernel_name) const; std::string GetEntryPoint(const std::string& templateName, @@ -30,7 +30,7 @@ class KernelBaseOpenCL : public KernelBase { bool use_bias, uint32_t number_of_inputs_for_fused_prim = 0) const; std::shared_ptr GetKernelString(const std::string& kernel_name, - const std::string& jit, + const std::pair& jit, const std::string& entry_point, const EngineInfo& engine_info, const std::string& exe_mode = DEFAULT) const; @@ -41,7 +41,7 @@ class KernelBaseOpenCL : public KernelBase { const CommonDispatchData& dispatchData, const EngineInfo& engine_info, const std::string& kernel_map_name, - const std::string& jit, + const std::pair& jit, const std::string& entry_point, const std::string& exe_mode = DEFAULT, bool weights = false, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py index 0975ed1e2da05b..1be743acfd73f9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py @@ -40,6 +40,24 @@ def convert(self): with open(out_file_name, 'w') as out_file: out_file.write(res) + def append_undefs(self, filename): + undefs = "" + content = [] + with open(filename) as f: + content += f.readlines() + for line in content: + if '#define' in line: + name = line.strip().split(" ")[1].split("(")[0] + undefs += "#ifdef " + name + "\n" + undefs += "#undef " + name + "\n" + undefs += "#endif\n" + if '# define' in line: + name = line.strip().split(" ")[2].split("(")[0] + undefs += "#ifdef " + name + "\n" + undefs += "#undef " + name + "\n" + undefs += "#endif\n" + return undefs + def append_file_content(self, filename, origin_file): res = "" content = [] @@ -64,6 +82,7 @@ def cl_file_to_str(self, filename): kernel_name = name[:name.find('.cl')] res = '{{"{}",\n(std::string) R"__krnl(\n'.format(kernel_name) content = self.append_file_content(filename, filename) + content += self.append_undefs(filename) max_lines = 200 max_characters = 16350 characters = 1 # Newline character above diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h
index b75d2dd7e87001..5a0d62336cbd60 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.h
@@ -47,13 +47,14 @@ std::string GetStringEnv(const char* varName);
 struct KernelString {
     std::string str;
     std::string jit;
+    std::string undefs;
     std::string options;
     std::string entry_point;
     bool batch_compilation;
 
-    KernelString() : str(""), jit(""), options(""), entry_point(""), batch_compilation(false) {}
+    KernelString() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {}
 
-    std::string get_hash() { return str + jit + options + entry_point; }
+    std::string get_hash() { return str + jit + undefs + options + entry_point; }
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/inference-engine/thirdparty/clDNN/src/engine.cpp b/inference-engine/thirdparty/clDNN/src/engine.cpp
index 3cbeaac654d80e..e64c0051827714 100644
--- a/inference-engine/thirdparty/clDNN/src/engine.cpp
+++ b/inference-engine/thirdparty/clDNN/src/engine.cpp
@@ -83,6 +83,7 @@ gpu_toolkit_config convert_configuration(const engine_configuration conf) {
     result.queues_num = conf.n_streams;
     result.kernels_cache_path = conf.kernels_cache_path;
     result.tuning_cache_path = conf.tuning_cache_path;
+    result.n_threads = conf.n_threads;
     return result;
 }
diff --git a/inference-engine/thirdparty/clDNN/src/gather_nd.cpp b/inference-engine/thirdparty/clDNN/src/gather_nd.cpp
new file mode 100644
index 00000000000000..f01d82dffa3ea8
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/gather_nd.cpp
@@ -0,0 +1,114 @@
+/*
+// Copyright (c) 2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/ + +#include "gather_nd_inst.h" + +#include "primitive_type_base.h" +#include "error_handler.h" +#include "json_object.h" +#include + +namespace cldnn { +primitive_type_id gather_nd::type_id() { + static primitive_type_base instance; + return &instance; +} + +layout gather_nd_inst::calc_output_layout(gather_nd_node const& node) { + auto op = node.get_primitive(); + + auto input_layout_origin = node.input(0).get_output_layout(); + auto indices_layout_origin = node.input(1).get_output_layout(); + + auto input_layout = input_layout_origin.size.sizes(input_layout_origin.format); + auto indices_layout = indices_layout_origin.size.sizes(indices_layout_origin.format); + + const size_t input_dims = input_layout.size(); + + const auto indices_rank = op->indices_rank; + const auto batch_dims = op->batch_dims; + + // calculate initial output shape + std::vector output_sizes; + + for (uint8_t x = 0; x < indices_rank - 1; x++) { + output_sizes.push_back(indices_layout[x]); + } + + const size_t indices_last_dim = indices_layout[indices_rank - 1]; + for (size_t x = static_cast(batch_dims + indices_last_dim); x < input_dims; x++) { + output_sizes.push_back(input_layout[x]); + } + + // calculate batch_size by batch_dims + int batch_size = 1; + for (uint8_t x = 0; x < batch_dims; x++) { + batch_size *= output_sizes[x]; + } + + // create final output shape by batch_dims + std::vector final_output_sizes; + + if (batch_dims > 0) { + final_output_sizes.push_back(batch_size); + } + + for (size_t x = static_cast(batch_dims); x < output_sizes.size(); x++) { + final_output_sizes.push_back(output_sizes[x]); + } + + auto output_format = cldnn::format::bfyx; + if (final_output_sizes.size() >= 6) { + output_format = cldnn::format::bfwzyx; + } else if (final_output_sizes.size() == 5) { + output_format = cldnn::format::bfzyx; + } + + auto output_sizes_tensor = tensor(tensor(final_output_sizes).sizes(output_format)); + auto padding = op->output_padding; + + + if (node.has_fused_primitives()) { + input_layout_origin.data_type = node.get_fused_output_layout().data_type; + } + + return layout(input_layout_origin.data_type, output_format, output_sizes_tensor, padding); +} + +std::string gather_nd_inst::to_string(gather_nd_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + auto& input = node.input(); + + std::stringstream primitive_description; + + json_composite gather_nd_info; + gather_nd_info.add("input id", input.id()); + gather_nd_info.add("input shape", node.input(0).get_output_layout().size.to_string()); + gather_nd_info.add("indices shape", node.input(1).get_output_layout().size.to_string()); + gather_nd_info.add("indices rank", desc->indices_rank); + gather_nd_info.add("batch dims", desc->batch_dims); + gather_nd_info.add("output shape", calc_output_layout(node).size.to_string()); + + node_info->add("gather_nd info", gather_nd_info); + node_info->dump(primitive_description); + + return primitive_description.str(); +} + +gather_nd_inst::typed_primitive_inst(network_impl& network, gather_nd_node const& node) : parent(network, node) {} + +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp b/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp index fe22b8318282d1..2536bf0b0c1d10 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp @@ -4,6 +4,7 @@ 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 #include "configuration.h"
+#include <thread>
 
 namespace cldnn {
 namespace gpu {
@@ -22,6 +23,7 @@ configuration::configuration()
       throttle_mode(throttle_mode_types::disabled),
       queues_num(0),
       tuning_cache_path("cache.json"),
-      kernels_cache_path("") {}
+      kernels_cache_path(""),
+      n_threads(std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1))) {}
 }  // namespace gpu
 }  // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/configuration.h b/inference-engine/thirdparty/clDNN/src/gpu/configuration.h
index 791260f28f5ef4..f0792ad8dac0d2 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/configuration.h
+++ b/inference-engine/thirdparty/clDNN/src/gpu/configuration.h
@@ -31,6 +31,7 @@ struct configuration {
     uint16_t queues_num;
     std::string tuning_cache_path;
     std::string kernels_cache_path;
+    uint16_t n_threads;
 };
 }  // namespace gpu
 }  // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/gather_nd_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/gather_nd_gpu.cpp
new file mode 100644
index 00000000000000..dc05203f37981e
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/src/gpu/gather_nd_gpu.cpp
@@ -0,0 +1,78 @@
+/*
+// Copyright (c) 2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/ + +#include "gather_nd_inst.h" +#include "primitive_gpu_base.h" +#include "implementation_map.h" +#include "kernel_selector_helper.h" +#include "gather/gather_nd_kernel_selector.h" +#include "gather/gather_nd_kernel_ref.h" +#include "error_handler.h" + +using namespace cldnn; + +namespace cldnn { +namespace gpu { + +struct gather_nd_gpu : typed_primitive_gpu_impl { + using parent = typed_primitive_gpu_impl; + using parent::parent; + +public: + static primitive_impl* create(const gather_nd_node& arg) { + auto gather_nd_params = get_default_params(arg); + auto gather_nd_optional_params = + get_default_optional_params(arg.get_program()); + + gather_nd_params.indices_rank = arg.get_primitive()->indices_rank; + gather_nd_params.batch_dims = arg.get_primitive()->batch_dims; + + gather_nd_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout())); + + auto& kernel_selector = kernel_selector::gather_nd_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(gather_nd_params, gather_nd_optional_params); + + CLDNN_ERROR_BOOL(arg.id(), + "Best_kernel.empty()", + best_kernels.empty(), + "Cannot find a proper kernel with this arguments"); + + auto gather_nd = new gather_nd_gpu(arg, best_kernels[0]); + + return gather_nd; + } +}; + +namespace detail { + +attach_gather_nd_gpu::attach_gather_nd_gpu() { + auto val_fw = gather_nd_gpu::create; + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfzyx), val_fw); + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfwzyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfwzyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfwzyx), val_fw); +} + +} // namespace detail +} // namespace gpu +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp index a5a5b4b477f942..7631afc0ccb313 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp @@ -13,9 +13,17 @@ #include #include #include - #include "kernel_selector_helper.h" #include "cldnn_itt.h" +#if (CLDNN_THREADING == CLDNN_THREADING_TBB) +#include +#include +#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL) +#include +#include +#include +#include +#endif #ifndef ENABLE_UNICODE_PATH_SUPPORT # ifdef _WIN32 @@ -36,8 +44,10 @@ #include #endif +#if (CLDNN_THREADING != CLDNN_THREADING_SEQ) +#define DEFAULT_NUM_THREADS 2 +#endif namespace { - std::mutex cacheAccessMutex; #ifdef ENABLE_UNICODE_PATH_SUPPORT @@ -84,7 +94,6 @@ static std::vector loadBinaryFromFile(std::string path) { return {}; } - static void saveBinaryToFile(std::string path, const std::vector buffer) { std::lock_guard lock(cacheAccessMutex); #if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) @@ -99,49 +108,6 @@ static void 
saveBinaryToFile(std::string path, const std::vector } } -std::string get_undef_jit(cldnn::gpu::kernels_cache::source_code org_source_code) { - const std::string white_space_with_new_lines = " \t\r\n"; - const std::string white_space = " \t"; - - size_t current_pos = 0; - - const std::string define = "define"; - - std::set to_undef; - for (const auto& source : org_source_code) { - do { - size_t index_to_hash = source.find_first_not_of(white_space_with_new_lines, current_pos); - if (index_to_hash != std::string::npos && source[index_to_hash] == '#') { - size_t index_define = source.find_first_not_of(white_space, index_to_hash + 1); - - if (index_define != std::string::npos && !source.compare(index_define, define.size(), define)) { - size_t index_to_name = source.find_first_not_of(white_space, index_define + define.size()); - if (index_to_name != std::string::npos) { - size_t index_to_end_name = - source.find_first_of(white_space_with_new_lines + "(", index_to_name); - if (index_to_end_name == std::string::npos) { - index_to_end_name = source.size(); - } - std::string name = source.substr(index_to_name, index_to_end_name - index_to_name); - to_undef.insert(name); - } - } - } - - current_pos = source.find_first_of('\n', current_pos + 1); - } while (current_pos != std::string::npos); - } - - std::string undefs; - for (const auto& name : to_undef) { - undefs += "#ifdef " + name + "\n"; - undefs += "#undef " + name + "\n"; - undefs += "#endif\n"; - } - - return undefs; -} - std::string reorder_options(const std::string& org_options) { std::stringstream ss(org_options); std::set sorted_options; @@ -190,13 +156,13 @@ size_t kernels_cache::get_max_kernels_per_batch() const { return 10; } -kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& kernels_source_code) const { + +void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector* all_batches) const { OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll::GetProgramSource"); - sorted_code scode; + std::map> program_buckets; for (const auto& code : kernels_source_code) { - std::string full_code = code.kernel_strings->jit + code.kernel_strings->str; - full_code += get_undef_jit({full_code}); + std::string full_code = code.kernel_strings->jit + code.kernel_strings->str + code.kernel_strings->undefs; const source_code org_source_code = { full_code }; std::string entry_point = code.kernel_strings->entry_point; std::string options = code.kernel_strings->options; @@ -213,7 +179,7 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& std::string key = options; if (batch_compilation == false) { - key += " __PROGRAM__" + std::to_string(scode.size()); + key += " __PROGRAM__" + std::to_string(program_buckets.size()); } if (dump_custom_program) { @@ -223,48 +189,63 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& if (one_time_kernel) { key += " __ONE_TIME__"; } - - auto& current_bucket = scode[key]; - current_bucket.dump_custom_program = dump_custom_program; - current_bucket.one_time = one_time_kernel; - - if (current_bucket.source.empty()) { - current_bucket.options = options; + auto& current_bucket = program_buckets[key]; + if (current_bucket.empty()) { // new bucket + const auto& bucket_id = program_buckets.size() - 1; + current_bucket.push_back(batch_program()); + current_bucket.back().bucket_id = static_cast(bucket_id); + current_bucket.back().batch_id = 0; + current_bucket.back().options = options; } - // Create new 
kernels bucket when the limit is reached - if ((current_bucket.kernels_counter % get_max_kernels_per_batch()) == 0) { - current_bucket.source.push_back({}); + // Create new kernels batch when the limit is reached + if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) { + const auto& batch_id = current_bucket.size(); + current_bucket.push_back(batch_program()); + current_bucket.back().bucket_id = static_cast(program_buckets.size()); + current_bucket.back().batch_id = static_cast(batch_id); + current_bucket.back().options = options; } - current_bucket.entry_point_to_id[entry_point] = code.id; - assert(org_source_code.size() == 1); + auto& current_batch = current_bucket.back(); + current_batch.dump_custom_program = dump_custom_program; + current_batch.one_time = one_time_kernel; + current_batch.entry_point_to_id[entry_point] = code.id; - current_bucket.source.back().push_back(std::move(org_source_code.front())); + assert(org_source_code.size() == 1); - current_bucket.kernels_counter++; + current_batch.source.push_back(std::move(org_source_code.front())); + current_batch.kernels_counter++; } - // Compute hash value for each bucket + // Compute hash value for each batch // Hash calculation might require additional optimizations, but currently execution time of this part is much smaller than loading // of the precompiled binaries or get_undef_jit calls // Hash is computed for string that contains compilation options + driver version + - // full source code (jit + template + undef sections) of all kernels in the bucket - for (auto& c : scode) { - program_code& code = c.second; + // full source code (jit + template + undef sections) of all kernels in the batches + for (auto& c : program_buckets) { auto options = c.first; - for (size_t i = 0; i < code.source.size(); i++) { + auto& batches = c.second; + for (auto& b : batches) { std::string full_code = options + " " + _context.get_device_info().driver_version; - for (auto& ss : code.source[i]) + for (auto& ss : b.source) full_code += ss; - code.hash_values.push_back(std::hash()(full_code)); + b.hash_value = std::hash()(full_code); + all_batches->push_back(b); } } - - return scode; } -kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {} +kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) { +#if (CLDNN_THREADING == CLDNN_THREADING_TBB) + int n_threads = _context.get_configuration().n_threads; + arena = std::unique_ptr(new tbb::task_arena()); + arena->initialize(n_threads); +#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL) + int n_threads = _context.get_configuration().n_threads; + pool = std::unique_ptr(new thread_pool(n_threads)); +#endif +} kernels_cache::kernel_id kernels_cache::set_kernel_source( const std::shared_ptr& kernel_string, @@ -301,149 +282,160 @@ static std::vector getProgramBinaries(cl::Program program) { return program.getInfo().front(); } -kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source) const { +void kernels_cache::build_batch(const batch_program& batch) { OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram"); - static uint32_t current_file_index = 0; - bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || program_source.dump_custom_program; + bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || batch.dump_custom_program; - std::string dump_file_name = ""; + std::string err_log; 
// accumulated build log from all program's parts (only contains messages from parts which + + std::string current_dump_file_name = ""; if (dump_sources) { - dump_file_name = _context.get_configuration().ocl_sources_dumps_dir; - if (!dump_file_name.empty() && dump_file_name.back() != '/') - dump_file_name += '/'; + current_dump_file_name = _context.get_configuration().ocl_sources_dumps_dir; + if (!current_dump_file_name.empty() && current_dump_file_name.back() != '/') + current_dump_file_name += '/'; - dump_file_name += "clDNN_program_" + std::to_string(current_file_index++) + "_part_"; + current_dump_file_name += "clDNN_program_" + std::to_string(batch.bucket_id) + "_part_" + std::to_string(batch.batch_id) + ".cl"; } - try { - kernels_map kmap; - std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which - // failed to compile) - - uint32_t part_idx = 0; - for (size_t i = 0; i < program_source.source.size(); i++) { - auto sources_bucket_to_compile = program_source.source[i]; - const auto& hash_value = program_source.hash_values[i]; - std::string cached_bin_name = get_cache_path() + std::to_string(hash_value) + ".cl_cache"; - cl::Program::Binaries precompiled_kernels = {}; - if (is_cache_enabled()) { - // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket - // If read is successful, then remove kernels from compilation bucket - auto bin = loadBinaryFromFile(cached_bin_name); - if (!bin.empty()) { - precompiled_kernels.push_back(bin); - } - } - auto current_dump_file_name = dump_file_name + std::to_string(part_idx++) + ".cl"; - std::ofstream dump_file; + std::ofstream dump_file; + if (dump_sources) { + dump_file.open(current_dump_file_name); + if (dump_file.good()) { + for (auto& s : batch.source) + dump_file << s; + } + } - if (dump_sources) { - dump_file.open(current_dump_file_name); + std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache"; + cl::Program::Binaries precompiled_kernels = {}; - if (dump_file.good()) { - for (auto& s : sources_bucket_to_compile) - dump_file << s; - } + if (is_cache_enabled()) { + // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket + // If read is successful, then remove kernels from compilation bucket + auto bin = loadBinaryFromFile(cached_bin_name); + if (!bin.empty()) { + precompiled_kernels.push_back(bin); + } + } + try { + cl::vector kernels; + + // Run compilation + if (precompiled_kernels.empty()) { + cl::Program program(_context.context(), batch.source); + { + OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation"); + program.build(_context.device(), batch.options.c_str()); } - try { - cl::vector kernels; - // Run compilation - if (precompiled_kernels.empty()) { - cl::Program program(_context.context(), sources_bucket_to_compile); - { - OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation"); - program.build(_context.device(), program_source.options.c_str()); - } + if (dump_sources && dump_file.good()) { + dump_file << "\n/* Build Log:\n"; + for (auto& p : program.getBuildInfo()) + dump_file << p.second << "\n"; - if (dump_sources && dump_file.good()) { - dump_file << "\n/* Build Log:\n"; - for (auto& p : program.getBuildInfo()) - dump_file << p.second << "\n"; + dump_file << "*/\n"; + } - dump_file << "*/\n"; - } + program.createKernels(&kernels); - program.createKernels(&kernels); - if 
(is_cache_enabled()) { - // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache - // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited - // Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer - // compile time. - saveBinaryToFile(cached_bin_name, getProgramBinaries(program)); + if (is_cache_enabled()) { + // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache + // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited + // Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer + // compile time. + saveBinaryToFile(cached_bin_name, getProgramBinaries(program)); + } + } else { + cl::Program program(_context.context(), {_context.device()}, precompiled_kernels); + program.build(_context.device(), batch.options.c_str()); + program.createKernels(&kernels); + } + { + std::lock_guard lock(_context.get_cache_mutex()); + for (auto& k : kernels) { + const auto& entry_point = k.getInfo(); + const auto& k_id = batch.entry_point_to_id.find(entry_point); + const auto& k_type = kernel_type(k, _context.get_device_info().supports_usm); + if (k_id != batch.entry_point_to_id.end()) { + const auto& kmap = std::make_pair(k_id->second, k_type); + if (batch.one_time) { + _one_time_kernels.insert(kmap); + } else { + _kernels.insert(kmap); } } else { - cl::Program program(_context.context(), {_context.device()}, precompiled_kernels); - program.build(_context.device(), program_source.options.c_str()); - program.createKernels(&kernels); + throw std::runtime_error("Could not find entry point"); } - - for (auto& k : kernels) { - auto kernel_name = k.getInfo(); - kmap.emplace(kernel_name, kernels_cache::kernel_type(k, _context.get_device_info().supports_usm)); - } - } catch (const cl::BuildError& err) { - if (dump_sources && dump_file.good()) - dump_file << "\n/* Build Log:\n"; - - for (auto& p : err.getBuildLog()) { - if (dump_sources && dump_file.good()) - dump_file << p.second << "\n"; - - err_log += p.second + '\n'; - } - - if (dump_sources && dump_file.good()) - dump_file << "*/\n"; } } - - if (!err_log.empty()) { - throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n"); + } catch (const cl::BuildError& err) { + if (dump_sources && dump_file.good()) + dump_file << "\n/* Build Log:\n"; + + for (auto& p : err.getBuildLog()) { + if (dump_sources && dump_file.good()) + dump_file << p.second << "\n"; + err_log += p.second + '\n'; } - - return kmap; - } catch (const cl::Error& err) { - throw ocl_error(err); + if (dump_sources && dump_file.good()) + dump_file << "*/\n"; + } + if (!err_log.empty()) { + throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n"); } } -kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) { - build_all(); - if (one_time_kernel) { - return _one_time_kernels.at(id); - } else { - return _kernels.at(id); - } +kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) const { + if (_pending_compilation) + throw std::runtime_error("Kernel cache is not compiled, call build_all() first!"); + + const auto& kernels = one_time_kernel ? 
_one_time_kernels : _kernels; + auto res = kernels.find(id); + if (kernels.end() == res) + throw std::runtime_error("Kernel " + id + " not found in the kernel cache!"); + return res->second; } void kernels_cache::build_all() { OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll"); if (!_pending_compilation) return; + std::vector batches; + { + std::lock_guard lock(_context.get_cache_mutex()); + get_program_source(_kernels_code, &batches); + _one_time_kernels.clear(); + } - std::lock_guard lock(_context.get_cache_mutex()); - - auto sorted_program_code = get_program_source(_kernels_code); - - _one_time_kernels.clear(); - for (auto& program : sorted_program_code) { - auto kernels = build_program(program.second); - - for (auto& k : kernels) { - const auto& entry_point = k.first; - const auto& k_id = program.second.entry_point_to_id[entry_point]; - if (program.second.one_time) { - _one_time_kernels[k_id] = k.second; - } else { - _kernels[k_id] = k.second; +#if (CLDNN_THREADING == CLDNN_THREADING_TBB) + arena->execute([this, &batches] { + tbb::parallel_for(tbb::blocked_range(0, batches.size()), [this, &batches](const tbb::blocked_range& r) { + for (auto i = r.begin(); i != r.end(); ++i) { + build_batch(batches[i]); } - } + }); + }); +#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL) + std::vector> builds; + for (size_t i = 0; i < batches.size(); ++i) { + builds.push_back(pool->enqueue([this, &batches, i] () { + build_batch(batches[i]); + })); + } + std::for_each(builds.begin(), builds.end(), [] (std::future& f) { f.wait(); }); +#else + // no parallel build + for (const auto& batch : batches) { + build_batch(batch); } +#endif - _kernels_code.clear(); - _pending_compilation = false; + { + std::lock_guard lock(_context.get_cache_mutex()); + _kernels_code.clear(); + _pending_compilation = false; + } } void kernels_cache::reset() { @@ -452,6 +444,5 @@ void kernels_cache::reset() { _kernels_code.clear(); _pending_compilation = false; } - } // namespace gpu } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h index 23ebf7ab22f1ab..7cf31598bbf01a 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h @@ -13,6 +13,19 @@ #include #include +#define CLDNN_THREADING_SEQ 0 +#define CLDNN_THREADING_TBB 1 +#define CLDNN_THREADING_THREADPOOL 2 + +#if (CLDNN_THREADING == CLDNN_THREADING_TBB) +#include +#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL) +#include +#include +#include +#include +#endif + namespace cl { class Kernel; class KernelIntel; @@ -26,14 +39,76 @@ namespace cldnn { namespace gpu { class gpu_toolkit; +#if (CLDNN_THREADING == CLDNN_THREADING_THREADPOOL) +class thread_pool { +public: + thread_pool(size_t num_threads) : _stop_pool(false) { + _workers.reserve(num_threads); + for (size_t i = 0; i < num_threads; ++i) { + _workers.emplace_back(std::thread(&thread_pool::worker_thread, this)); + } + } + + ~thread_pool() { + { + std::lock_guard lock(_q_m); + _stop_pool = true; + } + this->wait_all(); + } + + template + std::future::type> enqueue(F&& f, Args&&... 
args) {
+        if (_stop_pool) {
+            throw std::runtime_error("Thread pool is stopped");
+        }
+        using return_type = typename std::result_of<F(Args...)>::type;
+        auto task = std::make_shared<std::packaged_task<return_type()>> (std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+        std::future<return_type> result = task->get_future();
+        {
+            std::lock_guard<std::mutex> lock(_q_m);
+            _tasks.push([task]() {(*task)();});
+        }
+        _cv.notify_one();
+        return result;
+    }
+
+    void wait_all() {
+        _cv.notify_all();
+        for (auto& w : _workers) {
+            w.join();
+        }
+    }
+
+private:
+    std::vector<std::thread> _workers;
+    std::queue<std::function<void()>> _tasks;
+    std::condition_variable _cv;
+    std::mutex _q_m;
+    bool _stop_pool;
+
+    void worker_thread() {
+        while (true) {
+            std::unique_lock<std::mutex> lock(this->_q_m);
+            _cv.wait(lock, [this]() { return (!this->_tasks.empty()) || (_stop_pool); });
+            if ((_stop_pool) && (this->_tasks.empty())) return;
+            auto task = std::move(_tasks.front());
+            this->_tasks.pop();
+            lock.unlock();
+            task();
+        }
+    }
+};
+#endif
 class kernels_cache {
 public:
     using source_code = std::vector<std::string>;
-
-    struct program_code {
-        std::vector<source_code> source;
-        std::vector<size_t> hash_values;
+    struct batch_program {
+        int32_t bucket_id = 0;
+        int32_t batch_id = 0;
+        source_code source;
+        size_t hash_value;
         uint32_t kernels_counter = 0;
         std::string options;
         bool dump_custom_program = false;
@@ -69,7 +144,6 @@ class kernels_cache {
     typedef std::string kernel_id;
     typedef cl::KernelIntel kernel_type;
-    using sorted_code = std::map<std::string, program_code>;
     using kernels_map = std::map<std::string, kernel_type>;
     using kernels_code = std::unordered_set;
 
@@ -77,13 +151,19 @@
     gpu_toolkit& _context;
     kernels_code _kernels_code;
     std::atomic<bool> _pending_compilation{false};
-    std::map _kernels;
-    std::map _one_time_kernels;  // These kernels are intended to be executed only once (can
+    std::map _kernels;
+    std::map _one_time_kernels;  // These kernels are intended to be executed only once (can
                                  // be removed later from the cache).
uint32_t _prog_id; +#if (CLDNN_THREADING == CLDNN_THREADING_TBB) + std::unique_ptr arena; +#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL) + std::unique_ptr pool; +#endif + - sorted_code get_program_source(const kernels_code& kernels_source_code) const; - kernels_map build_program(const program_code& pcode) const; + void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; + void build_batch(const batch_program& batch); std::string get_cache_path() const; bool is_cache_enabled() const; @@ -94,7 +174,7 @@ class kernels_cache { kernel_id set_kernel_source(const std::shared_ptr& kernel_string, bool dump_custom_program, bool one_time_kernel); - kernel_type get_kernel(kernel_id id, bool one_time_kernel); + kernel_type get_kernel(kernel_id id, bool one_time_kernel) const; gpu_toolkit& get_context() { return _context; } // forces compilation of all pending kernels/programs void build_all(); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp index a906e09970337c..60fcc9b043542e 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_builder.cpp @@ -25,6 +25,49 @@ namespace cldnn { namespace gpu { static constexpr auto INTEL_PLATFORM_VENDOR = "Intel(R) Corporation"; +static std::vector getSubDevices(cl::Device& rootDevice) { + cl_uint maxSubDevices; + size_t maxSubDevicesSize; + const auto err = clGetDeviceInfo(rootDevice(), + CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + sizeof(maxSubDevices), + &maxSubDevices, &maxSubDevicesSize); + + if (err != CL_SUCCESS || maxSubDevicesSize != sizeof(maxSubDevices)) { + throw cl::Error(err, "clGetDeviceInfo(..., CL_DEVICE_PARTITION_MAX_SUB_DEVICES,...)"); + } + + if (maxSubDevices == 0) { + return {}; + } + + const auto partitionProperties = rootDevice.getInfo(); + const auto partitionable = std::any_of(partitionProperties.begin(), partitionProperties.end(), + [](const decltype(partitionProperties)::value_type& prop) { + return prop == CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; + }); + + if (!partitionable) { + return {}; + } + + const auto partitionAffinityDomain = rootDevice.getInfo(); + const decltype(partitionAffinityDomain) expectedFlags = + CL_DEVICE_AFFINITY_DOMAIN_NUMA | CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE; + + if ((partitionAffinityDomain & expectedFlags) != expectedFlags) { + return {}; + } + + std::vector subDevices; + cl_device_partition_property partitionProperty[] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + CL_DEVICE_AFFINITY_DOMAIN_NUMA, 0}; + + rootDevice.createSubDevices(partitionProperty, &subDevices); + + return subDevices; +} + std::map ocl_builder::get_available_devices(void* user_context, void* user_device) const { bool host_out_of_order = true; // Change to false, if debug requires in-order queue. 
std::vector dev_orig, dev_sorted; @@ -45,7 +88,20 @@ std::map ocl_builder::get_available_devices(void* } uint32_t idx = 0; for (auto& dptr : dev_sorted) { - ret[std::to_string(idx++)] = dptr; + auto map_id = std::to_string(idx++); + ret[map_id] = dptr; + + auto rootDevice = dptr->get_device(); + auto subDevices = getSubDevices(rootDevice); + if (!subDevices.empty()) { + uint32_t sub_idx = 0; + for (auto& subdevice : subDevices) { + auto subdPtr = device_impl::ptr(new device_impl(subdevice, cl::Context(subdevice), + dptr->get_platform(), + device_info_internal(subdevice)), false); + ret[map_id+"."+std::to_string(sub_idx++)] = subdPtr; + } + } } return ret; } @@ -77,7 +133,7 @@ std::vector ocl_builder::build_device_list(bool out_out_order) for (auto& device : devices) { if (!does_device_match_config(out_out_order, device)) continue; ret.emplace_back(device_impl::ptr{ new device_impl(device, cl::Context(device), - id, device_info_internal(device)), false}); + id, device_info_internal(device)), false}); } } if (ret.empty()) { diff --git a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp index f272c6023fba59..0dd40bf89e4743 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.cpp @@ -32,6 +32,7 @@ void register_implementations_gpu() { REGISTER_GPU(eltwise); REGISTER_GPU(fully_connected); REGISTER_GPU(gather); + REGISTER_GPU(gather_nd); REGISTER_GPU(gemm); REGISTER_GPU(input_layout); REGISTER_GPU(lrn); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp index 646dee946533b0..01032f1137a131 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/register_gpu.hpp @@ -24,6 +24,7 @@ #include "api/eltwise.hpp" #include "api/fully_connected.hpp" #include "api/gather.hpp" +#include "api/gather_nd.hpp" #include "api/gemm.hpp" #include "api/input_layout.hpp" #include "api/lrn.hpp" @@ -100,6 +101,7 @@ REGISTER_GPU(eltwise); REGISTER_GPU(embed); REGISTER_GPU(fully_connected); REGISTER_GPU(gather); +REGISTER_GPU(gather_nd); REGISTER_GPU(gemm); REGISTER_GPU(input_layout); REGISTER_GPU(lookup_table); diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index fd8d4233d7d628..0a192f75c147d5 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -32,6 +32,7 @@ #include "depth_to_space_inst.h" #include "space_to_depth_inst.h" #include "gather_inst.h" +#include "gather_nd_inst.h" #include "scatter_update_inst.h" #include "scatter_nd_update_inst.h" #include "scatter_elements_update_inst.h" @@ -196,6 +197,7 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) { !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && + !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type() && !input.is_type())) @@ -528,6 +530,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type(); + should_fuse |= 
input_data.is_type(); should_fuse |= input_data.is_type(); @@ -594,6 +598,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type(); should_fuse |= input_data.is_type(); @@ -682,6 +688,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); + should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); + should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); @@ -741,6 +749,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { (parents[i]->is_type()) || (parents[i]->is_type() && eltwise_supports_fusings(parents[i]->as())) || (parents[i]->is_type()) || + (parents[i]->is_type()) || (parents[i]->is_type()) || (parents[i]->is_type()) || (parents[i]->is_type() && pooling_supports_fusings(parents[i]->as())) || diff --git a/inference-engine/thirdparty/clDNN/src/include/gather_nd_inst.h b/inference-engine/thirdparty/clDNN/src/include/gather_nd_inst.h new file mode 100644 index 00000000000000..b8732f7171c635 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/src/include/gather_nd_inst.h @@ -0,0 +1,49 @@ +/* +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include "api/gather_nd.hpp" +#include "primitive_inst.h" +#include + +namespace cldnn { +template <> +struct typed_program_node : public typed_program_node_base { + using parent = typed_program_node_base; + +public: + using parent::parent; + + program_node& input(size_t index = 0) const { return get_dependency(index); } +}; + +using gather_nd_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + +public: + static layout calc_output_layout(gather_nd_node const& node); + static std::string to_string(gather_nd_node const& node); + +public: + typed_primitive_inst(network_impl& network, gather_nd_node const& desc); +}; + +using gather_nd_inst = typed_primitive_inst; +} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp index 2feaeb715f58dd..9624213018d970 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp @@ -3342,6 +3342,8 @@ static std::vector>> inputs = { {{1, 16, 1, 1}, {1, 16, 8, 2}}, {{1, 32, 1, 1}, {1, 32, 2, 2}}, {{1, 32, 1, 1}, {8, 32, 4, 5}}, + {{1, 2, 1, 1}, {1, 1, 3, 1}}, + {{1, 2, 1, 1}, {4, 1, 3, 5}}, {{1, 16, 8, 2, 4}, {1, 16, 8, 2, 4}}, {{8, 32, 4, 5, 6}, {1, 32, 1, 1, 1}}, diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index ec4e6ec656288d..6be7d5ae8deba2 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -22,6 +22,7 @@ #include "api/deconvolution.hpp" #include "api/permute.hpp" #include "api/gather.hpp" +#include "api/gather_nd.hpp" #include "api/scatter_update.hpp" #include "api/scatter_nd_update.hpp" #include "api/scatter_elements_update.hpp" @@ -2956,6 +2957,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_act_scale_eltwise, #define CASE_RESAMPLE_FP32_7 {1, 16, 4, 5, 4}, {1, 16, 2, 3, 2}, data_types::f32, format::bfzyx, resample_type::nearest, data_types::f32, format::bfzyx #define CASE_RESAMPLE_FP32_8 {1, 16, 4, 5, 4}, {1, 16, 2, 3, 2}, data_types::f32, format::bfzyx, resample_type::caffe_bilinear, data_types::f32, format::bfzyx #define CASE_RESAMPLE_FP32_9 {1, 16, 4, 5}, {1, 16, 7, 8}, data_types::f32, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_10 {1, 16, 4, 5}, {1, 16, 7, 8}, data_types::f32, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f32, format::bfyx #define CASE_RESAMPLE_FP16_1 {1, 15, 4, 5}, {1, 15, 2, 3}, data_types::f16, format::bfyx, resample_type::nearest, data_types::f16, format::bfyx #define CASE_RESAMPLE_FP16_2 {1, 15, 4, 5}, {1, 15, 2, 3}, data_types::f16, format::bfyx, resample_type::bilinear, data_types::f16, format::bfyx @@ -2967,6 +2969,10 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_act_scale_eltwise, #define CASE_RESAMPLE_FP16_8 {1, 16, 4, 5, 4}, {1, 16, 2, 3, 2}, data_types::f16, format::bfzyx, resample_type::caffe_bilinear, data_types::f16, format::bfzyx #define CASE_RESAMPLE_FP16_9 {1, 16, 4, 5}, {1, 16, 7, 8}, data_types::f16, format::b_fs_yx_fsv16, resample_type::bilinear, 
data_types::f16, format::bfyx #define CASE_RESAMPLE_FP16_10 {2, 32, 4, 5}, {2, 32, 7, 8}, data_types::f16, format::fs_b_yx_fsv32, resample_type::bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_11 {1, 16, 4, 5}, {1, 16, 7, 8}, data_types::f16, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_12 {2, 32, 4, 5}, {2, 32, 7, 8}, data_types::f16, format::fs_b_yx_fsv32, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_13 {1, 16, 4, 5}, {1, 16, 7, 8}, data_types::f16, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_14 {1, 32, 4, 5}, {1, 32, 2, 3}, data_types::f16, format::fs_b_yx_fsv32, resample_type::caffe_bilinear, data_types::f16, format::bfyx #define CASE_RESAMPLE_I8_1 {1, 16, 4, 5}, {1, 16, 2, 3}, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx #define CASE_RESAMPLE_I8_2 {2, 32, 4, 5}, {2, 32, 2, 3}, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx @@ -3006,6 +3012,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_quantize, resample_test_params{ CASE_RESAMPLE_FP32_7, 2, 3 }, resample_test_params{ CASE_RESAMPLE_FP32_8, 2, 3 }, resample_test_params{ CASE_RESAMPLE_FP32_9, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 2, 3 }, // FQ can't be fused to FP16 primitive for now // resample_test_params{ CASE_RESAMPLE_FP16_1, 2, 3 }, @@ -3047,6 +3054,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_scale_activation_eltwise, resample_test_params{ CASE_RESAMPLE_FP32_7, 2, 5 }, resample_test_params{ CASE_RESAMPLE_FP32_8, 2, 5 }, resample_test_params{ CASE_RESAMPLE_FP32_9, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 2, 5 }, resample_test_params{ CASE_RESAMPLE_FP16_1, 2, 5 }, resample_test_params{ CASE_RESAMPLE_FP16_2, 2, 5 }, @@ -3058,6 +3066,10 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_scale_activation_eltwise, resample_test_params{ CASE_RESAMPLE_FP16_8, 2, 5 }, resample_test_params{ CASE_RESAMPLE_FP16_9, 2, 5 }, resample_test_params{ CASE_RESAMPLE_FP16_10, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_11, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_12, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_13, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_14, 2, 5 }, resample_test_params{ CASE_RESAMPLE_I8_1, 2, 5 }, resample_test_params{ CASE_RESAMPLE_I8_2, 2, 5 }, @@ -3106,6 +3118,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_quantize_concat, resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 }, @@ -3117,6 +3130,10 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_quantize_concat, resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_11, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_12, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_13, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_14, 3, 6 }, resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6 }, resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6 }, @@ -3157,6 +3174,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_scale_concat, 
resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 }, @@ -3168,6 +3186,10 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_scale_concat, resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 }, resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_11, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_12, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_13, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_14, 3, 6 }, resample_test_params{ CASE_RESAMPLE_I8_1, 3, 6}, resample_test_params{ CASE_RESAMPLE_I8_2, 3, 6}, @@ -5594,6 +5616,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gather_scale_activation, gather_test_params{ CASE_GATHER_5D_FP16_5, 2, 4 }, }), ); + /* ----------------------------------------------------------------------------------------------------- */ /* ------------------------------------------ ScatterUpdate cases --------------------------------------------- */ /* ----------------------------------------------------------------------------------------------------- */ @@ -5855,7 +5878,7 @@ class ScatterElementsUpdatePrimitiveFusingTest : public ::BaseFusingTestengine, this->topology_non_fused, bo_not_fused); network network_fused(this->engine, this->topology_fused, bo_fused); network_fused.set_input_data("input", input_prim); @@ -5942,8 +5965,8 @@ TEST_P(scatter_elements_update_scale_activation_eltwise, basic) { auto p = GetParam(); create_topologies(input_layout("input", get_input_layout(p)), data("scatter_elements_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), - data("scatter_elements_update_updates", get_mem(get_updates_layout(p), 0, 100)), - data("scale_data", get_mem(get_per_channel_layout(p), -3, 3)), + data("scatter_elements_update_updates", get_mem(get_updates_layout(p), 0, 5)), + data("scale_data", get_mem(get_per_channel_layout(p), -1, 1)), data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.input_shape})), scatter_elements_update("scatter_elements_update_prim", "input", "scatter_elements_update_indices", "scatter_elements_update_updates", p.axis), activation("activation", "scatter_elements_update_prim", activation_func::abs), @@ -5951,7 +5974,7 @@ TEST_P(scatter_elements_update_scale_activation_eltwise, basic) { eltwise("eltwise", {"scale", "eltwise_data"}, eltwise_mode::sum, p.data_type), reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) ); - tolerance = 1.0f; + tolerance = 1e-2f; execute(p); } @@ -7749,7 +7772,7 @@ TEST_P(scatter_nd_update_scale_activation_eltwise, basic) { create_topologies(input_layout("input", get_input_layout(p)), data("scatter_nd_update_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices)), data("scatter_nd_update_updates", get_mem(get_updates_layout(p), 0, 100)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + data("scale_data", get_mem(get_per_channel_layout(p), -1, 1)), data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.input_shape })), scatter_nd_update("scatter_nd_update_prim", "input", "scatter_nd_update_indices", "scatter_nd_update_updates", p.indices_rank), activation("activation", "scatter_nd_update_prim", activation_func::abs), @@ -7778,8 +7801,8 @@ 
INSTANTIATE_TEST_CASE_P(fusings_gpu, scatter_nd_update_scale_activation_eltwise, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_6, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_7, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_8, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_9, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_8, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 5 }, @@ -7808,3 +7831,199 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, scatter_nd_update_scale_activation_eltwise, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_5, 2, 5 }, scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_6, 2, 5 }, }), ); + + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------------------ GatherND cases ------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +struct gather_nd_test_params { + data_types data_type; + + format input_format; + tensor input_shape; + + format indices_format; + tensor indices_shape; + + format output_format; + tensor output_shape; + + int max_number_in_indices; + int indices_rank; + int batch_dims; + + data_types default_type; + format default_format; + + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +#define CASE_GATHER_ND_FP16_4D_1 data_types::f16, format::bfyx, {6, 7, 9, 8}, format::bfyx, {3, 1, 1, 1}, format::bfyx, {3, 7, 9, 8}, 6, 2, 0, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_4D_2 data_types::f16, format::bfyx, {6, 7, 9, 8}, format::bfyx, {6, 1, 1, 1}, format::bfyx, {6, 8, 1, 9}, 6, 2, 1, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_4D_3 data_types::f16, format::bfyx, {5, 4, 7, 2}, format::bfyx, {5, 4, 1, 2}, format::bfyx, {40, 1, 1, 1}, 6, 4, 3, data_types::f16, format::bfyx + +#define CASE_GATHER_ND_FP16_5D_1 data_types::f16, format::bfzyx, {5, 6, 7, 8, 5}, format::bfyx, {5, 1, 1, 1}, format::bfzyx, {5, 6, 7, 8, 5}, 5, 2, 0, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_2 data_types::f16, format::bfzyx, {5, 6, 7, 8, 5}, format::bfyx, {5, 1, 1, 1}, format::bfyx, {5, 5, 7, 8}, 5, 2, 1, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_3 data_types::f16, format::bfzyx, {5, 4, 7, 8, 5}, format::bfyx, {5, 4, 1, 3}, format::bfyx, {20, 1, 1, 1}, 4, 3, 2, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_4 data_types::f16, format::bfzyx, {5, 4, 7, 8, 3}, format::bfyx, {5, 4, 1, 3}, format::bfyx, {60, 7, 1, 1}, 4, 4, 3, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_5 data_types::f16, format::bfzyx, {5, 4, 7, 2, 3}, format::bfzyx, {5, 4, 1, 2, 3}, format::bfyx, {120, 1, 1, 1}, 4, 5, 4, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_6 data_types::f16, format::bfzyx, {5, 4, 7, 4, 4}, format::bfzyx, {5, 4, 1, 1, 3}, format::bfzyx, {20, 3, 7, 4, 1}, 4, 5, 2, data_types::f16, format::bfyx + +#define CASE_GATHER_ND_FP16_6D_1 data_types::f16, format::bfwzyx, {5, 4, 6, 7, 8, 5}, format::bfyx, {5, 4, 2, 2}, format::bfyx, {20, 2, 6, 7}, 5, 4, 2, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_6D_2 data_types::f16, format::bfwzyx, {5, 4, 6, 7, 8, 2}, 
format::bfyx, {5, 4, 2, 2}, format::bfyx, {40, 6, 1, 1}, 5, 4, 3, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_6D_3 data_types::f16, format::bfwzyx, {5, 4, 6, 7, 2, 2}, format::bfzyx, {5, 4, 1, 2, 2}, format::bfyx, {80, 6, 1, 1}, 5, 5, 4, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_6D_4 data_types::f16, format::bfwzyx, {5, 4, 6, 3, 2, 2}, format::bfwzyx, {5, 4, 1, 3, 2, 2}, format::bfyx, {240, 1, 1, 1}, 5, 6, 5, data_types::f16, format::bfyx + +#define CASE_GATHER_ND_FP32_4D_1 data_types::f32, format::bfyx, {6, 7, 9, 8}, format::bfyx, {3, 1, 1, 1}, format::bfyx, {3, 7, 9, 8}, 6, 2, 0, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_4D_2 data_types::f32, format::bfyx, {6, 7, 9, 8}, format::bfyx, {6, 1, 1, 1}, format::bfyx, {6, 8, 1, 9}, 6, 2, 1, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_4D_3 data_types::f32, format::bfyx, {5, 4, 7, 2}, format::bfyx, {5, 4, 1, 2}, format::bfyx, {40, 1, 1, 1}, 6, 4, 3, data_types::f32, format::bfyx + +#define CASE_GATHER_ND_FP32_5D_1 data_types::f32, format::bfzyx, {5, 6, 7, 8, 5}, format::bfyx, {5, 1, 1, 1}, format::bfzyx, {5, 6, 7, 8, 5}, 5, 2, 0, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_2 data_types::f32, format::bfzyx, {5, 6, 7, 8, 5}, format::bfyx, {5, 1, 1, 1}, format::bfyx, {5, 5, 7, 8}, 5, 2, 1, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_3 data_types::f32, format::bfzyx, {5, 4, 7, 8, 5}, format::bfyx, {5, 4, 1, 3}, format::bfyx, {20, 1, 1, 1}, 4, 3, 2, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_4 data_types::f32, format::bfzyx, {5, 4, 7, 8, 3}, format::bfyx, {5, 4, 1, 3}, format::bfyx, {60, 7, 1, 1}, 4, 4, 3, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_5 data_types::f32, format::bfzyx, {5, 4, 7, 2, 3}, format::bfzyx, {5, 4, 1, 2, 3}, format::bfyx, {120, 1, 1, 1}, 4, 5, 4, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_6 data_types::f32, format::bfzyx, {5, 4, 7, 4, 4}, format::bfzyx, {5, 4, 1, 1, 3}, format::bfzyx, {20, 3, 7, 4, 1}, 4, 5, 2, data_types::f32, format::bfyx + +#define CASE_GATHER_ND_FP32_6D_1 data_types::f32, format::bfwzyx, {5, 4, 6, 7, 8, 5}, format::bfyx, {5, 4, 2, 2}, format::bfyx, {20, 2, 6, 7}, 5, 4, 2, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_6D_2 data_types::f32, format::bfwzyx, {5, 4, 6, 7, 8, 2}, format::bfyx, {5, 4, 2, 2}, format::bfyx, {40, 6, 1, 1}, 5, 4, 3, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_6D_3 data_types::f32, format::bfwzyx, {5, 4, 6, 7, 2, 2}, format::bfzyx, {5, 4, 1, 2, 2}, format::bfyx, {80, 6, 1, 1}, 5, 5, 4, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_6D_4 data_types::f32, format::bfwzyx, {5, 4, 6, 3, 2, 2}, format::bfwzyx, {5, 4, 1, 3, 2, 2}, format::bfyx, {240, 1, 1, 1}, 5, 6, 5, data_types::f32, format::bfyx + + + +class GatherNDPrimitiveFusingTest : public ::BaseFusingTest { +public: + void execute(gather_nd_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(gather_nd_test_params& p) { + return layout{ p.data_type, p.input_format, p.input_shape }; + } + + layout get_indices_layout(gather_nd_test_params& p) { + return layout{ p.data_type, p.indices_format, 
p.indices_shape }; + } + + layout get_output_layout(gather_nd_test_params& p) { + return layout{ p.data_type, p.output_format, p.output_shape }; + } + + layout get_per_channel_layout(gather_nd_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.output_shape.feature[0], 1, 1} }; + } +}; + +class gather_nd_quantize : public GatherNDPrimitiveFusingTest {}; +TEST_P(gather_nd_quantize, basic) { + auto p = GetParam(); + create_topologies(input_layout("input", get_input_layout(p)), + data("gather_nd_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices - 1)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + gather_nd("gather_nd_prim", "input", "gather_nd_indices", p.indices_rank, p.batch_dims), + quantize("quantize", "gather_nd_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, gather_nd_quantize, + ::testing::ValuesIn(std::vector{ + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_3, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_4, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_5, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_6, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_4, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_3, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_4, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_5, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_6, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_4, 2, 3 }, +}), ); + +class gather_nd_activation_scale_eltwise : public GatherNDPrimitiveFusingTest {}; +TEST_P(gather_nd_activation_scale_eltwise, basic) { + auto p = GetParam(); + + create_topologies(input_layout("input", get_input_layout(p)), + data("gather_nd_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices - 1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), + data("eltwise_data", get_mem(get_output_layout(p))), + gather_nd("gather_nd_prim", "input", "gather_nd_indices", p.indices_rank, p.batch_dims), + activation("activation", "gather_nd_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), + 
reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, gather_nd_activation_scale_eltwise, + ::testing::ValuesIn(std::vector{ + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_3, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_4, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_5, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_6, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_4, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_3, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_4, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_5, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_6, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_4, 2, 5 }, +}), ); + diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/gather_nd_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/gather_nd_gpu_test.cpp new file mode 100644 index 00000000000000..2999969928772b --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/gather_nd_gpu_test.cpp @@ -0,0 +1,730 @@ +// Copyright (c) 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include + +#include +#include +#include +#include +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +inline void DoTest(const engine& engine, + const cldnn::memory& input0, + const cldnn::memory& input1, + const std::vector& expected_results, + const int indices_rank, + const int batch_dims) { + topology topology; + topology.add(input_layout("InputData", input0.get_layout())); + topology.add(input_layout("InputIndices", input1.get_layout())); + topology.add( + gather_nd("gather_nd", "InputData", "InputIndices", indices_rank, batch_dims) + ); + + network network(engine, topology); + + network.set_input_data("InputData", input0); + network.set_input_data("InputIndices", input1); + auto outputs = network.execute(); + auto output = outputs.at("gather_nd").get_memory(); + auto output_ptr = output.pointer(); + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + +TEST(gather_nd_gpu_fp16, d23322_i231312_ir6_batch2) { + const auto& engine = get_test_engine(); + + const int indices_rank = 6; + const int batch_dims = 2; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 2, 3, 2, 2, 3 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfwzyx, { 2, 3, 2, 1, 3, 1 } }); // indices + // expected output dim: {6,1,3,1,2} + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), + FLOAT16(1), FLOAT16(0), FLOAT16(2), FLOAT16(0), FLOAT16(2), FLOAT16(0), + FLOAT16(0), FLOAT16(1), FLOAT16(0), FLOAT16(1), FLOAT16(0), FLOAT16(1), + + FLOAT16(2), FLOAT16(0), FLOAT16(1), FLOAT16(0), FLOAT16(1), FLOAT16(0), + FLOAT16(1), FLOAT16(1), FLOAT16(2), FLOAT16(1), FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(0), FLOAT16(1), FLOAT16(0), FLOAT16(2), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(15), FLOAT16(16), FLOAT16(11), FLOAT16(12), FLOAT16(11), FLOAT16(12), + FLOAT16(25), FLOAT16(26), FLOAT16(23), FLOAT16(24), FLOAT16(23), FLOAT16(24), + FLOAT16(33), FLOAT16(34), FLOAT16(33), FLOAT16(34), FLOAT16(33), FLOAT16(34), + + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(15), FLOAT16(16), + FLOAT16(21), FLOAT16(22), FLOAT16(25), FLOAT16(26), FLOAT16(25), FLOAT16(26), + FLOAT16(31), FLOAT16(32), FLOAT16(35), FLOAT16(36), FLOAT16(33), FLOAT16(34), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, 
batch_dims); +} + +TEST(gather_nd_gpu_fp16, d231322_i231321_ir6_batch5) { + const auto& engine = get_test_engine(); + + const int indices_rank = 6; + const int batch_dims = 5; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfwzyx, { 2, 3, 2, 2, 3, 1 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfwzyx, { 2, 3, 1, 2, 3, 1 } }); // indices + // expected output dim: {36} + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(21), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(29), FLOAT16(20), FLOAT16(27), FLOAT16(28), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(37), FLOAT16(38), FLOAT16(39), FLOAT16(30), FLOAT16(31), FLOAT16(30), + + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(17), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(29), FLOAT16(20), FLOAT16(27), FLOAT16(28), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(37), FLOAT16(38), FLOAT16(39), FLOAT16(30), FLOAT16(29), FLOAT16(30), + }); + + set_values(input1, { + FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), + FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), + FLOAT16(1), FLOAT16(0), FLOAT16(0), FLOAT16(1), FLOAT16(1), FLOAT16(0), + + FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), + FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), + FLOAT16(1), FLOAT16(0), FLOAT16(0), FLOAT16(1), FLOAT16(1), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(12), FLOAT16(14), FLOAT16(16), FLOAT16(18), FLOAT16(10), FLOAT16(18), + FLOAT16(21), FLOAT16(23), FLOAT16(25), FLOAT16(27), FLOAT16(29), FLOAT16(27), + FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(31), + + FLOAT16(12), FLOAT16(14), FLOAT16(16), FLOAT16(18), FLOAT16(10), FLOAT16(18), + FLOAT16(21), FLOAT16(23), FLOAT16(25), FLOAT16(27), FLOAT16(29), FLOAT16(27), + FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(29), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d23322_i23321_ir5_batch4) { + const auto& engine = get_test_engine(); + + const int indices_rank = 5; + const int batch_dims = 4; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 2, 3, 2, 2, 3 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 2, 3, 1, 2, 3 } }); // indices + // expected output dim: {36} + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(21), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(29), FLOAT16(20), FLOAT16(27), FLOAT16(28), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(37), FLOAT16(38), FLOAT16(39), FLOAT16(30), FLOAT16(31), FLOAT16(30), + + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(10), FLOAT16(17), FLOAT16(18), 
+ FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(29), FLOAT16(20), FLOAT16(27), FLOAT16(28), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(37), FLOAT16(38), FLOAT16(39), FLOAT16(30), FLOAT16(29), FLOAT16(30), + }); + + set_values(input1, { + FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), + FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), + FLOAT16(1), FLOAT16(0), FLOAT16(0), FLOAT16(1), FLOAT16(1), FLOAT16(0), + + FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), + FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), FLOAT16(0), + FLOAT16(1), FLOAT16(0), FLOAT16(0), FLOAT16(1), FLOAT16(1), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(12), FLOAT16(14), FLOAT16(16), FLOAT16(18), FLOAT16(10), FLOAT16(18), + FLOAT16(21), FLOAT16(23), FLOAT16(25), FLOAT16(27), FLOAT16(29), FLOAT16(27), + FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(31), + + FLOAT16(12), FLOAT16(14), FLOAT16(16), FLOAT16(18), FLOAT16(10), FLOAT16(18), + FLOAT16(21), FLOAT16(23), FLOAT16(25), FLOAT16(27), FLOAT16(29), FLOAT16(27), + FLOAT16(32), FLOAT16(33), FLOAT16(35), FLOAT16(38), FLOAT16(30), FLOAT16(29), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d23223_i2321_ir4_batch3) { + const auto& engine = get_test_engine(); + + const int indices_rank = 4; + const int batch_dims = 3; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 2, 3, 3, 2, 2 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 1, 2 } }); // indices + // expected output dim: {2*3*2,3} + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28),FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), + FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36),FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18),FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28),FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), + FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36),FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + }); + + set_values(input1, { + FLOAT16(1), FLOAT16(1), + FLOAT16(1), FLOAT16(0), + FLOAT16(1), FLOAT16(1), + + FLOAT16(0), FLOAT16(0), + FLOAT16(0), FLOAT16(1), + FLOAT16(0), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(16), FLOAT16(17), FLOAT16(18), + FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(25), + FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(17), FLOAT16(18), FLOAT16(15), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(26), FLOAT16(27), FLOAT16(28), + FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(35), FLOAT16(36), FLOAT16(33), + }; + + DoTest(engine, input0, input1, expected_results, 
indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d2342_i2312_ir4_batch2) { + const auto& engine = get_test_engine(); + + const int indices_rank = 4; + const int batch_dims = 2; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 2, 4 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 2, 1 } }); // indices + // expected output dim: {6,1} + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), + FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), + FLOAT16(29), FLOAT16(30), FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + }); + + set_values(input1, { + FLOAT16(1), FLOAT16(1), + FLOAT16(0), FLOAT16(0), + FLOAT16(2), FLOAT16(1), + + FLOAT16(0), FLOAT16(0), + FLOAT16(2), FLOAT16(1), + FLOAT16(2), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(14), + FLOAT16(21), + FLOAT16(34), + + FLOAT16(11), + FLOAT16(26), + FLOAT16(33), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d234_i2311_ir4_batch2) { + const auto& engine = get_test_engine(); + + const int indices_rank = 4; + const int batch_dims = 2; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 1, 4 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 1, 1 } }); // indices + // expected output dim: {6,1,1} + + set_values(input0, { + FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), + FLOAT16(5), FLOAT16(6), FLOAT16(7), FLOAT16(8), + FLOAT16(9), FLOAT16(10), FLOAT16(11), FLOAT16(12), + + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(20), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), + + }); + + set_values(input1, { + FLOAT16(1), + FLOAT16(0), + FLOAT16(2), + + FLOAT16(0), + FLOAT16(2), + FLOAT16(2), + }); + + std::vector expected_results = { + FLOAT16(2), + FLOAT16(5), + FLOAT16(11), + + FLOAT16(13), + FLOAT16(19), + FLOAT16(23), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d234_i21_ir2_batch1) { + const auto& engine = get_test_engine(); + + const int indices_rank = 2; + const int batch_dims = 1; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 3, 1, 4 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices + // expected output dim: {2,4} + + set_values(input0, { + FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), + FLOAT16(5), FLOAT16(6), FLOAT16(7), FLOAT16(8), + FLOAT16(9), FLOAT16(10), FLOAT16(11), FLOAT16(12), + + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(17), FLOAT16(18), FLOAT16(19), FLOAT16(20), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), + + }); + + set_values(input1, { + FLOAT16(1), + FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(5), FLOAT16(6), FLOAT16(7), FLOAT16(8), + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + }; + + DoTest(engine, input0, input1, 
expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch1) { + const auto& engine = get_test_engine(); + + const int indices_rank = 2; + const int batch_dims = 1; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices + // expected output dim: 2 + + set_values(input0, { + FLOAT16(1), FLOAT16(2), + FLOAT16(3), FLOAT16(4), + }); + + set_values(input1, { + FLOAT16(1), + FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(2), + FLOAT16(3), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d3223_i321113_ir6_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 6; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 3, 2 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfwzyx, { 3, 2, 3, 1, 1, 1 } }); // indices + // expected output dim: 321113 + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), + + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + FLOAT16(61), FLOAT16(62), FLOAT16(63), FLOAT16(64), FLOAT16(65), FLOAT16(66), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(1), FLOAT16(1), + FLOAT16(1), FLOAT16(0), FLOAT16(0), + + FLOAT16(0), FLOAT16(1), FLOAT16(0), + FLOAT16(2), FLOAT16(0), FLOAT16(1), + + FLOAT16(1), FLOAT16(1), FLOAT16(0), + FLOAT16(0), FLOAT16(0), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(64), FLOAT16(65), FLOAT16(66), + FLOAT16(31), FLOAT16(32), FLOAT16(33), + + FLOAT16(21), FLOAT16(22), FLOAT16(23), + FLOAT16(54), FLOAT16(55), FLOAT16(56), + + FLOAT16(41), FLOAT16(42), FLOAT16(43), + FLOAT16(11), FLOAT16(12), FLOAT16(13), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d3221_i32312_ir3_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 3; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 3, 2, 2, 1, 3 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices + // expected output dim: 32312 + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), + + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + FLOAT16(61), FLOAT16(62), FLOAT16(63), FLOAT16(64), FLOAT16(65), FLOAT16(66), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(1), + FLOAT16(1), FLOAT16(0), + + FLOAT16(0), FLOAT16(1), + FLOAT16(2), FLOAT16(0), + + FLOAT16(1), FLOAT16(1), + FLOAT16(0), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(61), FLOAT16(62), FLOAT16(63), FLOAT16(64), FLOAT16(65), FLOAT16(66), + FLOAT16(31), FLOAT16(32), FLOAT16(33), 
FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d3231_i32312_ir3_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 3; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 3, 2, 2, 1, 3 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 3 } }); // indices + // expected output dim: {3,2,1,2} + + set_values(input0, { + FLOAT16(11), FLOAT16(12), FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), + FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(25), FLOAT16(26), + + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + FLOAT16(61), FLOAT16(62), FLOAT16(63), FLOAT16(64), FLOAT16(65), FLOAT16(66), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(1), FLOAT16(1), + FLOAT16(1), FLOAT16(0), FLOAT16(2), + + FLOAT16(0), FLOAT16(1), FLOAT16(0), + FLOAT16(2), FLOAT16(0), FLOAT16(1), + + FLOAT16(1), FLOAT16(1), FLOAT16(2), + FLOAT16(0), FLOAT16(0), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(63), FLOAT16(64), + FLOAT16(35), FLOAT16(36), + + FLOAT16(21), FLOAT16(22), + FLOAT16(53), FLOAT16(54), + + FLOAT16(45), FLOAT16(46), + FLOAT16(11), FLOAT16(12), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d3112_i3221_ir4_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 4; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 1, 2, 1 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices + // expected output dim: {3,2,2,1,1,2} + + set_values(input0, { + FLOAT16(1), FLOAT16(2), + FLOAT16(7), FLOAT16(8), + FLOAT16(13), FLOAT16(14), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(1), + + FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(1), + + FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(1), + }); + + std::vector expected_results = { + FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8), + FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8), + + FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8), + FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8), + + FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8), + FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d311211_i322111_ir4_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 4; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfwzyx, { 3, 1, 1, 1, 2, 1 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfwzyx, { 3, 2, 1, 1, 1, 2 } }); // indices + // expected output dim: {3,2,2,1,1,2,1,1} + + set_values(input0, { + FLOAT16(1), FLOAT16(2), + 
FLOAT16(7), FLOAT16(8), + FLOAT16(13), FLOAT16(14), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(1), + + FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(1), + + FLOAT16(2), FLOAT16(1), + FLOAT16(0), FLOAT16(1), + }); + + std::vector expected_results = { + FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8), + FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8), + + FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8), + FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8), + + FLOAT16(13), FLOAT16(14), FLOAT16(7), FLOAT16(8), + FLOAT16(1), FLOAT16(2), FLOAT16(7), FLOAT16(8), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d3332_i3223_ir4_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 4; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 3, 3, 2 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 3, 2 } }); // indices + + set_values(input0, { + FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), FLOAT16(5), FLOAT16(6), + FLOAT16(7), FLOAT16(8), FLOAT16(9), FLOAT16(10), FLOAT16(11), FLOAT16(12), + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + + FLOAT16(19), FLOAT16(20), FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), + FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(29), FLOAT16(30), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + FLOAT16(61), FLOAT16(62), FLOAT16(63), FLOAT16(64), FLOAT16(65), FLOAT16(66), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(0), FLOAT16(0), FLOAT16(2), FLOAT16(2), FLOAT16(0), + FLOAT16(1), FLOAT16(0), FLOAT16(0), FLOAT16(1), FLOAT16(1), FLOAT16(0), + + FLOAT16(1), FLOAT16(0), FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(1), + FLOAT16(2), FLOAT16(0), FLOAT16(0), FLOAT16(2), FLOAT16(1), FLOAT16(0), + + FLOAT16(1), FLOAT16(1), FLOAT16(1), FLOAT16(0), FLOAT16(1), FLOAT16(1), + FLOAT16(1), FLOAT16(2), FLOAT16(1), FLOAT16(0), FLOAT16(2), FLOAT16(1), + }); + + std::vector expected_results = { + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(61), FLOAT16(62), FLOAT16(63), + FLOAT16(19), FLOAT16(20), FLOAT16(21), FLOAT16(25), FLOAT16(26), FLOAT16(27), + + FLOAT16(22), FLOAT16(23), FLOAT16(24), FLOAT16(28), FLOAT16(29), FLOAT16(30), + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(51), FLOAT16(52), FLOAT16(53), + + FLOAT16(28), FLOAT16(29), FLOAT16(30), FLOAT16(10), FLOAT16(11), FLOAT16(12), + FLOAT16(34), FLOAT16(35), FLOAT16(36), FLOAT16(16), FLOAT16(17), FLOAT16(18), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d3323_i322_ir3_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 3; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 3, 3, 2 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 2 } }); // indices + + set_values(input0, { + FLOAT16(1), FLOAT16(2), FLOAT16(3), FLOAT16(4), FLOAT16(5), FLOAT16(6), + FLOAT16(7), FLOAT16(8), FLOAT16(9), FLOAT16(10), FLOAT16(11), FLOAT16(12), + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + + FLOAT16(19), FLOAT16(20), FLOAT16(21), FLOAT16(22), 
FLOAT16(23), FLOAT16(24), + FLOAT16(25), FLOAT16(26), FLOAT16(27), FLOAT16(28), FLOAT16(29), FLOAT16(30), + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + FLOAT16(61), FLOAT16(62), FLOAT16(63), FLOAT16(64), FLOAT16(65), FLOAT16(66), + }); + + set_values(input1, { + FLOAT16(2), FLOAT16(0), + FLOAT16(2), FLOAT16(1), + + FLOAT16(1), FLOAT16(2), + FLOAT16(1), FLOAT16(0), + + FLOAT16(0), FLOAT16(1), + FLOAT16(0), FLOAT16(2), + }); + + std::vector expected_results = { + FLOAT16(41), FLOAT16(42), FLOAT16(43), FLOAT16(44), FLOAT16(45), FLOAT16(46), + FLOAT16(51), FLOAT16(52), FLOAT16(53), FLOAT16(54), FLOAT16(55), FLOAT16(56), + + FLOAT16(31), FLOAT16(32), FLOAT16(33), FLOAT16(34), FLOAT16(35), FLOAT16(36), + FLOAT16(19), FLOAT16(20), FLOAT16(21), FLOAT16(22), FLOAT16(23), FLOAT16(24), + + FLOAT16(7), FLOAT16(8), FLOAT16(9), FLOAT16(10), FLOAT16(11), FLOAT16(12), + FLOAT16(13), FLOAT16(14), FLOAT16(15), FLOAT16(16), FLOAT16(17), FLOAT16(18), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d22_i21_ir2_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 2; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 1, 1, 1 } }); // indices + + set_values(input0, { + FLOAT16(1), FLOAT16(2), + FLOAT16(3), FLOAT16(4) + }); + + set_values(input1, { + FLOAT16(1), FLOAT16(0), + }); + + std::vector expected_results = { + FLOAT16(3), FLOAT16(4), + FLOAT16(1), FLOAT16(2), + }; + + DoTest(engine, input0, input1, expected_results, indices_rank, batch_dims); +} + +TEST(gather_nd_gpu_fp16, d22_i32_ir2_batch0) { + const auto& engine = get_test_engine(); + + const int indices_rank = 2; + const int batch_dims = 0; + auto input0 = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 1, 1 } }); // data + auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 3, 2, 1, 1 } }); // indices + + set_values(input0, { + FLOAT16(1), FLOAT16(2), + FLOAT16(3), FLOAT16(4) + }); + + set_values(input1, { + FLOAT16(0), FLOAT16(0), + FLOAT16(1), FLOAT16(0), + FLOAT16(1), FLOAT16(1), + }); + + std::vector expected_results = { + FLOAT16(1), + FLOAT16(3), + FLOAT16(4), + }; + + DoTest(engine,input0, input1, expected_results, indices_rank, batch_dims); +} + diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp index 151fcd8e8ba164..1a02c57d79c0e7 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp @@ -718,7 +718,6 @@ struct resample_random_test : testing::TestWithParam pads_begin; + std::vector pads_end; +}; + +struct caffe_resample_random_test : testing::TestWithParam +{ + template + void fill_random_typed(memory& mem, int min, int max, int k) { + auto size = mem.get_layout().size; + size_t b = size.batch[0]; + size_t f = size.feature[0]; + size_t x = size.spatial[0]; + size_t y = size.spatial[1]; + + auto data = generate_random_4d(b, f, y, x, min, max, k); + auto ptr = mem.pointer(); + for (size_t bi = 0; bi < b; ++bi) { + for (size_t fi = 0; fi < f; ++fi) 
{ + for (size_t yi = 0; yi < y; ++yi) { + for (size_t xi = 0; xi < x; ++xi) { + auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0)); + auto offset = mem.get_layout().get_linear_offset(coords); + ptr[offset] = data[bi][fi][yi][xi]; + } + } + } + } + } + + void fill_random(memory& mem) { + auto dt = mem.get_layout().data_type; + switch (dt) { + case data_types::f32: + fill_random_typed(mem, -127, 127, 2); + break; + case data_types::f16: + fill_random_typed(mem, -127, 127, 2); + break; + case data_types::i8: + fill_random_typed(mem, -127, 127, 1); + break; + case data_types::u8: + fill_random_typed(mem, 0, 255, 1); + break; + default: + break; + } + } + + template + bool compare_outputs(const memory& out_ref, const memory& out_opt) { + auto output_lay = out_ref.get_layout(); + auto opt_output_lay = out_opt.get_layout(); + + size_t b = output_lay.size.batch[0]; + size_t f = output_lay.size.feature[0]; + size_t x = output_lay.size.spatial[0]; + size_t y = output_lay.size.spatial[1]; + auto ref_ptr = out_ref.pointer(); + auto opt_ptr = out_opt.pointer(); + for (size_t bi = 0; bi < b; ++bi) { + for (size_t fi = 0; fi < f; ++fi) { + for (size_t yi = 0; yi < y; ++yi) { + for (size_t xi = 0; xi < x; ++xi) { + auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0)); + auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords); + auto ref_out_val = ref_ptr[ref_out_offset]; + + auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords); + auto opt_out_val = opt_ptr[opt_out_offset]; + + EXPECT_EQ(ref_out_offset, opt_out_offset); + EXPECT_EQ(opt_out_val, ref_out_val); + // EXPECT_NEAR(static_cast(opt_out_val), static_cast(ref_out_val), 1.e-1f); + } + } + } + } + + return true; + } + + void execute_compare(const caffe_resample_random_test_params& params, bool check_result) { + auto eng = cldnn::engine(); + + auto in_layout = layout(params.input_type, params.in_format, params.input_size); + auto in_mem = memory::allocate(eng, in_layout); + fill_random(in_mem); + + cldnn::topology topo; + topo.add(input_layout("in", in_layout)); + auto prim = resample("resample", "in", params.output_size, params.num_filter, params.operation_type); + prim.align_corners = params.align_corners; + prim.pads_begin = params.pads_begin; + prim.pads_end = params.pads_end; + topo.add(prim); + + auto build_opts = build_options(); + build_opts.set_option(build_option::outputs({"resample"})); + build_opts.set_option(build_option::force_implementations({ {"resample", {params.in_format, "resample_ref"}} })); + + auto net = network(eng, topo, build_opts); + net.set_input_data("in", in_mem); + + auto result = net.execute(); + auto output = result.at("resample").get_memory(); + + // Execute resample_opt + auto eng_opt = cldnn::engine(); + + cldnn::topology topo_opt; + topo_opt.add(input_layout("in", in_layout)); + auto prim_opt = resample("resample_opt", "in", params.output_size, params.num_filter, params.operation_type); + prim_opt.align_corners = params.align_corners; + prim_opt.pads_begin = params.pads_begin; + prim_opt.pads_end = params.pads_end; + topo_opt.add(prim_opt); + + auto build_opts_opt = build_options(); + build_opts_opt.set_option(build_option::outputs({"resample_opt"})); + build_opts.set_option(build_option::force_implementations({ {"resample_opt", {params.in_format, "resample_opt"}} })); + + auto net_opt = network(eng_opt, topo_opt, build_opts_opt); + + // Use in_mem from ref network + net_opt.set_input_data("in", in_mem); + + auto result_opt = net_opt.execute(); + 
auto output_opt = result_opt.at("resample_opt").get_memory(); + + if (check_result == true) { + // Check data_types + if (params.input_type == data_types::f32) { + compare_outputs(output, output_opt); + } else if (params.input_type == data_types::f16) { + compare_outputs(output, output_opt); + } else if (params.input_type == data_types::i8) { + compare_outputs(output, output_opt); + } else if (params.input_type == data_types::u8) { + compare_outputs(output, output_opt); + } else { + FAIL() << "Not supported data type: " << static_cast(params.input_type); + } + } + } +}; + +struct caffe_resample_random_test_param_generator : std::vector { + caffe_resample_random_test_param_generator& add(caffe_resample_random_test_params params) { + push_back(params); + return *this; + } + + caffe_resample_random_test_param_generator& smoke_params(data_types type, format::type input_format, format::type output_format) { + push_back(caffe_resample_random_test_params{ type, {1, 512, 16, 16}, {1, 512, 32, 32}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + push_back(caffe_resample_random_test_params{ type, {1, 512, 32, 32}, {1, 512, 16, 16}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + push_back(caffe_resample_random_test_params{ type, {1, 24, 32, 32}, {1, 24, 64, 64}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + push_back(caffe_resample_random_test_params{ type, {1, 24, 96, 96}, {1, 24, 32, 32}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + push_back(caffe_resample_random_test_params{ type, {1, 8, 64, 64}, {1, 8, 32, 32}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + push_back(caffe_resample_random_test_params{ type, {1, 20, 10, 10}, {1, 20, 20, 20}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + push_back(caffe_resample_random_test_params{ type, {1, 20, 20, 20}, {1, 20, 10, 10}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {}, {}}); + // Padding applied + push_back(caffe_resample_random_test_params{ type, {1, 96, 16, 16}, {1, 96, 32, 32}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {0, 0, 1, 1}, {0, 0, 1, 1}}); + push_back(caffe_resample_random_test_params{ type, {1, 96, 32, 32}, {1, 96, 16, 16}, 1, resample_type::caffe_bilinear, 1, input_format, output_format, {0, 0, 1, 1}, {0, 0, 1, 1}}); + return *this; + } +}; + +TEST_P(caffe_resample_random_test, random) { + auto param = GetParam(); + execute_compare(param, true); +} + +INSTANTIATE_TEST_CASE_P(caffe_smoke_caffe_fsv16, + caffe_resample_random_test, + testing::ValuesIn( + caffe_resample_random_test_param_generator() + .smoke_params(data_types::f32, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16) + .smoke_params(data_types::f16, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16) + ), ); + +INSTANTIATE_TEST_CASE_P(caffe_smoke_caffe_fsv32, + caffe_resample_random_test, + testing::ValuesIn( + caffe_resample_random_test_param_generator() + .smoke_params(data_types::f16, format::fs_b_yx_fsv32, format::fs_b_yx_fsv32) + ), ); + TEST(resample_gpu, interpolate_in2x2x3x2_nearest1) { // Input : 2x2x3x2 // Output : 2x2x6x4 diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp index 73656ed61cdce4..b9ab89cf767a02 100644 --- a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp +++ 
b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp @@ -32,7 +32,8 @@ typedef unsigned short ushort; #define CV_32S 4 #define CV_32F 5 #define CV_64F 6 -#define CV_USRTYPE1 7 +#define CV_16F 7 +#define CV_USRTYPE1 8 #define CV_MAT_DEPTH_MASK (CV_DEPTH_MAX - 1) #define CV_MAT_DEPTH(flags) ((flags) & CV_MAT_DEPTH_MASK) @@ -70,6 +71,13 @@ typedef unsigned short ushort; #define CV_32SC4 CV_MAKETYPE(CV_32S,4) #define CV_32SC(n) CV_MAKETYPE(CV_32S,(n)) + +#define CV_16FC1 CV_MAKETYPE(CV_16F,1) +#define CV_16FC2 CV_MAKETYPE(CV_16F,2) +#define CV_16FC3 CV_MAKETYPE(CV_16F,3) +#define CV_16FC4 CV_MAKETYPE(CV_16F,4) +#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n)) + #define CV_32FC1 CV_MAKETYPE(CV_32F,1) #define CV_32FC2 CV_MAKETYPE(CV_32F,2) #define CV_32FC3 CV_MAKETYPE(CV_32F,3) diff --git a/inference-engine/thirdparty/mkl-dnn b/inference-engine/thirdparty/mkl-dnn index b5497010eb48fe..fdf537051e8d30 160000 --- a/inference-engine/thirdparty/mkl-dnn +++ b/inference-engine/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit b5497010eb48fed033d91d4499c50d797452be74 +Subproject commit fdf537051e8d30adcf56f0a56afa3cc3abddc7a4 diff --git a/inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c b/inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c index 58f58b212d56d9..961e45426b4829 100644 --- a/inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c +++ b/inference-engine/thirdparty/movidius/mvnc/src/mvnc_api.c @@ -82,8 +82,23 @@ static OVERLAPPED global_lock_overlap = { 0 }; #define GLOBAL_UNLOCK() UnlockFileEx(global_lock_fd, 0, MAXDWORD, MAXDWORD, &global_lock_overlap) #else static int global_lock_fd = -1; -#define GLOBAL_LOCK() flock(global_lock_fd, LOCK_EX) -#define GLOBAL_UNLOCK() flock(global_lock_fd, LOCK_UN) +#define GLOBAL_LOCK() \ + do { \ + CHECK_MUTEX_SUCCESS_RC(flock(global_lock_fd, LOCK_EX), NC_ERROR); \ + if (pthread_mutex_lock(&deviceOpenMutex) != 0) { \ + CHECK_MUTEX_SUCCESS(flock(global_lock_fd, LOCK_UN)); \ + return NC_ERROR; \ + } \ + } while (0) + +#define GLOBAL_UNLOCK() \ + do { \ + if (flock(global_lock_fd, LOCK_UN) != 0) { \ + CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); \ + return NC_ERROR; \ + } \ + CHECK_MUTEX_SUCCESS_RC(pthread_mutex_unlock(&deviceOpenMutex), NC_ERROR); \ + } while (0) #endif #define STRINGIFY(_text) #_text @@ -749,17 +764,10 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, } GLOBAL_LOCK(); - int error = pthread_mutex_lock(&deviceOpenMutex); - if (error) { - GLOBAL_UNLOCK(); - mvLog(MVLOG_ERROR, "pthread_mutex_lock(&deviceOpenMutex) failed with error: %d", error); - return NC_ERROR; - } if (!initialized) { ncStatus_t sc; if ((sc = initializeXLink()) != 0) { - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return sc; } @@ -775,7 +783,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, } if (rc != X_LINK_SUCCESS) { - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return parseXLinkError(NC_ERROR); } @@ -795,7 +802,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, d->wd_interval = watchdogInterval; *deviceHandlePtr = dH; } else { - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); mvLog(MVLOG_ERROR, "Memory allocation failed"); free(d); @@ -805,7 +811,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, if (d->dev_addr == NULL) { destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); 
GLOBAL_UNLOCK(); return NC_OUT_OF_MEMORY; } @@ -817,7 +822,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, if (!handler) { mvLog(MVLOG_ERROR, "Memory allocation failed"); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_OUT_OF_MEMORY; } @@ -847,7 +851,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, mvLog(MVLOG_ERROR, "Can't get firmware, error: %s", ncStatusToStr(sc)); free(handler); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_MVCMD_NOT_FOUND; } @@ -858,7 +861,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, __func__, XLinkErrorToStr(rc), d->dev_addr); free(handler); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_ERROR; } else { @@ -917,7 +919,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, mvLog(MVLOG_ERROR, "Can't get firmware, error: %s", ncStatusToStr(sc)); free(handler); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_MVCMD_NOT_FOUND; } @@ -1017,7 +1018,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, } free(handler); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_ERROR; } @@ -1028,7 +1028,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, mvLog(MVLOG_ERROR, "Failed connection to device (%s) with error %d", d->dev_addr, rc); free(handler); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return parseXLinkError(rc); } @@ -1041,7 +1040,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, if (d->dev_addr == NULL || d->dev_addr_booted == NULL || d->xlink == NULL) { mvLog(MVLOG_ERROR, "device is invalid"); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_INVALID_HANDLE; } @@ -1049,11 +1047,10 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, devices = d; mvLog(MVLOG_INFO, "XLinkConnect done - link Id %d\n", handler->linkId); - + int error = 0; if ((error = pthread_mutex_init(&d->dev_data_m, NULL)) != 0) { mvLog(MVLOG_ERROR, "pthread_mutex_init (dev_data_m) failed with error: %d", error); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_ERROR; } @@ -1062,7 +1059,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, mvLog(MVLOG_ERROR, "pthread_mutex_init (dev_stream_m) failed with error: %d", error); CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_data_m)); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_ERROR; } @@ -1071,7 +1067,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_data_m)); CHECK_MUTEX_SUCCESS(pthread_mutex_destroy(&d->dev_stream_m)); destroyDeviceHandle(deviceHandlePtr); - CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); return NC_ERROR; } @@ -1087,7 +1082,6 @@ ncStatus_t ncDeviceOpen(struct ncDeviceHandle_t **deviceHandlePtr, sleepForSeconds(1); - 
CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&deviceOpenMutex)); GLOBAL_UNLOCK(); streamId_t deviceMonitorStreamId = XLinkOpenStream(d->xlink->linkId, "deviceMonitor", CONFIG_STREAM_SIZE); @@ -1941,22 +1935,24 @@ ncStatus_t checkGraphMonitorResponse(streamId_t graphMonStream) { return NC_OK; } -static void lockAllInferences() { +static ncStatus_t lockAllInferences() { + GLOBAL_LOCK(); struct _devicePrivate_t *d = devices; while (d) { CHECK_MUTEX_SUCCESS(pthread_mutex_lock(&d->graph_stream_m)); d = d->next; } - return; + return NC_OK; } -static void unlockAllInferences() { +static ncStatus_t unlockAllInferences() { struct _devicePrivate_t *d = devices; while (d) { CHECK_MUTEX_SUCCESS(pthread_mutex_unlock(&d->graph_stream_m)); d = d->next; } - return; + GLOBAL_UNLOCK(); + return NC_OK; } ncStatus_t ncGraphAllocate(struct ncDeviceHandle_t * deviceHandle, @@ -2006,7 +2002,12 @@ ncStatus_t ncGraphAllocate(struct ncDeviceHandle_t * deviceHandle, return NC_OUT_OF_MEMORY; } - lockAllInferences(); + rc = lockAllInferences(); + if (rc != 0) { + mvLog(MVLOG_ERROR, "can't lock all inferences"); + unlockAllInferences(); + return rc; + } g->id = graphIdCount++; streamId_t streamId; @@ -2171,7 +2172,11 @@ ncStatus_t ncGraphAllocate(struct ncDeviceHandle_t * deviceHandle, g->debug_buffer = g->aux_buffer; g->time_taken = (float *) (g->aux_buffer + 120); - unlockAllInferences(); + rc = unlockAllInferences(); + if (rc != 0) { + mvLog(MVLOG_ERROR, "Can't unlock all inferences"); + return rc; + } GLOBAL_LOCK(); g->dev = d; diff --git a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp index 80175b96a95428..9c0056265720a6 100644 --- a/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp +++ b/inference-engine/thirdparty/ocv/opencv_hal_neon.hpp @@ -2426,6 +2426,42 @@ CV_ALWAYS_INLINE v_uint8x16 v_gather_lines(const uchar src[], const short* mapsx return v_uint8x16(vreinterpretq_u8_s32(result)); } +CV_ALWAYS_INLINE void v_gather_pairs(const float src[], const int mapsx[], const int x, + v_float32x4& low, v_float32x4& high) +{ +#if defined(__aarch64__) + float64x2_t l = {}; + l = vsetq_lane_f64(*reinterpret_cast(&src[mapsx[x]]), l, 0); + l = vsetq_lane_f64(*reinterpret_cast(&src[mapsx[x + 1]]), l, 1); + low.val = vreinterpretq_f32_f64(l); + + float64x2_t h = {}; + h = vsetq_lane_f64(*reinterpret_cast(&src[mapsx[x + 2]]), h, 0); + h = vsetq_lane_f64(*reinterpret_cast(&src[mapsx[x + 3]]), h, 1); + high.val = vreinterpretq_f32_f64(h); +#else + float32x4_t l = {}; + l = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x]]), l, 0); + l = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x] + 1]), l, 1); + l = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x + 1]]), l, 2); + l = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x + 1] + 1]), l, 3); + low.val = l; + + float32x4_t h = {}; + h = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x + 2]]), h, 0); + h = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x + 2] + 1]), h, 1); + h = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x + 3]]), h, 2); + h = vsetq_lane_f32(*reinterpret_cast(&src[mapsx[x + 3] + 1]), h, 3); + high.val = h; +#endif + + return; +} + +CV_ALWAYS_INLINE v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) { + return v_fma(a, v_setall_f32(b), c); +} + template CV_ALWAYS_INLINE v_uint8x16 v_blend(const v_uint8x16& a, const v_uint8x16& b) { @@ -2473,6 +2509,18 @@ CV_ALWAYS_INLINE v_uint8x16 v_shuffle(const v_uint8x16& a, const v_uint8x16& mas #endif } +CV_ALWAYS_INLINE void v_deinterleave(const v_float32x4& low, 
const v_float32x4& high, + v_float32x4& even, v_float32x4& odd) { + float32x4x2_t p1 = vzipq_f32(low.val, high.val); + float32x4_t tmp0 = p1.val[0]; + float32x4_t tmp1 = p1.val[1]; + + float32x4x2_t p2 = vzipq_f32(tmp0, tmp1); + even.val = p2.val[0]; + odd.val = p2.val[1]; + return; +} + CV_ALWAYS_INLINE void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1, const v_uint8x16& i2, const v_uint8x16& i3, v_uint8x16& res0, v_uint8x16& res1, diff --git a/inference-engine/tools/cross_check_tool/cross_check_tool.py b/inference-engine/tools/cross_check_tool/cross_check_tool.py index 83ccc4f7abc063..c2d504e8e33b98 100755 --- a/inference-engine/tools/cross_check_tool/cross_check_tool.py +++ b/inference-engine/tools/cross_check_tool/cross_check_tool.py @@ -15,14 +15,14 @@ from openvino.inference_engine import IENetwork, IECore except Exception as e: exception_type = type(e).__name__ - print("The following error happened while importing Python API module:\n[ {} ] {}".format(exception_type, e)) + print(f"The following error happened while importing Python API module:\n[ {exception_type} ] {e}") sys.exit(1) try: import ngraph as ng except Exception as e: exception_type = type(e).name - print("The following error happened while importing nGraph module:\n[ {} ] {}".format(exception_type, e)) + print(f"The following error happened while importing nGraph module:\n[ {exception_type} ] {e}") sys.exit(1) from utils import get_config_dictionary, get_layers_list, print_output_layers, input_processing, \ @@ -36,7 +36,7 @@ ### -@error_handling('plugin of \'{plugin.device}\' device config \'{config}\' loading') +@error_handling('plugin of \'{device}\' device config \'{config}\' loading') def set_plugin_config(core: IECore, device: str, config: str = None): core.set_config(get_config_dictionary(config_file=config), device_name=device) @@ -68,7 +68,7 @@ def get_net(model: str, core: IECore): return net -@error_handling('loading network to plugin of {plugin.device} device') +@error_handling('loading network to plugin of {device} device') def get_exec_net(core, net, device): return core.load_network(network=net, device_name=device) @@ -100,17 +100,17 @@ def get_model_info(net: IENetwork): ### -@error_handling('processing inference on \'{device}\' device') +@error_handling('processing inference') def get_infer_results(executable_network, inputs: dict): return executable_network.infer(inputs=inputs) -@error_handling('getting performance counts from executable network on \'{device}\' device') +@error_handling('getting performance counts from executable network') def get_perf_counts(executable_network): return executable_network.requests[0].get_perf_counts() -@error_handling('getting inference results for outputs: \'{output}\'') +@error_handling('getting inference results for outputs: \'{output}\' on \'{device}\' device') def infer(net: IENetwork, core: IECore, device: str, inputs: dict, output: list): executable_network = get_exec_net(core=core, net=net, device=device) infer_dict = get_infer_results(executable_network=executable_network, inputs=inputs) @@ -120,7 +120,7 @@ def infer(net: IENetwork, core: IECore, device: str, inputs: dict, output: list) result = {} for out in output: if out not in infer_dict: - log.warning("There is no '{}' layer in Inference Engine outputs results".format(out)) + log.warning(f"There is no '{out}' layer in Inference Engine outputs results") continue pc = pc[out] if out in pc else no_info_pc pc['device'] = device @@ -128,7 +128,7 @@ def infer(net: IENetwork, core: IECore, device: 
str, inputs: dict, output: list) return result -@error_handling('getting inference results for outputs: \'{output}\'') +@error_handling('getting inference results for outputs: \'{layers}\'') def overall_accuracy_check(model: str, ref_model: str, out_layers: list, ref_out_layers: list, inputs: dict, ref_inputs: dict, core: IECore, device: str, ref_core: IECore, ref_device: str, layers: str, num_of_iterations: int): @@ -151,8 +151,8 @@ def one_ir_mode(args): core = get_plugin(args.device, args.l, args.config) net = get_net(model=args.model, core=core) net_layers, net_inputs, net_outputs = get_model_info(net) - log.info('{} vs {}'.format(args.device, args.reference_device)) - log.info('The same IR on both devices: {}'.format(args.model)) + log.info(f'{args.device} vs {args.reference_device}') + log.info(f'The same IR on both devices: {args.model}') out_layers = get_layers_list(net_layers, net_inputs, net_outputs, args.layers) print_input_layers(net_inputs) print_output_layers(out_layers) @@ -166,7 +166,7 @@ def one_ir_mode(args): ref_device=args.reference_device, layers=args.layers, num_of_iterations=args.num_of_iterations) for out_layer in out_layers: - log.info('Layer {} statistics'.format(out_layer)) + log.info(f'Layer {out_layer} statistics') net_copy = get_net_copy_with_output(model=args.model, output=out_layer, core=core) results = infer(net=net_copy, core=core, device=args.device, inputs=inputs, output=[out_layer]) if out_layer not in results: @@ -192,9 +192,9 @@ def two_ir_mode(args): net_layers, net_inputs, net_outputs = get_model_info(net) ref_net = get_net(model=args.reference_model, core=ref_core) ref_net_layers, ref_net_inputs, ref_net_outputs = get_model_info(ref_net) - log.info('{} vs {}'.format(args.device, args.reference_device)) - log.info('IR for {} : {}'.format(args.device, args.model)) - log.info('IR for {} : {}'.format(args.reference_device, args.reference_model)) + log.info(f'{args.device} vs {args.reference_device}') + log.info(f'IR for {args.device} : {args.model}') + log.info(f'IR for {args.reference_device} : {args.reference_model}') out_layers = get_layers_list(net_layers, net_inputs, net_outputs, args.layers) ref_out_layers = get_layers_list(ref_net_layers, ref_net_inputs, ref_net_outputs, args.layers) print_input_layers(net_inputs) @@ -215,9 +215,9 @@ def two_ir_mode(args): for out_layer in layers_map: ref_out_layer = layers_map[out_layer] if out_layer == ref_out_layer: - log.info('Layer {} statistics'.format(out_layer)) + log.info(f'Layer {out_layer} statistics') else: - log.info('Statistics \'{}\' vs \'{}\''.format(out_layer, ref_out_layer)) + log.info(f'Statistics \'{out_layer}\' vs \'{ref_out_layer}\'') net_copy = get_net_copy_with_output(model=args.model, output=out_layer, core=core) ref_net_copy = get_net_copy_with_output(model=args.reference_model, output=ref_out_layer, core=ref_core) results = infer(net=net_copy, core=core, device=args.device, inputs=inputs, output=[out_layer]) @@ -246,7 +246,7 @@ def dump_mode(args): inputs = input_processing(args.model, net.input_info, args.input) dump_dict = {} for out_layer in out_layers: - log.info('Layer {} processing'.format(out_layer)) + log.info(f'Layer {out_layer} processing') net_copy = get_net_copy_with_output(model=args.model, output=out_layer, core=core) results = infer(net=net_copy, core=core, device=args.device, inputs=inputs, output=[out_layer]) if out_layer not in results: @@ -258,8 +258,8 @@ def dump_mode(args): def load_mode(args): core = get_plugin(args.device, args.l, args.config) - log.info('IR for 
{} : {}'.format(args.device, args.model)) - log.info('Loading blob from {}'.format(args.load)) + log.info(f'IR for {args.device} : {args.model}') + log.info(f'Loading blob from {args.load}') net = get_net(model=args.model, core=core) net_layers, net_inputs, net_outputs = get_model_info(net) out_layers = get_layers_list(net_layers, net_inputs, net_outputs, args.layers) @@ -273,9 +273,9 @@ def load_mode(args): for out_layer in layers_map: ref_out_layer = layers_map[out_layer] if out_layer == ref_out_layer: - log.info('Layer {} statistics'.format(out_layer)) + log.info(f'Layer {out_layer} statistics') else: - log.info('Statistics \'{}\' vs \'{}\''.format(out_layer, ref_out_layer)) + log.info(f'Statistics \'{out_layer}\' vs \'{ref_out_layer}\'') net_copy = get_net_copy_with_output(model=args.model, output=out_layer, core=core) results = infer(net=net_copy, core=core, device=args.device, inputs=inputs, output=[out_layer]) if out_layer not in results: @@ -294,7 +294,7 @@ def load_mode(args): def main(args): - log.info('Inference Engine:\n API version ............ {}'.format(ie.__version__), extra={'no_lvl': True}) + log.info(f'Inference Engine:\n API version ............ {ie.__version__}', extra={'no_lvl': True}) set_verbosity(args.verbosity) mode = find_out_cct_mode(args) if mode == 1: diff --git a/inference-engine/tools/cross_check_tool/utils.py b/inference-engine/tools/cross_check_tool/utils.py index 93a6af637f047f..088f5b7973996b 100644 --- a/inference-engine/tools/cross_check_tool/utils.py +++ b/inference-engine/tools/cross_check_tool/utils.py @@ -11,15 +11,15 @@ try: import cv2 except Exception as e: - log.error("Can not import OpenCV Python package.\nPlease install required python packages by running:\n" - "pip3 install -r requirements.txt\n\n Original error message: {}".format(e)) + log.error(f"Can not import OpenCV Python package.\nPlease install required python packages by running:\n" + f"pip3 install -r requirements.txt\n\n Original error message: {e}") sys.exit(1) try: import numpy as np except Exception as e: - log.error("Can not import numpy python package.\nPlease install required python packages by running:\n" - "pip3 install -r requirements.txt\n\n Original error message: {}".format(e)) + log.error(f"Can not import numpy python package.\nPlease install required python packages by running:\n" + f"pip3 install -r requirements.txt\n\n Original error message: {e}") sys.exit(1) verbosity = False @@ -78,8 +78,7 @@ def try_except_func(*args, **kwargs): return func(*args, **kwargs) except Exception as e: exception_type = type(e).__name__ - log.error("The following error happened while {}:\n[ {} ] {}".format(desc.format(**kwargs), - exception_type, e)) + log.error(f"The following error happened while {desc.format(**kwargs)}:\n[ {exception_type} ] {e}") global verbosity if verbosity: traceback.print_tb(tb=e.__traceback__, file=sys.stdout) @@ -98,7 +97,7 @@ class ExistingFileAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): if values is not None: if not os.path.isfile(values): - log.error("File was not found: {}".format(values)) + log.error(f"File was not found: {values}") sys.exit(1) setattr(namespace, self.dest, values) @@ -111,7 +110,7 @@ class ExistingDirAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): if values is not None: if not os.path.isdir(values): - log.error("Directory was not found: {}".format(values)) + log.error(f"Directory was not found: {values}") sys.exit(1) setattr(namespace, self.dest, 
values) @@ -276,12 +275,12 @@ def find_out_cct_mode(args): def print_input_layers(inputs: list): word = 'inputs' if len(inputs) > 1 else 'input' - log.info('{} {} detected: {}'.format(len(inputs), word, ', '.join(inputs))) + log.info(f"{len(inputs)} {word} detected: {', '.join(inputs)}") def print_output_layers(outputs: list): layers = 'layers' if len(outputs) > 1 else 'layer' - log.info('Statistics will be dumped for {} {}: {}'.format(len(outputs), layers, ', '.join(outputs))) + log.info(f"Statistics will be dumped for {len(outputs)} {layers}: {', '.join(outputs)}") ### @@ -313,24 +312,21 @@ def read_multi_input_file(input_file: str, net_inputs: dict): dump = {} for net_input in net_inputs: if net_input not in files: - raise Exception('Can not find input data for input {} in multi-input file {}.\n' - 'Input data was provided for layers: {}\n' - 'Network inputs: {}'.format(net_input, input_file, ', '.join(files), - ', '.join(net_inputs.keys()))) + raise Exception(f"Can not find input data for input {net_input} in multi-input file {input_file}.\n" + f"Input data was provided for layers: {', '.join(files)}\n" + f"Network inputs: {', '.join(net_inputs.keys())}") if 'blob' in npz[net_input].item(0): just_blob = npz[net_input].item(0)['blob'] network_shape = net_inputs[net_input].input_data.shape - log.info('Layer {} shape = {}, input blob from multi-input file shape = {}' - ''.format(net_input, network_shape, just_blob.shape)) + log.info(f'Layer {net_input} shape = {network_shape}, input blob from multi-input file shape = {just_blob.shape}') try: reshaped_blob = np.reshape(just_blob, network_shape) except: - raise Exception('Can not reshape input blob from multi-input file for layer {} to shape {}' - ''.format(net_input, network_shape)) + raise Exception(f'Can not reshape input blob from multi-input file for layer {net_input} to shape {network_shape}') dump[net_input] = reshaped_blob else: raise Exception( - 'Can not find \'blob\' parameter for input {} in input file {}'.format(net_input, input_file)) + f'Can not find \'blob\' parameter for input {net_input} in input file {input_file}') return dump @@ -372,8 +368,7 @@ def input_processing(model_path: str, net_inputs: dict, input_file: str, layers_ def accuracy_metrics(out_blob, ref_out_blob): if out_blob.size != ref_out_blob.size: - raise Exception('Different number of elements in blobs {} and {}. Can not compare' - ''.format(out_blob.size, ref_out_blob.size)) + raise Exception(f'Different number of elements in blobs {out_blob.size} and {ref_out_blob.size}. 
Can not compare') abs_diff = np.absolute(out_blob - ref_out_blob) rel_diff = np.divide(abs_diff, np.min(abs_diff) if np.min(abs_diff) != 0 else 1e-20) @@ -394,9 +389,9 @@ def accuracy_metrics(out_blob, ref_out_blob): for key, value in metrics: if len(str(value)) > 5: - log.info('{:>35} : {:.5E}'.format(key, value), extra={'no_lvl': True}) + log.info(f'{key:>35} : {value:.5E}', extra={'no_lvl': True}) else: - log.info('{:>35} : {}'.format(key, value), extra={'no_lvl': True}) + log.info(f'{key:>35} : {value}', extra={'no_lvl': True}) return {metric: value for metric, value in metrics} @@ -409,7 +404,7 @@ def performance_metrics(pc, ref_pc): ] for metric, actual, reference in compare: - log.info('{:>35}: {:>16} {:>16}'.format(metric, actual, reference), extra={'no_lvl': True}) + log.info(f'{metric:>35}: {actual:>16} {reference:>16}', extra={'no_lvl': True}) def blob_counters(out_blob, ref_out_blob): @@ -420,7 +415,7 @@ def blob_counters(out_blob, ref_out_blob): ref_out_blob.size - np.count_nonzero(ref_out_blob)) ] for metric, actual, reference in counters: - log.info('{:>35}: {:>16} {:>16}'.format(metric, actual, reference), extra={'no_lvl': True}) + log.info(f'{metric:>35}: {actual:>16} {reference:>16}', extra={'no_lvl': True}) def update_global_accuracy_matrics(global_accuracy: list, current_accuracy: dict): @@ -444,12 +439,13 @@ def print_all_over_the_net_metrics(global_accuracy: (str, float), global_times: ref_global_times: list = None): if global_times is not None and ref_global_times is not None and len(global_times) and len(ref_global_times): log.info('-' * 70, extra={'no_lvl': True}) - log.info('{:>35}: {:>16,.5E} {:>16,.5E}'.format( - 'Overall performance, microseconds', global_times[len(global_times) // 2].microseconds, - ref_global_times[len(ref_global_times) // 2].microseconds), extra={'no_lvl': True}) + log.info(f'{"Overall performance, microseconds":>35}: ' + f'{global_times[len(global_times) // 2].microseconds:>16,.5E} ' + f'{ref_global_times[len(ref_global_times) // 2].microseconds:>16,.5E}', + extra={'no_lvl': True}) log.info('-' * 70, extra={'no_lvl': True}) for metric, value in global_accuracy: - log.info('{} {} = {}'.format('Overall', metric.lower(), value)) + log.info(f"Overall {metric.lower()} = {value}") ### @@ -493,9 +489,9 @@ def manage_user_outputs_with_mapping(mapping, reference_mapping, user_layers): if layer not in layers_map: if mapping is not None and reference_mapping is not None: log.warning( - 'Can not map layer {} from --model/-m to any layer from --reference_model/-ref_m'.format(layer)) + f'Can not map layer {layer} from --model/-m to any layer from --reference_model/-ref_m') else: - log.warning('Can not find layer {} in --reference_model/-ref_m model'.format(layer)) + log.warning(f'Can not find layer {layer} in --reference_model/-ref_m model') for layer in layers_map: if layer not in user_layers: del layers_map[layer] @@ -513,9 +509,9 @@ def get_layers_list(all_layers: list, inputs: dict, outputs: list, layers: str): layers_to_check = [] for user_layer in user_layers: if user_layer not in all_layers_names: - raise Exception("Layer {} doesn't exist in the model".format(user_layer)) + raise Exception(f"Layer {user_layer} doesn't exist in the model") if user_layer in inputs: - raise Exception("Layer {} is input layer. Can not proceed".format(user_layer)) + raise Exception(f"Layer {user_layer} is input layer. 
Can not proceed") if all_layers_names[user_layer].get_type_name() != 'Result': layers_to_check.append(user_layer) else: @@ -533,7 +529,7 @@ def get_layers_list(all_layers: list, inputs: dict, outputs: list, layers: str): def dump_output_file(output_file, dump_dict): np.savez_compressed(output_file, **dump_dict) - log.info('Dump file path: {}'.format(output_file)) + log.info(f'Dump file path: {output_file}') def load_dump(file_to_load: str): diff --git a/inference-engine/tools/vpu/vpu_perfcheck/main.cpp b/inference-engine/tools/vpu/vpu_perfcheck/main.cpp index c093303b47eabc..febb53e0d7ccdf 100644 --- a/inference-engine/tools/vpu/vpu_perfcheck/main.cpp +++ b/inference-engine/tools/vpu/vpu_perfcheck/main.cpp @@ -381,7 +381,7 @@ int process(const std::string& modelFileName, const std::string& inputsDir, } } - std::vector exeNetwork(num_networks); + std::vector exeNetwork(num_networks); std::map networkConfig; setConfig(networkConfig, file_config_cl); @@ -403,7 +403,7 @@ int process(const std::string& modelFileName, const std::string& inputsDir, for (int r = 0, idxPic = 0; r < num_requests; ++r) { int n = r % num_networks; - IECALL(exeNetwork[n]->CreateInferRequest(request[r], &resp)); + request[r] = exeNetwork[n].CreateInferRequest(); for (auto &input : networkInputs) { InferenceEngine::Blob::Ptr inputBlob; diff --git a/install_build_dependencies.sh b/install_build_dependencies.sh index 72fcbf0d51f3b2..ebf5293a52e226 100755 --- a/install_build_dependencies.sh +++ b/install_build_dependencies.sh @@ -37,6 +37,7 @@ if [ -f /etc/lsb-release ]; then libssl-dev \ ca-certificates \ git \ + git-lfs \ libboost-regex-dev \ $x86_64_specific_packages \ libgtk2.0-dev \ @@ -77,6 +78,7 @@ elif [ -f /etc/redhat-release ]; then libssl-dev \ ca-certificates \ git \ + git-lfs \ boost-devel \ libtool \ gcc \ diff --git a/model-optimizer/automation/package_BOM.txt b/model-optimizer/automation/package_BOM.txt index e209be46f77fe1..301cbc0808212c 100644 --- a/model-optimizer/automation/package_BOM.txt +++ b/model-optimizer/automation/package_BOM.txt @@ -11,6 +11,7 @@ extensions/back/__init__.py extensions/back/AvgPool.py extensions/back/blob_normalizer.py extensions/back/CellNormalizer.py +extensions/back/ChangeCastOutputType.py extensions/back/ClampNormalizer.py extensions/back/compress_quantized_weights.py extensions/back/ConvolutionNormalizer.py @@ -32,6 +33,7 @@ extensions/back/LayoutChangeForGatherND.py extensions/back/LeakyReLUMutation.py extensions/back/LinearToLinearONNXReplacer.py extensions/back/LRNToNorm.py +extensions/back/MarkNodesWithShapeValues.py extensions/back/MatMulNormalizer.py extensions/back/MaxPool.py extensions/back/NormalizeToNormalizeL2.py @@ -125,7 +127,6 @@ extensions/front/caffe/softmax_ext.py extensions/front/caffe/spatial_transformer_ext.py extensions/front/caffe/split_to_identity.py extensions/front/caffe/tanh.py -extensions/front/ChangeCastOutputType.py extensions/front/ChangePlaceholderTypes.py extensions/front/create_tensor_nodes.py extensions/front/disable_weights_quantize_value_propagation.py @@ -154,6 +155,7 @@ extensions/front/kaldi/add_reshape_around_pooling.py extensions/front/kaldi/apply_counts.py extensions/front/kaldi/logsoftmax_component_ext.py extensions/front/kaldi/memory_offset_adjustment.py +extensions/front/kaldi/memoryoffset_batch_update.py extensions/front/kaldi/replace_eltwise_nin1.py extensions/front/kaldi/replace_lstm_node_pattern.py extensions/front/kaldi/replace_lstm_nonlinearity.py @@ -577,6 +579,7 @@ extensions/middle/L2NormFusing.py 
extensions/middle/LayoutChangeForConstantShapePaths.py extensions/middle/LeakyReluPattern.py extensions/middle/LSTMRNNSequenceToTensorIterator.py +extensions/middle/MakeKaldiConstReshapable.py extensions/middle/MarkSubgraphsWithCorrectLayout.py extensions/middle/MoveConstToLoopBody.py extensions/middle/MulFakeQuantizeFuse.py @@ -774,6 +777,7 @@ install_prerequisites/protobuf-3.6.1-py3.6-win-amd64.egg install_prerequisites/protobuf-3.6.1-py3.7-win-amd64.egg mo.py mo/__init__.py +mo/__main__.py mo/back/__init__.py mo/back/ie_ir_ver_2/__init__.py mo/back/ie_ir_ver_2/emitter.py @@ -797,6 +801,7 @@ mo/front/caffe/python_layer_extractor.py mo/front/caffe/register_custom_ops.py mo/front/common/__init__.py mo/front/common/custom_replacement_registry.py +mo/front/common/extractors/__init__.py mo/front/common/extractors/utils.py mo/front/common/find_unsupported_ops.py mo/front/common/layout.py @@ -992,6 +997,7 @@ mo/utils/ir_engine/compare_graphs.py mo/utils/ir_engine/ir_engine.py mo/utils/ir_reader/__init__.py mo/utils/ir_reader/extender.py +mo/utils/ir_reader/extenders/__init__.py mo/utils/ir_reader/extenders/binary_convolution_extender.py mo/utils/ir_reader/extenders/bucketize_extender.py mo/utils/ir_reader/extenders/conv_extender.py diff --git a/model-optimizer/extensions/back/ChangeCastOutputType.py b/model-optimizer/extensions/back/ChangeCastOutputType.py new file mode 100644 index 00000000000000..976b6b50a29136 --- /dev/null +++ b/model-optimizer/extensions/back/ChangeCastOutputType.py @@ -0,0 +1,43 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging as log + +import numpy as np + +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph +from mo.middle.passes.convert_data_type import data_type_str_to_np + + +class ChangeCastOutputType(BackReplacementPattern): + """ + Change the Cast dst_type from fp64 to fp32 since not all plugins support the fp64 data type. + Change the Cast dst_type from fp32 to fp16 when generating IR for fp16. + But keep fp32 if the node returns a shape value, even if --data_type=FP16 (see extensions/back/MarkNodesWithShapeValues.py). + """ + enabled = True + force_shape_inference = True + + def run_after(self): + from extensions.back.MarkNodesWithShapeValues import MarkNodesWithShapeValues + return [MarkNodesWithShapeValues] + + def run_before(self): + return [] + + def find_and_replace_pattern(self, graph: Graph): + for node in graph.get_op_nodes(op='Cast'): + if node.dst_type == np.float64: + log.warning('Change data type from {} to {} for node {}'.format(node.dst_type, np.float32, node.name)) + node.dst_type = np.float32 + + ir_data_type = data_type_str_to_np(node.graph.graph['cmd_params'].data_type) + if node.dst_type == np.float32 and ir_data_type == np.float16 and not node.has_and_set('returns_shape_value'): + log.warning('Change data type from {} to {} for node {}'.format(node.dst_type, ir_data_type, node.name)) + node.dst_type = ir_data_type + elif node.has_and_set('returns_shape_value') and node.dst_type == np.float16: + # set dst_type back to FP32 for all Convert nodes with shape values + log.warning('Change data type from {} to {} for node {} in ShapeOf subgraph'. 
+ format(node.dst_type, np.float32, node.name)) + node.dst_type = np.float32 diff --git a/model-optimizer/extensions/back/InterpolateReshape.py b/model-optimizer/extensions/back/InterpolateReshape.py index 60323d1f88f2a9..cb6518b39022dd 100644 --- a/model-optimizer/extensions/back/InterpolateReshape.py +++ b/model-optimizer/extensions/back/InterpolateReshape.py @@ -15,7 +15,7 @@ class InterpolateConcat(BackReplacementPattern): - """ + r""" Replaces hard-coded 1-port input of Interpolate with reshape-able sub-graph using the following Concat inputs BEFORE: @@ -85,7 +85,7 @@ def find_and_replace_pattern(self, graph: Graph): class InterpolateReshapeWA(BackReplacementPattern): - """ + r""" Replaces hard-coded 1-port input of Interpolate with reshape-able sub-graph. WARNING: Could cause troubles if model has hard-coded Interpolate intentionally -- rare situation BEFORE: diff --git a/model-optimizer/extensions/back/MarkNodesWithShapeValues.py b/model-optimizer/extensions/back/MarkNodesWithShapeValues.py new file mode 100644 index 00000000000000..0201e31423a9d9 --- /dev/null +++ b/model-optimizer/extensions/back/MarkNodesWithShapeValues.py @@ -0,0 +1,78 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import logging as log + +import numpy as np + +from extensions.middle.MarkSubgraphsWithCorrectLayout import MarkSubGraphsWithCorrectLayout +from mo.back.replacement import BackReplacementPattern +from mo.graph.graph import Graph + + +class MarkNodesWithShapeValues(BackReplacementPattern): + """ + This transformation marks op nodes in ShapeOf subgraphs with the 'returns_shape_value' bool attribute and + data nodes of float32 constants with the 'correct_data_type' attribute. + This way float Consts and float Casts are kept in FP32 even if --data_type=FP16 is specified. + + This is needed to enable conversion to FP16 even if values in ShapeOf subgraphs exceed max(float16), + or when shape inference on some nodes is incorrect because of the lower precision of FP16 (e.g. if Interpolate in scales mode + accepts values from a ShapeOf subgraph). + + This transformation should be executed after shape inference and after all transformations which insert/modify + Cast nodes in ShapeOf subgraphs; therefore it is placed at the end of the back phase. 
+ """ + enabled = True + graph_condition = [lambda graph: graph.graph['cmd_params'].data_type == 'FP16'] + + def run_after(self): + from extensions.back.pass_separator import BackFinish + return [BackFinish] + + def run_before(self): + return [] + + @staticmethod + def get_operations_with_shape_inputs(): + return { + 'Interpolate': [1, 2], # sizes, scales inputs + 'Reshape': [1], # shape + 'Broadcast': [1], # target_shape + 'ConvBackPropData ': [2], # output_shape + 'GroupConvolutionBackpropData ': [2], # output_shape + 'BatchToSpace': [1, 2, 3], # block_shape, crops_begin, crops_end + 'SpaceToBatch': [1, 2, 3], # block_shape, pads_begin, pads_end + 'StridedSlice': [1, 2, 3], # begin, end, strides + 'VariadicSplit': [2], # split_lengths + 'Tile': [1], # repeats input + 'TopK': [1], # K input + 'Pad': [1, 2], # pads_begin, pads_end + 'Range': [0, 1, 2], # start, stop, step inputs + 'OneHot': [1], # depth input + } + + def find_and_replace_pattern(self, graph: Graph): + shape_input_ops_map = self.get_operations_with_shape_inputs() + + nodes_with_shape_inputs = [] + for node in graph.get_op_nodes(): + if node.soft_get('type') in shape_input_ops_map: + nodes_with_shape_inputs.append(node) + + start_nodes = [] + for node in nodes_with_shape_inputs: + start_nodes.extend( + [node.in_port(port_idx).get_source().node for port_idx in shape_input_ops_map[node.type] if + node.is_in_port_connected(port_idx)]) + + condition = lambda node: node.soft_get('type') != 'ShapeOf' + nodes_with_shape_values = MarkSubGraphsWithCorrectLayout.bfs(start_nodes, set(), condition, forward=False) + for node in nodes_with_shape_values: + node['returns_shape_value'] = True + if node.soft_get('type') == 'Const': + if node.value.dtype == np.float32: + node.out_node(0)['correct_data_type'] = True + elif node.value.dtype in [np.float16, np.float64]: + log.debug('Const nodes {} with shape values have {} type'.format(node.soft_get('name', node.id), + node.value.dtype)) diff --git a/model-optimizer/extensions/back/MarkNodesWithShapeValues_test.py b/model-optimizer/extensions/back/MarkNodesWithShapeValues_test.py new file mode 100644 index 00000000000000..0d33f7b6e21dae --- /dev/null +++ b/model-optimizer/extensions/back/MarkNodesWithShapeValues_test.py @@ -0,0 +1,103 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import unittest + +import numpy as np + +from extensions.back.MarkNodesWithShapeValues import MarkNodesWithShapeValues +from mo.front.common.partial_infer.utils import int64_array, float32_array +from mo.graph.graph import Node +from mo.utils.ir_engine.compare_graphs import compare_graphs +from mo.utils.unittest.graph import build_graph +from mo.utils.unittest.graph import result, regular_op_with_empty_data, \ + shaped_const_with_data, connect, regular_op + + +class TestMarkDataTypeInShapeOfSubgraphs(unittest.TestCase): + + def test_run_with_shape_subgraph_input(self): + inp_shape = (1, 3, 1000, 1000) + dst_type = np.float32 + + nodes = { + **shaped_const_with_data('input', int64_array(inp_shape)), + **regular_op_with_empty_data('shape', {'type': 'ShapeOf'}), + **regular_op_with_empty_data('cast_to_float', {'type': 'Cast', 'dst_type': dst_type}), + **regular_op('mul_const', {'op': 'Const'}), + **{'mul_const_d': {'kind': 'data', 'value': float32_array([1., 1., 1., 100.])}}, + **regular_op_with_empty_data('mul', {'type': 'Mul'}), + **regular_op_with_empty_data('cast_to_int', {'type': 'Cast', 'dst_type': np.int64}), + **regular_op_with_empty_data('interpolate', {'type': 
'Interpolate', 'shape_calculation_model': 'scales'}), + **result('res'), + } + + nodes_ref = { + **shaped_const_with_data('input', int64_array(inp_shape)), + **regular_op_with_empty_data('shape', {'type': 'ShapeOf'}), + **regular_op_with_empty_data('cast_to_float', {'type': 'Cast', 'dst_type': dst_type, + 'returns_shape_value': True}), + **regular_op_with_empty_data('mul', {'type': 'Mul', 'returns_shape_value': True}), + **regular_op('mul_const', {'op': 'Const', 'returns_shape_value': True}), + **{'mul_const_d': {'kind': 'data', 'value': float32_array([1., 1., 1., 100.]), + 'correct_data_type': True}}, + **regular_op_with_empty_data('cast_to_int', {'type': 'Cast', 'dst_type': np.int64, + 'returns_shape_value': True}), + **regular_op_with_empty_data('interpolate', {'type': 'Interpolate', 'shape_calculation_model': 'scales'}), + **result('res'), + } + + edges = [ + *connect('input', '0:interpolate'), + *connect('input', '0:shape', skip_data=True), + *connect('shape', '0:cast_to_float'), + *connect('cast_to_float', '0:mul'), + *connect('mul_const', '1:mul'), + *connect('mul', '0:cast_to_int'), + *connect('cast_to_int', '1:interpolate'), + *connect('interpolate', 'res'), + ] + graph = build_graph(nodes, edges) + interp_node = Node(graph, 'interpolate') + interp_node.add_input_port(2) + + MarkNodesWithShapeValues().find_and_replace_pattern(graph) + + graph_ref = build_graph(nodes_ref, edges) + (flag, resp) = compare_graphs(graph, graph_ref, 'res', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_run_with_const_input(self): + inp_shape = (1, 3, 1000, 1000) + dst_type = np.float32 + + nodes = { + **shaped_const_with_data('input', int64_array(inp_shape)), + **regular_op('sizes_const', {'op': 'Const'}), + **{'sizes_const_d': {'kind': 'data', 'value': float32_array([1., 1., 1., 100.])}}, + **regular_op_with_empty_data('interpolate', {'type': 'Interpolate', 'shape_calculation_model': 'scales'}), + **result('res'), + } + + nodes_ref = { + **shaped_const_with_data('input', int64_array(inp_shape)), + **regular_op('sizes_const', {'op': 'Const', 'returns_shape_value': True}), + **{'sizes_const_d': {'kind': 'data', 'value': float32_array([1., 1., 1., 100.])}}, + **regular_op_with_empty_data('interpolate', {'type': 'Interpolate', 'shape_calculation_model': 'scales'}), + **result('res'), + } + + edges = [ + *connect('input', '0:interpolate'), + *connect('sizes_const', '1:interpolate'), + *connect('interpolate', 'res'), + ] + graph = build_graph(nodes, edges) + interp_node = Node(graph, 'interpolate') + interp_node.add_input_port(2) + + MarkNodesWithShapeValues().find_and_replace_pattern(graph) + + graph_ref = build_graph(nodes_ref, edges) + (flag, resp) = compare_graphs(graph, graph_ref, 'res', check_op_attrs=True) + self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/back/MatMulNormalizer.py b/model-optimizer/extensions/back/MatMulNormalizer.py index 46e4b3ec750395..76cf9bfd173b09 100644 --- a/model-optimizer/extensions/back/MatMulNormalizer.py +++ b/model-optimizer/extensions/back/MatMulNormalizer.py @@ -70,7 +70,7 @@ def replace_pattern(graph: Graph, match: dict): class PullTransposeThroughFQUp(BackReplacementPattern): - """ + r""" BEFORE AFTER T T T T T \ \ | / / \ \ | / / @@ -135,7 +135,7 @@ def replace_pattern(graph: Graph, match: dict): class SmartReshape_HC_Reshape_MatMul(BackReplacementPattern): - """ + r""" Relaxes hard-coded input of Reshape in such sub-graphs: input_1 Constant diff --git a/model-optimizer/extensions/back/ReverseInputChannels.py 
b/model-optimizer/extensions/back/ReverseInputChannels.py index c0c62c10f8b356..49f748ac43405f 100644 --- a/model-optimizer/extensions/back/ReverseInputChannels.py +++ b/model-optimizer/extensions/back/ReverseInputChannels.py @@ -97,7 +97,7 @@ class ReverseChannelsPropagationDown(BackReplacementPattern): @staticmethod def pass_rc_through_conv(node, reverse_channels): - """ + r""" For non grouped convolution: BEFORE AFTER @@ -167,7 +167,7 @@ def pass_rc_through_conv(node, reverse_channels): @staticmethod def pass_rc_through_eltwise(node, reverse_channels): - """ + r""" BEFORE AFTER previous_op previous_op' @@ -268,7 +268,7 @@ class ReverseChannelsPropagationUp(BackReplacementPattern): @staticmethod def lift_up_through_eltwise(node: Node, reverse_channels: Node): - """ + r""" BEFORE AFTER previous_op previous_op' diff --git a/model-optimizer/extensions/back/SpecialNodesFinalization.py b/model-optimizer/extensions/back/SpecialNodesFinalization.py index c177d7873cf0f2..915c5670d74cc1 100644 --- a/model-optimizer/extensions/back/SpecialNodesFinalization.py +++ b/model-optimizer/extensions/back/SpecialNodesFinalization.py @@ -106,6 +106,8 @@ class RemoveConstToResult(BackReplacementPattern): Transformation looks for a constant sub-graph followed by Result operation. If sub-graph is Const->data->Result -- then all three nodes are removed. If there is more complex constant sub-graph -- then only Result node is removed. + If the Result node has the keep_output_port attribute set to True, the node will not be removed from the graph, but + it will not be saved to IR. Only its port will be kept in IR. Currently IE is unable to handle such graph so this transformation is a work around for such case. For instance, this case appears for Wide and Deep model. @@ -123,7 +125,8 @@ def pattern(): return dict( nodes=[ ('const_data', {'kind': 'data', 'value': lambda value: value is not None}), - ('result_node', {'type': 'Result', 'kind': 'op'}), + ('result_node', {'type': 'Result', 'kind': 'op', + 'keep_output_port': lambda attr: not attr}), ], edges=[ ('const_data', 'result_node') diff --git a/model-optimizer/extensions/back/SpecialNodesFinalization_test.py b/model-optimizer/extensions/back/SpecialNodesFinalization_test.py index 46523aac61f6e3..ea6516adbc3fac 100644 --- a/model-optimizer/extensions/back/SpecialNodesFinalization_test.py +++ b/model-optimizer/extensions/back/SpecialNodesFinalization_test.py @@ -107,7 +107,7 @@ def test_only_consumer(self): nodes = [ ('const_node', {'type': 'Const', 'kind': 'op'}), ('const_data', {'kind': 'data', 'value': np.array(5)}), - ('result_node', {'type': 'Result', 'kind': 'op'}), + ('result_node', {'type': 'Result', 'kind': 'op', 'keep_output_port': False}), ('placeholder_1', {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}), ('placeholder_1_data', {'kind': 'data'}), @@ -150,6 +150,58 @@ def test_only_consumer(self): self.assertNotIn('const_data', graph.node) self.assertNotIn('result_node', graph.node) + + def test_only_consumer_keep_result(self): + """Result node is only consumer of Const data node""" + nodes = [ + ('placeholder_1', {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}), + ('placeholder_1_data', {'kind': 'data'}), + ('placeholder_2', {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}), + ('placeholder_2_data', {'kind': 'data'}), + ('shape_of', {'type': 'ShapeOf', 'kind': 'op', 'op': 'ShapeOf'}), + ('shape_of_data', {'kind': 'data'}), + ('split', {'type': 'Split', 'kind': 'op', 'op': 'Split'}), + ('split_data1', {'kind': 'data'}), + ('split_data2', 
{'kind': 'data'}), + ('result_node1', {'type': 'Result', 'kind': 'op', 'keep_output_port': True}), + + ('mul', {'type': 'Mul', 'kind': 'op', 'op': 'Mul'}), + ('mul_data', {'kind': 'data'}), + ('result_node2', {'type': 'Result', 'kind': 'op'}), + ] + edges = [ + ('placeholder_1', 'placeholder_1_data'), + ('placeholder_2', 'placeholder_2_data'), + ('placeholder_1_data', 'shape_of'), + ('shape_of', 'shape_of_data'), + ('shape_of_data', 'split'), + ('split', 'split_data1', {'in': 0}), + ('split', 'split_data2', {'in': 1}), + + ('split_data1', 'result_node1'), + ('split_data2', 'mul'), + ('placeholder_2_data', 'mul'), + ('mul', 'mul_data'), + ('mul_data', 'result_node2'), + ] + + graph = build_graph_with_attrs( + nodes_with_attrs=nodes, + edges_with_attrs=edges, + ) + graph_ref = build_graph_with_attrs( + nodes_with_attrs=nodes, + edges_with_attrs=edges, + ) + tested_pattern = RemoveConstToResult() + tested_pattern.find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, last_node='mul_data') + self.assertTrue(flag, resp) + self.assertIn('split_data1', graph.node) + self.assertIn('split_data2', graph.node) + self.assertIn('result_node1', graph.node) + + def test_two_consumers(self): """Const data node has two consumers: Result and ReLu""" nodes = [ @@ -190,3 +242,34 @@ def test_two_consumers(self): (flag, resp) = compare_graphs(graph, graph_ref, last_node='relu_1_data') self.assertTrue(flag, resp) self.assertNotIn('result_node', graph.node) + + + def test_two_consumers_keep_outputs(self): + """Const data node has two consumers: Result and ReLu""" + nodes = [ + ('const_node', {'type': 'Const', 'kind': 'op'}), + ('const_data', {'kind': 'data', 'value': np.array(5)}), + ('result_node', {'type': 'Result', 'kind': 'op', 'keep_output_port': True}), + ('relu_1', {'type': 'ReLU', 'kind': 'op', 'op': 'ReLU'}), + ('relu_1_data', {'kind': 'data'}), + ] + edges = [ + ('const_node', 'const_data'), + ('const_data', 'result_node'), + ('const_data', 'relu_1'), + ('relu_1', 'relu_1_data') + ] + + graph = build_graph_with_attrs( + nodes_with_attrs=nodes, + edges_with_attrs=edges, + ) + graph_ref = build_graph_with_attrs( + nodes_with_attrs=nodes, + edges_with_attrs=edges, + ) + tested_pattern = RemoveConstToResult() + tested_pattern.find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, last_node='relu_1_data') + self.assertTrue(flag, resp) + self.assertIn('result_node', graph.node) diff --git a/model-optimizer/extensions/back/TopKNormalizer.py b/model-optimizer/extensions/back/TopKNormalizer.py index af756a97bfc42d..6fc3f33e3a9401 100644 --- a/model-optimizer/extensions/back/TopKNormalizer.py +++ b/model-optimizer/extensions/back/TopKNormalizer.py @@ -52,7 +52,7 @@ def normalize_outputs(node: Node): """ if node.out_port(0).disconnected(): output = Result(node.graph, {'name': node.name + '/Result_port_0/', - 'remove_from_xml': node.has_and_set('remove_values_output')}).create_node() + 'keep_output_port': node.has_and_set('remove_values_output')}).create_node() node.out_port(0).get_connection().set_destination(output.in_port(0)) if node.out_port(1).disconnected(): output = Result(node.graph, {'name': node.name + '/Result_port_1/'}).create_node() diff --git a/model-optimizer/extensions/back/compress_quantized_weights.py b/model-optimizer/extensions/back/compress_quantized_weights.py index 7a646f00caca2b..ac8918f48c7fe7 100644 --- a/model-optimizer/extensions/back/compress_quantized_weights.py +++ b/model-optimizer/extensions/back/compress_quantized_weights.py @@ 
-14,7 +14,7 @@ class CompressQuantizeWeights(BackReplacementPattern): - """ + r""" Compress weights transformation goal is to pre-quantize data to minimize runtime calculations with constant data. To achieve this goal we perform FakeQuantize decomposition to separate quantization from dequantization in it. diff --git a/model-optimizer/extensions/front/ChangeCastOutputType.py b/model-optimizer/extensions/front/ChangeCastOutputType.py deleted file mode 100644 index 45bd72d2d0fe49..00000000000000 --- a/model-optimizer/extensions/front/ChangeCastOutputType.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import logging as log - -import numpy as np - -from mo.front.common.replacement import FrontReplacementSubgraph -from mo.front.subgraph_matcher import SubgraphMatch -from mo.graph.graph import Graph -from mo.middle.passes.convert_data_type import data_type_str_to_np - - -class ChangeCastOutputType(FrontReplacementSubgraph): - """ - Change the Cast to fp64 to fp32 since not all plugins support fp64 data type. - Change the Cast to fp32 to fp16 when generating IR for fp16. - """ - enabled = True - - def pattern(self): - return dict( - nodes=[ - ('cast', dict(op='Cast')) - ], - edges=[] - ) - - def replace_sub_graph(self, graph: Graph, match: [dict, SubgraphMatch]): - node = match['cast'] - if node.dst_type == np.float64: - log.warning('Change data type from {} to {} for node {}'.format(node.dst_type, np.float32, node.name)) - node.dst_type = np.float32 - - ir_data_type = data_type_str_to_np(node.graph.graph['cmd_params'].data_type) - if node.dst_type == np.float32 and ir_data_type == np.float16: - log.warning('Change data type from {} to {} for node {}'.format(node.dst_type, ir_data_type, node.name)) - node.dst_type = ir_data_type diff --git a/model-optimizer/extensions/front/DropoutWithRandomUniformReplacer.py b/model-optimizer/extensions/front/DropoutWithRandomUniformReplacer.py index 92a17dca9d1278..2001765be7b5ed 100644 --- a/model-optimizer/extensions/front/DropoutWithRandomUniformReplacer.py +++ b/model-optimizer/extensions/front/DropoutWithRandomUniformReplacer.py @@ -12,7 +12,7 @@ class DropoutWithRandomUniformReplacer(FrontReplacementSubgraph): - """ + r""" This transformation replaces possible Dropout block (in inference mode) with RandomUniform to Broadcast of half-ones in a sub-graph. 
WARNING: the transformation can be triggered for other block with RandomUniform by mistake, diff --git a/model-optimizer/extensions/front/broadcast_with_range.py b/model-optimizer/extensions/front/broadcast_with_range.py index 77da39a5130929..55fac8f84ac3d9 100644 --- a/model-optimizer/extensions/front/broadcast_with_range.py +++ b/model-optimizer/extensions/front/broadcast_with_range.py @@ -13,7 +13,7 @@ class ExpandRangeConstant(FrontReplacementSubgraph): - """ + r""" Searches for Constant operations filled with range values starting from 0 and replaces it with Range operation Faced in ONNX BERT -- replacing it makes model reshape-able by sequence length diff --git a/model-optimizer/extensions/front/interpolate_reshape.py b/model-optimizer/extensions/front/interpolate_reshape.py index 16a31a847d1a56..fb0e4458357995 100644 --- a/model-optimizer/extensions/front/interpolate_reshape.py +++ b/model-optimizer/extensions/front/interpolate_reshape.py @@ -13,7 +13,7 @@ class InterpolateWithConcat(FrontReplacementPattern): - """ + r""" Replaces hard-coded 1-port input of Interpolate with reshape-able sub-graph using the following Concat inputs BEFORE: diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py b/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py index 6708ca737aec7f..42ef5d4da061a1 100644 --- a/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py +++ b/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py @@ -5,11 +5,10 @@ from extensions.ops.Cast import Cast from extensions.ops.elementwise import Div -from mo.front.common.partial_infer.utils import int64_array, float_array +from mo.front.common.partial_infer.utils import int64_array, float32_array from mo.front.common.replacement import FrontReplacementPattern from mo.front.tf.graph_utils import create_op_with_const_inputs, create_op_node_with_second_input from mo.graph.graph import Graph -from mo.middle.passes.convert_data_type import data_type_str_to_np from mo.ops.concat import Concat from mo.ops.reshape import Reshape from mo.ops.shape import Shape @@ -46,21 +45,23 @@ def replace_pattern(graph: Graph, match: dict): node = match['conv'] node_name = node.soft_get('name', node.id) + dst_dtype = np.float32 # even if data_type=FP16 use float32 for shape values + # create Reshape before convolution # shape = [in_shape[0], in_shape[1]/patch_stride, 1, patch_stride] i_shape = Shape(graph, {'name': node_name + '/Shape'}).create_node() shape = Cast(graph, {'name': node_name + '/to_float', - 'dst_type': data_type_str_to_np(graph.graph['cmd_params'].data_type)}).create_node() + 'dst_type': dst_dtype}).create_node() i_shape.in_port(0).connect(node.in_port(0).get_source()) shape.in_port(0).connect(i_shape.out_port(0)) N, H = node_to_get_shape_value_of_indices(shape, [0]), node_to_get_shape_value_of_indices(shape, [1]) div = create_op_with_const_inputs( - graph, Div, {1: float_array([node.patch_stride])}, {'name': node_name + '/div_stride_h'}) + graph, Div, {1: float32_array([node.patch_stride])}, {'name': node_name + '/div_stride_h'}) div.in_port(0).connect(H.out_port(0)) - concat = create_op_with_const_inputs(graph, Concat, {2: float_array([1]), 3: float_array([node.patch_stride])}, + concat = create_op_with_const_inputs(graph, Concat, {2: float32_array([1]), 3: float32_array([node.patch_stride])}, {'name': node_name + '/concat_all_dims', 'in_ports_count': 4, 'axis': 0}) concat.in_port(0).connect(N.out_port(0)) 
concat.in_port(1).connect(div.out_port(0)) diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py b/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py index 0bead3863d792b..10c992d6a830cb 100644 --- a/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py +++ b/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py @@ -5,11 +5,10 @@ from extensions.ops.Cast import Cast from extensions.ops.elementwise import Div -from mo.front.common.partial_infer.utils import int64_array, float_array +from mo.front.common.partial_infer.utils import int64_array, float32_array from mo.front.common.replacement import FrontReplacementPattern from mo.front.tf.graph_utils import create_op_node_with_second_input, create_op_with_const_inputs from mo.graph.graph import Graph -from mo.middle.passes.convert_data_type import data_type_str_to_np from mo.ops.concat import Concat from mo.ops.reshape import Reshape from mo.ops.shape import Shape @@ -48,18 +47,20 @@ def replace_pattern(graph: Graph, match: dict): # create Reshape before convolution # shape = [in_shape[0], pool_stride, 1, in_shape[1]/pool_stride] i_shape = Shape(graph, {'name': node_name + '/Shape'}).create_node() + + dst_dtype = np.float32 # even if data_type=FP16 use float32 for shape values shape = Cast(graph, {'name': node_name + '/to_float', - 'dst_type': data_type_str_to_np(graph.graph['cmd_params'].data_type)}).create_node() + 'dst_type': dst_dtype}).create_node() i_shape.in_port(0).connect(node.in_port(0).get_source()) shape.in_port(0).connect(i_shape.out_port(0)) N, H = node_to_get_shape_value_of_indices(shape, [0]), node_to_get_shape_value_of_indices(shape, [1]) div = create_op_with_const_inputs( - graph, Div, {1: float_array([node.pool_stride])}, {'name': node_name + '/div_stride_h'}) + graph, Div, {1: float32_array([node.pool_stride])}, {'name': node_name + '/div_stride_h'}) div.in_port(0).connect(H.out_port(0)) - concat = create_op_with_const_inputs(graph, Concat, {1: float_array([node.pool_stride]), 2: float_array([1])}, + concat = create_op_with_const_inputs(graph, Concat, {1: float32_array([node.pool_stride]), 2: float32_array([1])}, {'name': node_name + '/concat_all_dims', 'in_ports_count': 4, 'axis': 0}) concat.in_port(0).connect(N.out_port(0)) concat.in_port(3).connect(div.out_port(0)) diff --git a/model-optimizer/extensions/front/kaldi/apply_counts.py b/model-optimizer/extensions/front/kaldi/apply_counts.py index 6720ce800d757c..a7f003e691e345 100644 --- a/model-optimizer/extensions/front/kaldi/apply_counts.py +++ b/model-optimizer/extensions/front/kaldi/apply_counts.py @@ -13,7 +13,7 @@ def apply_biases_to_last_layer(graph, counts): - """ + r""" When user provides counts file, it is a file that contains log-apriory probabilities, technically it should be subtracted from the bias of the last layer unless it is a SoftMax. 
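The apply_counts change above only touches the docstring formatting, but the behaviour it describes is easy to illustrate: the counts file provides log-apriori probabilities that are subtracted from the bias of the last layer (unless that layer is a SoftMax). Below is a minimal NumPy sketch of that adjustment; the names (counts, last_layer_bias) and the normalization of raw counts into log-priors are illustrative assumptions, not the actual apply_counts.py code.

import numpy as np

def subtract_log_priors(last_layer_bias, counts, eps=1e-20):
    # Turn raw occurrence counts into priors and take the log to get log-apriori probabilities.
    priors = counts / np.sum(counts)
    log_priors = np.log(np.maximum(priors, eps))
    # Subtracting the log-priors from the bias compensates for class frequencies seen during training.
    return last_layer_bias - log_priors

# usage sketch: adjusted_bias = subtract_log_priors(np.zeros(4), np.array([10., 20., 30., 40.]))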
diff --git a/model-optimizer/extensions/front/kaldi/memory_offset_adjustment.py b/model-optimizer/extensions/front/kaldi/memory_offset_adjustment.py index d1e38efdf58e16..8a4bbbc2e4ddcd 100644 --- a/model-optimizer/extensions/front/kaldi/memory_offset_adjustment.py +++ b/model-optimizer/extensions/front/kaldi/memory_offset_adjustment.py @@ -53,7 +53,7 @@ def align_frame_time(graph: Graph, node: Node, frame_time_max): 'splitted': False}).create_node() # add element_size for MemoryOffset after Parameter for infer if in_node.op == 'Parameter': - memory_align['element_size'] = in_node.shape[1] + memory_align['element_size'] = in_node.shape in_port.get_connection().set_source(memory_align.out_port(0)) memory_align.in_port(0).connect(in_node_out_port) memory_align['frame_time'] = memory_align.t @@ -64,7 +64,7 @@ def align_frame_time(graph: Graph, node: Node, frame_time_max): class MemoryOffsetAdjustment(FrontReplacementSubgraph): - """ + r""" Pass used to fix wrong results in the following situation: input | \ diff --git a/model-optimizer/extensions/front/kaldi/memoryoffset_batch_update.py b/model-optimizer/extensions/front/kaldi/memoryoffset_batch_update.py new file mode 100644 index 00000000000000..5a7a0d7a253be2 --- /dev/null +++ b/model-optimizer/extensions/front/kaldi/memoryoffset_batch_update.py @@ -0,0 +1,27 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from mo.front.common.replacement import FrontReplacementPattern +from mo.graph.graph import Graph + + +class MemoryOffsetBatchUpdate(FrontReplacementPattern): + """ + Update batch for MemoryOffset nodes that have element_size set. + element_size is set in the loader according to the shape saved in the model (for example, the Parameter node keeps its shape in an attribute). + But the batch can be changed at the front stage if the user sets it via the command line, so element_size should be updated + accordingly. 
+ """ + enabled = True + run_not_recursively = True + + def run_after(self): + from extensions.front.user_data_repack import UserDataRepack + from extensions.front.kaldi.split_recurrent_memoryoffset import SplitRecurrentMemoryOffset + return [UserDataRepack, SplitRecurrentMemoryOffset] + + def find_and_replace_pattern(self, graph: Graph): + batch = graph.get_op_nodes(op="Parameter")[0].shape[0] + for memoryoffset_node in graph.get_op_nodes(op='MemoryOffset'): + if memoryoffset_node.has_valid('element_size'): + memoryoffset_node.element_size[0] = batch diff --git a/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py b/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py index bc74b1500fff2c..f01011a25a4d03 100644 --- a/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py +++ b/model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py @@ -3,25 +3,21 @@ import numpy as np +from extensions.middle.MakeKaldiConstReshapable import create_const_with_batch_from_input from extensions.ops.MatMul import FullyConnected from extensions.ops.activation_ops import Tanh, Sigmoid from extensions.ops.elementwise import Add, Mul from extensions.ops.split import Split from mo.front.caffe.extractors.utils import input_as_const -from mo.front.common.partial_infer.utils import int64_array from mo.front.common.replacement import FrontReplacementOp from mo.front.tf.graph_utils import create_op_with_const_inputs -from mo.graph.graph import Node, Graph, Port +from mo.graph.graph import Node, Graph from mo.ops.assign import Assign -from mo.ops.broadcast import Broadcast from mo.ops.clamp import Clamp -from mo.ops.concat import Concat from mo.ops.const import Const -from mo.ops.crop import Crop from mo.ops.read_value import ReadValue from mo.ops.result import Result from mo.ops.scale_shift import ScaleShiftOp -from mo.ops.shape import Shape def unique_id(prefix: str = 'id') -> str: @@ -41,35 +37,6 @@ def unique_id(prefix: str = 'id') -> str: unique_id.names = [] -def create_zero_value_with_batch_from_input(input_out_port: Port, second_dim, precision = np.float): - # create init_graph connected to ReadValue - graph = input_out_port.node.graph - input_name = input_out_port.node.name - shape_of_input = Shape(graph, {'name': 'shape/' + input_name}).create_node() - shape_of_input.in_port(0).connect(input_out_port) - dim_for_get_batch = Const(graph, {'name': 'dim/crop_batch/'+shape_of_input.name, - 'value': int64_array([1]), 'shape': int64_array([1])}).create_node() - get_batch = Crop(graph, {'name': 'crop_batch/' + shape_of_input.name, - 'axis': int64_array([0]), 'offset': int64_array([0]) - }).create_node() - get_batch.in_port(0).connect(shape_of_input.out_port(0)) - get_batch.in_port(1).connect(dim_for_get_batch.out_port(0)) - mem_shape_2nd_dim = Const(graph, {'name': 'gifo_r_weights_shape/'+input_name, - 'value': int64_array([second_dim]), - 'shape': int64_array([1])}).create_node() - mem_shape = Concat(graph, {'name': 'gather_memory_shape/' + input_name, - 'axis': 0, 'in_ports_count': 2}).create_node() - mem_shape.in_port(0).connect(get_batch.out_port(0)) - mem_shape.in_port(1).connect(mem_shape_2nd_dim.out_port(0)) - fill_value = Const(graph, {'name': 'fill_value/'+input_name, - 'value': np.array([0.0], precision), 'shape': int64_array([1])}).create_node() - init_value_prev_lstm_output = Broadcast(graph, {'name': 'init_value/'+input_name, - }).create_node() - init_value_prev_lstm_output.in_port(0).connect(fill_value.out_port(0)) - 
init_value_prev_lstm_output.in_port(1).connect(mem_shape.out_port(0)) - return init_value_prev_lstm_output - - class ReplaceLSTMNodePattern(FrontReplacementOp): op = "LSTMCell" enabled = True @@ -110,8 +77,8 @@ def replace_op(self, graph: Graph, node: Node): input_as_const(fc_layer_after_input, fc_layer_after_input_attrs, 1, 'weights', node.gifo_x_weights) input_as_const(fc_layer_after_input, fc_layer_after_input_attrs, 2, 'biases', node.gifo_biases) - init_value_prev_lstm_output = create_zero_value_with_batch_from_input(input_out_port, - node.gifo_r_weights_shape[1]) + init_value_prev_lstm_output = create_const_with_batch_from_input(input_out_port, + node.gifo_r_weights_shape[1]) prev_lstm_output = ReadValue(graph, {'name': 'prev_memory_output', 'variable_id': memory_pair_input }).create_node() @@ -150,14 +117,8 @@ def replace_op(self, graph: Graph, node: Node): split_joined_input.in_port(0).connect(join_input_prev_state_sum.out_port(0)) split_joined_input.in_port(1).connect(split_joined_input_axis.out_port(0)) - # prev_lstm_state = Memory(graph, {'name': 'prev_memory_state', - # 'id': memory_pair_output, - # 'index': 1, - # 'size': 2, - # 'shape': np.array([node.input_gate_weights.shape[0]], dtype=np.int64) - # }).create_node() - init_value_prev_lstm_state = create_zero_value_with_batch_from_input(split_joined_input.out_port(0), - node.input_gate_weights.shape[0]) + init_value_prev_lstm_state = create_const_with_batch_from_input(split_joined_input.out_port(0), + node.input_gate_weights.shape[0]) prev_lstm_state = ReadValue(graph, {'name': 'prev_memory_state', 'variable_id': memory_pair_output}).create_node() prev_lstm_state.in_port(0).connect(init_value_prev_lstm_state.out_port(0)) diff --git a/model-optimizer/extensions/front/kaldi/split_recurrent_memoryoffset.py b/model-optimizer/extensions/front/kaldi/split_recurrent_memoryoffset.py index 69611a75defec4..f9787dae3ba5d6 100644 --- a/model-optimizer/extensions/front/kaldi/split_recurrent_memoryoffset.py +++ b/model-optimizer/extensions/front/kaldi/split_recurrent_memoryoffset.py @@ -3,6 +3,7 @@ import networkx as nx +from mo.front.common.partial_infer.utils import int64_array from mo.front.common.replacement import FrontReplacementSubgraph from mo.graph.graph import Graph from mo.ops.memoryoffset import MemoryOffset @@ -51,7 +52,7 @@ def find_and_replace_pattern(self, graph: Graph): # check if previous layer contains information about its shape in out-size # out-size is set in extractor of some nodes like affinecomponent based on weight's size if offset_node.in_port(0).get_source().node.has_valid('out-size'): - offset_node['element_size'] = offset_node.in_port(0).get_source().node['out-size'] + offset_node['element_size'] = int64_array([1, offset_node.in_port(0).get_source().node['out-size']]) else: raise Error("In a recurrent block 'element_size' for node {} is not set".format(offset_node.id)) SplitRecurrentMemoryOffset.split_offset(offset_node) diff --git a/model-optimizer/extensions/front/kaldi/tdnn_component_replacer.py b/model-optimizer/extensions/front/kaldi/tdnn_component_replacer.py index 8452c1ed80e17c..6fa320a8d6d3ad 100644 --- a/model-optimizer/extensions/front/kaldi/tdnn_component_replacer.py +++ b/model-optimizer/extensions/front/kaldi/tdnn_component_replacer.py @@ -11,7 +11,7 @@ class TdnnComponentReplacer(FrontReplacementPattern): - ''' + r""" Expand TdnnComponent into MemoryOffsets, Concat and FullyConected nodes BEFORE: @@ -31,7 +31,7 @@ class TdnnComponentReplacer(FrontReplacementPattern): | FullyConnected | - ''' + 
""" enabled = True run_not_recursively = True diff --git a/model-optimizer/extensions/front/onnx/ONNXResize10ToInterpolate.py b/model-optimizer/extensions/front/onnx/ONNXResize10ToInterpolate.py index ca1c2ba71f1c7b..ae2b28ece0532e 100644 --- a/model-optimizer/extensions/front/onnx/ONNXResize10ToInterpolate.py +++ b/model-optimizer/extensions/front/onnx/ONNXResize10ToInterpolate.py @@ -5,8 +5,8 @@ import numpy as np -from extensions.ops.activation_ops import Floor from extensions.ops.Cast import Cast +from extensions.ops.activation_ops import Floor from extensions.ops.elementwise import Add, Mul from extensions.ops.interpolate import Interpolate from extensions.ops.range import Range @@ -15,7 +15,6 @@ from mo.front.common.replacement import FrontReplacementOp from mo.front.tf.graph_utils import create_op_with_const_inputs from mo.graph.graph import Graph, Node, rename_nodes -from mo.middle.passes.convert_data_type import data_type_str_to_np from mo.ops.shape import Shape from mo.ops.strided_slice import StridedSlice @@ -79,9 +78,9 @@ def replace_resize(graph: Graph, resize: Node): {1: float_array([1.0e-5])}, {'name': resize_name + '/Add'}) - input_data_type = data_type_str_to_np(graph.graph['cmd_params'].data_type) + dst_dtype = np.float32 # even if data_type=FP16 use float32 for shape values - cast_shape_to_float = Cast(graph, {'dst_type': input_data_type}).create_node() + cast_shape_to_float = Cast(graph, {'dst_type': dst_dtype}).create_node() shape_of.out_port(0).connect(cast_shape_to_float.in_port(0)) mul_node = Mul(graph, {'name': resize_name + '/Mul'}).create_node([cast_shape_to_float, add_node]) diff --git a/model-optimizer/extensions/front/tf/AutomlEfficientDet.py b/model-optimizer/extensions/front/tf/AutomlEfficientDet.py index c6302da16cc3ed..2ac2a166b669ea 100644 --- a/model-optimizer/extensions/front/tf/AutomlEfficientDet.py +++ b/model-optimizer/extensions/front/tf/AutomlEfficientDet.py @@ -22,6 +22,7 @@ class EfficientDet(FrontReplacementFromConfigFileGeneral): replacement_id = 'AutomlEfficientDet' + run_not_recursively = True def run_before(self): from extensions.front.ExpandDimsToUnsqueeze import ExpandDimsToUnsqueeze @@ -57,10 +58,11 @@ def transform_graph(self, graph: Graph, replacement_descriptions: dict): # which includes padding and resizing from the model preprocessing_input_node_id = replacement_descriptions['preprocessing_input_node'] assert preprocessing_input_node_id in graph.nodes, 'The node with name "{}" is not found in the graph. This ' \ - 'node should provide scaled image output and is specified' \ + 'should be a last node before image normalization and is specified' \ ' in the json file.'.format(preprocessing_input_node_id) preprocessing_input_node = Node(graph, preprocessing_input_node_id) - preprocessing_input_node.in_port(0).get_connection().set_source(parameter_node.out_port(0)) + consumer_node = preprocessing_input_node.out_port(0).get_connection().get_destination().node + consumer_node.in_port(0).get_connection().set_source(parameter_node.out_port(0)) preprocessing_output_node_id = replacement_descriptions['preprocessing_output_node'] assert preprocessing_output_node_id in graph.nodes, 'The node with name "{}" is not found in the graph. 
This ' \ diff --git a/model-optimizer/extensions/front/tf/NonConstBeginStridedSliceReplacement.py b/model-optimizer/extensions/front/tf/NonConstBeginStridedSliceReplacement.py index c0567921482ebc..4a624c1b4d960c 100644 --- a/model-optimizer/extensions/front/tf/NonConstBeginStridedSliceReplacement.py +++ b/model-optimizer/extensions/front/tf/NonConstBeginStridedSliceReplacement.py @@ -13,7 +13,7 @@ class NonConstBeginStridedSliceReplacement(FrontReplacementSubgraph): - """ + r""" The transformation handles StridedSlice operation with dynamic begin and end values when slicing performs along just one dimension with a dynamic index. For example, StridedSlice with begin=(0,idx,0), end=(0,idx+1,0), diff --git a/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py b/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py index 3cc7a8b3120549..cdcdfa0a5221fc 100644 --- a/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py +++ b/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py @@ -8,7 +8,7 @@ from extensions.ops.elementwise import Mul, Sub, Pow from extensions.ops.gather import Gather from extensions.ops.split import VariadicSplit -from mo.front.common.partial_infer.utils import int64_array +from mo.front.common.partial_infer.utils import int64_array, float32_array from mo.front.subgraph_matcher import SubgraphMatch from mo.front.tf.graph_utils import create_op_node_with_second_input, create_op_with_const_inputs from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph @@ -54,9 +54,9 @@ def append_variances(priors_scale_node: Node, variance: list): sp_shape = Shape(graph, {'name': name + '/shape'}).create_node() priors_scale_node.out_port(0).connect(sp_shape.in_port(0)) - begin = Const(graph, {'value': np.array([-2])}).create_node() - end = Const(graph, {'value': np.array([-1])}).create_node() - stride = Const(graph, {'value': np.array([1])}).create_node() + begin = Const(graph, {'value': int64_array([-2])}).create_node() + end = Const(graph, {'value': int64_array([-1])}).create_node() + stride = Const(graph, {'value': int64_array([1])}).create_node() shape_part_for_tiling = StridedSlice(graph, {'name': name + '/get_-2_dim', 'begin_mask': np.array([1]), 'end_mask': np.array([1]), 'new_axis_mask': np.array([0]), 'shrink_axis_mask': np.array([0]), @@ -72,7 +72,7 @@ def append_variances(priors_scale_node: Node, variance: list): 'axis': int64_array(0)}, shape_part_for_tiling) - variance = Const(graph, {'name': name + '/variance', 'value': np.array(variance)}).create_node() + variance = Const(graph, {'name': name + '/variance', 'value': float32_array(variance)}).create_node() tile = Broadcast(graph, {'name': name + '/variance_tile'}).create_node() variance.out_port(0).connect(tile.in_port(0)) shape_concat.out_port(0).connect(tile.in_port(1)) @@ -113,9 +113,9 @@ def placeholder_scales(self, placeholder: Node): shape = Shape(graph, {'name': 'input_image_shape'}).create_node() shape.in_port(0).connect(placeholder.out_port(0)) - begin = Const(graph, {'value': np.array([1])}).create_node() - end = Const(graph, {'value': np.array([3])}).create_node() - stride = Const(graph, {'value': np.array([1])}).create_node() + begin = Const(graph, {'value': int64_array([1])}).create_node() + end = Const(graph, {'value': int64_array([3])}).create_node() + stride = Const(graph, {'value': int64_array([1])}).create_node() spatial = StridedSlice(graph, {'name': name + '/get_h_w', 'begin_mask': np.array([1]), 
'end_mask': np.array([1]), 'new_axis_mask': np.array([0]), 'shrink_axis_mask': np.array([0]), 'ellipsis_mask': np.array([0])}).create_node() @@ -125,7 +125,7 @@ def placeholder_scales(self, placeholder: Node): spatial.in_port(2).connect(end.out_port(0)) spatial.in_port(3).connect(stride.out_port(0)) - power = Const(graph, {'value': np.array([-1.])}).create_node() + power = Const(graph, {'value': float32_array([-1.])}).create_node() spatial_scale = Pow(graph, {}).create_node() spatial_scale.in_port(0).connect(spatial.out_port(0)) diff --git a/model-optimizer/extensions/front/tf/UnpackPackReverseInputChannels.py b/model-optimizer/extensions/front/tf/UnpackPackReverseInputChannels.py index eded7589a2ee09..1d9fe3c57ac30c 100644 --- a/model-optimizer/extensions/front/tf/UnpackPackReverseInputChannels.py +++ b/model-optimizer/extensions/front/tf/UnpackPackReverseInputChannels.py @@ -12,7 +12,7 @@ class UnpackPackReverseInputChannels(FrontReplacementSubgraph): - """ + r""" Unpack - Pack nodes sequence from TensorFlow connected like it shown below is a way to ReverseChannels / 0 - 2 \ diff --git a/model-optimizer/extensions/front/tf/automl_efficientdet.json b/model-optimizer/extensions/front/tf/automl_efficientdet.json index 19eb1122f0c0d4..ebf13c68ab0495 100644 --- a/model-optimizer/extensions/front/tf/automl_efficientdet.json +++ b/model-optimizer/extensions/front/tf/automl_efficientdet.json @@ -2,7 +2,7 @@ { "id": "AutomlEfficientDet", "custom_attributes": { - "preprocessing_input_node": "convert_image", + "preprocessing_input_node": "strided_slice_1", "preprocessing_output_node": "truediv", "aspect_ratios": [1.0, 1.0, 1.4, 0.7, 0.7, 1.4], "variance": [1.0, 1.0, 1.0, 1.0], diff --git a/model-optimizer/extensions/front/tf/fifo_replacer.py b/model-optimizer/extensions/front/tf/fifo_replacer.py index 762747980c4f3e..730f5ae9222eb0 100644 --- a/model-optimizer/extensions/front/tf/fifo_replacer.py +++ b/model-optimizer/extensions/front/tf/fifo_replacer.py @@ -35,7 +35,7 @@ def pattern(**kwargs): @staticmethod def replace_sub_graph(graph: Graph, match: dict, **kwargs): - """ + r""" Usually graph looks like: main_graph diff --git a/model-optimizer/extensions/front/tf/floor_div_decomposition.py b/model-optimizer/extensions/front/tf/floor_div_decomposition.py index 2f3c5bd3759a61..1ec1573a18c8ce 100644 --- a/model-optimizer/extensions/front/tf/floor_div_decomposition.py +++ b/model-optimizer/extensions/front/tf/floor_div_decomposition.py @@ -8,7 +8,7 @@ class FloorDivDecomposition(FrontReplacementPattern): - """ + r""" BEFORE: AFTER: input_0 input_1 input_0 input_1 \ / \ / diff --git a/model-optimizer/extensions/front/tf/identityN_to_identity.py b/model-optimizer/extensions/front/tf/identityN_to_identity.py index 7876cc72f3960b..08dcdc3839d7d1 100644 --- a/model-optimizer/extensions/front/tf/identityN_to_identity.py +++ b/model-optimizer/extensions/front/tf/identityN_to_identity.py @@ -7,7 +7,7 @@ class IdentityN_to_Identity(FrontReplacementPattern): - """ + r""" Replaces IdentityN op with several Identity ops. 
Example: diff --git a/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py b/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py index cf53ac3622292b..b38e864d4ed2c5 100644 --- a/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py +++ b/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py @@ -90,7 +90,7 @@ def pattern(self): @staticmethod def replace_pattern(graph: Graph, match: dict): time_len = match['concatenated_hidden_states'].shape[0] - """ + r""" Working with concatenated_cell_states_data part first, because IE TensorIterator primitive doesn't have concatenated cell states output and if we can not collapse it, then we does not support this type of BlockLSTM diff --git a/model-optimizer/extensions/middle/FakeSplitOutputs.py b/model-optimizer/extensions/middle/FakeSplitOutputs.py index da943974f0b6dd..b5ed4b31c94c78 100644 --- a/model-optimizer/extensions/middle/FakeSplitOutputs.py +++ b/model-optimizer/extensions/middle/FakeSplitOutputs.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from extensions.middle.TensorIteratorMerge import TensorIteratorMerge -from mo.graph.graph import Graph +from mo.graph.graph import Graph, Node from mo.middle.replacement import MiddleReplacementPattern from mo.ops.result import Result @@ -21,23 +21,19 @@ class AddFakeOutputsToSplit(MiddleReplacementPattern): def run_after(self): return [TensorIteratorMerge] - @staticmethod - def pattern(): - return dict( - nodes=[('op', dict(kind='op', op='Split'))], - edges=[], - ) + def find_and_replace_pattern(self, graph: Graph): + for split_node in graph.get_op_nodes(op='Split'): + AddFakeOutputsToSplit.split_normalize_outputs(split_node) @staticmethod - def replace_pattern(graph: Graph, match: dict): - node = match['op'] - + def split_normalize_outputs(node: Node): if node.has_valid('out_ports_count') and len(node.out_edges()) < node.out_ports_count: for p in range(node.out_ports_count): if p not in node.out_ports(): node.add_output_port(p) if node.out_port(p).disconnected(): - res_node = Result(graph, {'name': node.name + '/Fake_output_{}/'.format(p)}).create_node() + res_node = Result(node.graph, {'name': node.name + '/Fake_output_{}/'.format(p), + 'keep_output_port': True}).create_node() node.out_port(p).connect(res_node.in_port(0)) @@ -76,4 +72,4 @@ def replace_pattern(graph: Graph, match: dict): if not node.has_valid('out_ports_count'): node['out_ports_count'] = len(size_splits) - AddFakeOutputsToSplit().replace_pattern(graph, match) + AddFakeOutputsToSplit().split_normalize_outputs(node) diff --git a/model-optimizer/extensions/middle/InsertSelect.py b/model-optimizer/extensions/middle/InsertSelect.py index a6610e838a9bca..737b7469dbfca7 100644 --- a/model-optimizer/extensions/middle/InsertSelect.py +++ b/model-optimizer/extensions/middle/InsertSelect.py @@ -3,7 +3,7 @@ import numpy as np -from extensions.front.kaldi.replace_lstm_node_pattern import create_zero_value_with_batch_from_input +from extensions.middle.MakeKaldiConstReshapable import create_const_with_batch_from_input from extensions.ops.elementwise import Equal from extensions.ops.select import Select from mo.front.common.partial_infer.utils import int64_array @@ -12,7 +12,6 @@ from mo.middle.replacement import MiddleReplacementPattern from mo.ops.assign import Assign from mo.ops.concat import Concat -from mo.ops.const import Const from mo.ops.crop import Crop from mo.ops.read_value import ReadValue from mo.ops.result import Result @@ -79,7 +78,7 @@ def replace_pattern(graph: Graph, match: dict): # add 
Select before saving state to avoid saving garbage select_node = Select(graph, {'name': 'select_' + node.name}).create_node() - zero_else = Const(graph, {'name': 'zero_else', 'value': np.zeros(in_node_shape)}).create_node() + zero_else = create_const_with_batch_from_input(in_node_port, in_node_shape[1]) select_node.in_port(1).connect(in_node_port) select_node.in_port(2).connect(zero_else.out_port(0)) @@ -114,14 +113,14 @@ def replace_pattern(graph: Graph, match: dict): ones = Node(graph, inverse_dict(counter_match)['const_1']) input_port = Node(graph, inverse_dict(counter_match)['crop_out']).out_port(0) else: - init_value_mem_out = create_zero_value_with_batch_from_input(in_node_port, context_len, np.int32) + init_value_mem_out = create_const_with_batch_from_input(in_node_port, context_len, precision=np.int32) mem_out = ReadValue(graph, {'name': 'iteration_number', 'variable_id': 'iteration_'+node.name}).create_node() mem_out.in_port(0).connect(init_value_mem_out.out_port(0)) cut_first = Crop(graph, {'name': 'cut_first', 'axis': int64_array([1]), 'offset': int64_array([1]), 'dim': int64_array([context_len-1])}).create_node() cut_first.in_port(0).connect(mem_out.out_port(0)) - ones = Const(graph, {'name': 'ones', 'value': np.ones([1, 1], dtype=np.int32)}).create_node() + ones = create_const_with_batch_from_input(in_node_port, 1, 1, np.int32) concat = Concat(graph, {'name': 'concat_ones', 'in_ports_count': 2, 'axis': 1}).create_node() concat.in_port(0).connect(cut_first.out_port(0)) concat.in_port(1).connect(ones.out_port(0)) diff --git a/model-optimizer/extensions/middle/InsertSelect_test.py b/model-optimizer/extensions/middle/InsertSelect_test.py index 46927000fe194d..db783576de9ab5 100644 --- a/model-optimizer/extensions/middle/InsertSelect_test.py +++ b/model-optimizer/extensions/middle/InsertSelect_test.py @@ -15,12 +15,12 @@ class InsertSelectTests(unittest.TestCase): # graph have no splices - selects should not be inserted def test_insert_select_0(self): - graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + graph = build_graph({ + 'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'memory': {'kind': 'op', 'op': 'Assign'}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 'placeholder_data_1'), ('placeholder_data_1', 'memory') ], nodes_with_edges_only=True) @@ -32,8 +32,8 @@ def test_insert_select_0(self): # graph contains 1 splice with context length 5, should be inserted select with memory as counter with length 5 def test_insert_select_1(self): - graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + graph = build_graph({ + 'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'splice_1': {'kind': 'op', 'op': 'Splice', 'context': np.array([-2, -1, 0, 1, 2])}, 'splice_data_1': {'kind': 'data', 'shape': [1, 13]}, @@ -41,35 +41,53 @@ def test_insert_select_1(self): 'placeholder_data_2': {'kind': 'data', 'shape': [1, 26]}, 'memory': {'kind': 'op', 'op': 'Assign', 'index': 0}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 'placeholder_data_1'), ('placeholder_data_1', 'splice_1'), ('splice_1', 'splice_data_1'), ('splice_data_1', 'placeholder_2'), ('placeholder_2', 'placeholder_data_2'), ('placeholder_data_2', 'memory') ], nodes_with_edges_only=True) 
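(The new reference graphs in these tests spell out the subgraph that create_const_with_batch_from_input builds in place of the old fixed-shape Const: ShapeOf -> Crop of the batch dimension -> Concat with the fixed second dimension -> Broadcast of the fill value. Below is a minimal NumPy sketch of what that subgraph computes at run time, assuming a 2-D [batch, features] producer; apart from the helper's name, nothing in the sketch is Model Optimizer API.)

import numpy as np

def reshapable_fill(producer: np.ndarray, second_dim: int, value=0, precision=np.float32):
    shape = np.array(producer.shape, dtype=np.int64)                            # ShapeOf
    batch = shape[0:1]                                                          # Crop: keep dim 0 only
    target = np.concatenate((batch, np.array([second_dim], dtype=np.int64)))    # Concat: [batch, second_dim]
    return np.broadcast_to(np.array([value], dtype=precision), target).copy()   # Broadcast of the fill value

reshapable_fill(np.zeros((1, 13)), 5).shape   # (1, 5) -- same as the old zeros Const
reshapable_fill(np.zeros((8, 13)), 5).shape   # (8, 5) -- follows the actual batch, so the graph stays reshapable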
AddSelectBeforeMemoryNodePattern().find_and_replace_pattern(graph) - ref_graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + ref_graph = build_graph({ + 'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'splice_1': {'kind': 'op', 'op': 'Splice', 'context': np.array([-2, -1, 0, 1, 2])}, 'splice_data_1': {'kind': 'data', 'shape': [1, 13]}, 'placeholder_2': {'kind': 'op', 'op': None}, + 'second_dim_mem_1': {'kind': 'op', 'op': 'Const', 'value': int64_array([5])}, + 'second_dim_data_mem_1': {'kind': 'data'}, + 'gather_shape_mem_1': {'kind': 'op', 'op': 'Concat'}, + 'gather_shape_data_mem_1': {'kind': 'data'}, + 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data': {'kind': 'data'}, + 'broadcast_mem_1': {'kind': 'op', 'op': 'Broadcast'}, + 'broadcast_data_mem_1': {'kind': 'data'}, + 'shape': {'kind': 'op', 'op': 'ShapeOf'}, 'shape_data': {'kind': 'data'}, 'crop_batch': {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}, 'crop_batch_data': {'kind': 'data'}, - 'crop_batch_dim':{'kind': 'op', 'op': 'Const', 'value': int64_array([1])}, + 'crop_batch_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([1])}, 'crop_batch_dim_data': {'kind': 'data'}, 'second_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([5])}, 'second_dim_data': {'kind': 'data'}, 'gather_shape': {'kind': 'op', 'op': 'Concat'}, 'gather_shape_data': {'kind': 'data'}, - 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, - 'fill_value_data': {'kind': 'data'}, + 'fill_value_ones': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data_ones': {'kind': 'data'}, 'broadcast': {'kind': 'op', 'op': 'Broadcast'}, 'broadcast_data': {'kind': 'data'}, + 'second_dim_mem_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([26])}, + 'second_dim_data_mem_2': {'kind': 'data'}, + 'gather_shape_mem_2': {'kind': 'op', 'op': 'Concat'}, + 'gather_shape_data_mem_2': {'kind': 'data'}, + 'fill_value_ones_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data_ones_2': {'kind': 'data'}, + 'broadcast_mem_2': {'kind': 'op', 'op': 'Broadcast'}, + 'broadcast_data_mem_2': {'kind': 'data'}, + 'memory_in': {'kind': 'op', 'op': 'ReadValue', 'shape': int64_array([5])}, 'memory_in_data': {'kind': 'data'}, 'memory_out': {'kind': 'op', 'op': 'Assign', 'shape': int64_array([5])}, @@ -85,40 +103,58 @@ def test_insert_select_1(self): 'select_out_data': {'kind': 'data', 'shape': [1, 26]}, 'const_0': {'kind': 'op', 'op': 'Const'}, 'const_0_data': {'kind': 'data'}, - 'const_1': {'kind': 'op', 'op': 'Const'}, - 'const_1_data': {'kind': 'data'}, 'concat': {'kind': 'op', 'op': 'Concat'}, 'concat_data': {'kind': 'data'}, 'placeholder_data_2': {'kind': 'data', 'shape': [1, 26]}, 'memory': {'kind': 'op', 'op': 'Assign'}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 'placeholder_data_1'), ('placeholder_data_1', 'splice_1'), ('splice_1', 'splice_data_1'), ('splice_data_1', 'placeholder_2'), ('placeholder_2', 'placeholder_data_2'), - ('placeholder_data_2', 'select', {'in': 1}), + ('placeholder_data_2', 'select', {'in': 1}), - ('placeholder_data_2', 'shape'), ('shape', 'shape_data'), - ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), - ('crop_batch_dim', 'crop_batch_dim_data'), - ('crop_batch_dim_data', 'crop_batch', {'in': 1}), - ('second_dim', 'second_dim_data'), 
('second_dim_data', 'gather_shape', {'in': 1}), - ('crop_batch_data', 'gather_shape', {'in': 0}), ('gather_shape', 'gather_shape_data'), - ('fill_value', 'fill_value_data'), ('fill_value_data', 'broadcast', {'in': 0}), - ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), - ('broadcast_data', 'memory_in'), + ('second_dim_mem_1', 'second_dim_data_mem_1'), + ('second_dim_data_mem_1', 'gather_shape_mem_1', {'in': 1}), + ('crop_batch_data', 'gather_shape_mem_1', {'in': 0}), + ('gather_shape_mem_1', 'gather_shape_data_mem_1'), + ('fill_value', 'fill_value_data'), + ('fill_value_data', 'broadcast_mem_1', {'in': 0}), + ('gather_shape_data_mem_1', 'broadcast_mem_1', {'in': 1}), + ('broadcast_mem_1', 'broadcast_data_mem_1'), + ('broadcast_data_mem_1', 'memory_in'), ('memory_in', 'memory_in_data'), ('memory_in_data', 'crop_in'), ('crop_in', 'crop_in_data'), ('crop_in_data', 'concat', {'in': 0}), - ('const_1', 'const_1_data'), ('const_1_data', 'concat', {'in': 1}), + + ('second_dim_mem_2', 'second_dim_data_mem_2'), + ('second_dim_data_mem_2', 'gather_shape_mem_2', {'in': 1}), + ('crop_batch_data', 'gather_shape_mem_2', {'in': 0}), + ('gather_shape_mem_2', 'gather_shape_data_mem_2'), + ('fill_value_ones_2', 'fill_value_data_ones_2'), + ('fill_value_data_ones_2', 'broadcast_mem_2', {'in': 0}), + ('gather_shape_data_mem_2', 'broadcast_mem_2', {'in': 1}), + ('broadcast_mem_2', 'broadcast_data_mem_2'), + ('broadcast_data_mem_2', 'concat', {'in': 1}), + ('concat', 'concat_data'), ('concat_data', 'memory_out'), ('memory_out', 'memory_out_data'), ('memory_out_data', 'result'), ('concat_data', 'crop_out'), ('crop_out', 'crop_out_data'), - ('crop_out_data', 'equal', {'in': 1}), ('const_1_data', 'equal', {'in': 0}), + ('crop_out_data', 'equal', {'in': 1}), ('broadcast_data_mem_2', 'equal', {'in': 0}), ('equal', 'equal_data'), ('equal_data', 'select', {'in': 0}), - ('const_0', 'const_0_data'), ('const_0_data', 'select', {'in': 2}), + ('placeholder_data_2', 'shape'), ('shape', 'shape_data'), + ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), + ('crop_batch_dim', 'crop_batch_dim_data'), + ('crop_batch_dim_data', 'crop_batch', {'in': 1}), + ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), + ('crop_batch_data', 'gather_shape', {'in': 0}), ('gather_shape', 'gather_shape_data'), + ('fill_value_ones', 'fill_value_data_ones'), + ('fill_value_data_ones', 'broadcast', {'in': 0}), + ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), + ('broadcast_data', 'select', {'in': 2}), + ('select', 'select_out_data'), ('select_out_data', 'memory') ], @@ -131,8 +167,8 @@ def test_insert_select_1(self): # graph contains 1 splice with context length 5 on the path to memory and 1 out of path, # should be inserted select with memory as counter with length 5 def test_insert_select_2(self): - graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + graph = build_graph({ + 'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'splice_1': {'kind': 'op', 'op': 'Splice', 'context': np.array([-2, -1, 0, 1, 2])}, 'splice_data_1': {'kind': 'data', 'shape': [1, 65]}, @@ -142,7 +178,7 @@ def test_insert_select_2(self): 'placeholder_data_2': {'kind': 'data', 'shape': [1, 26]}, 'memory': {'kind': 'op', 'op': 'Assign'}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 
'placeholder_data_1'), ('placeholder_data_1', 'splice_1'), ('splice_1', 'splice_data_1'), ('placeholder_data_1', 'splice_2'), ('splice_2', 'splice_data_2'), ('splice_data_1', 'placeholder_2'), ('placeholder_2', 'placeholder_data_2'), @@ -150,8 +186,8 @@ def test_insert_select_2(self): ], nodes_with_edges_only=True) AddSelectBeforeMemoryNodePattern().find_and_replace_pattern(graph) - ref_graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + ref_graph = build_graph({ + 'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'splice_1': {'kind': 'op', 'op': 'Splice', 'context': np.array([-2, -1, 0, 1, 2])}, 'splice_data_1': {'kind': 'data', 'shape': [1, 65]}, @@ -159,6 +195,15 @@ def test_insert_select_2(self): 'splice_data_2': {'kind': 'data', 'shape': [1, 39]}, 'placeholder_2': {'kind': 'op', 'op': None}, + 'second_dim_mem_1': {'kind': 'op', 'op': 'Const', 'value': int64_array([5])}, + 'second_dim_data_mem_1': {'kind': 'data'}, + 'gather_shape_mem_1': {'kind': 'op', 'op': 'Concat'}, + 'gather_shape_data_mem_1': {'kind': 'data'}, + 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data': {'kind': 'data'}, + 'broadcast_mem_1': {'kind': 'op', 'op': 'Broadcast'}, + 'broadcast_data_mem_1': {'kind': 'data'}, + 'shape': {'kind': 'op', 'op': 'ShapeOf'}, 'shape_data': {'kind': 'data'}, 'crop_batch': {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}, @@ -169,14 +214,23 @@ def test_insert_select_2(self): 'second_dim_data': {'kind': 'data'}, 'gather_shape': {'kind': 'op', 'op': 'Concat'}, 'gather_shape_data': {'kind': 'data'}, - 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, - 'fill_value_data': {'kind': 'data'}, + 'fill_value_ones': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data_ones': {'kind': 'data'}, 'broadcast': {'kind': 'op', 'op': 'Broadcast'}, 'broadcast_data': {'kind': 'data'}, - 'memory_in': {'kind': 'op', 'op': 'ReadValue'}, + 'second_dim_mem_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([26])}, + 'second_dim_data_mem_2': {'kind': 'data'}, + 'gather_shape_mem_2': {'kind': 'op', 'op': 'Concat'}, + 'gather_shape_data_mem_2': {'kind': 'data'}, + 'fill_value_ones_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data_ones_2': {'kind': 'data'}, + 'broadcast_mem_2': {'kind': 'op', 'op': 'Broadcast'}, + 'broadcast_data_mem_2': {'kind': 'data'}, + + 'memory_in': {'kind': 'op', 'op': 'ReadValue', 'shape': int64_array([5])}, 'memory_in_data': {'kind': 'data'}, - 'memory_out': {'kind': 'op', 'op': 'Assign'}, + 'memory_out': {'kind': 'op', 'op': 'Assign', 'shape': int64_array([5])}, 'memory_out_data': {'kind': 'data'}, 'result': {'kind': 'op', 'op': 'Result'}, 'crop_in': {'kind': 'op', 'op': 'Crop', 'axis': 1, 'offset': 1, 'dim': 4}, @@ -189,55 +243,72 @@ def test_insert_select_2(self): 'select_out_data': {'kind': 'data', 'shape': [1, 26]}, 'const_0': {'kind': 'op', 'op': 'Const'}, 'const_0_data': {'kind': 'data'}, - 'const_1': {'kind': 'op', 'op': 'Const'}, - 'const_1_data': {'kind': 'data'}, 'concat': {'kind': 'op', 'op': 'Concat'}, 'concat_data': {'kind': 'data'}, 'placeholder_data_2': {'kind': 'data', 'shape': [1, 26]}, 'memory': {'kind': 'op', 'op': 'Assign'}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 'placeholder_data_1'), ('placeholder_data_1', 'splice_1'), ('splice_1', 'splice_data_1'), 
('placeholder_data_1', 'splice_2'), ('splice_2', 'splice_data_2'), ('splice_data_1', 'placeholder_2'), ('placeholder_2', 'placeholder_data_2'), ('placeholder_data_2', 'select', {'in': 1}), - ('placeholder_data_2', 'shape'), ('shape', 'shape_data'), - ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), - ('crop_batch_dim', 'crop_batch_dim_data'), - ('crop_batch_dim_data', 'crop_batch', {'in': 1}), - ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), - ('crop_batch_data', 'gather_shape', {'in': 0}), ('gather_shape', 'gather_shape_data'), - ('fill_value', 'fill_value_data'), ('fill_value_data', 'broadcast', {'in': 0}), - ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), - ('broadcast_data', 'memory_in'), + ('second_dim_mem_1', 'second_dim_data_mem_1'), + ('second_dim_data_mem_1', 'gather_shape_mem_1', {'in': 1}), + ('crop_batch_data', 'gather_shape_mem_1', {'in': 0}), + ('gather_shape_mem_1', 'gather_shape_data_mem_1'), + ('fill_value', 'fill_value_data'), + ('fill_value_data', 'broadcast_mem_1', {'in': 0}), + ('gather_shape_data_mem_1', 'broadcast_mem_1', {'in': 1}), + ('broadcast_mem_1', 'broadcast_data_mem_1'), + ('broadcast_data_mem_1', 'memory_in'), ('memory_in', 'memory_in_data'), ('memory_in_data', 'crop_in'), ('crop_in', 'crop_in_data'), ('crop_in_data', 'concat', {'in': 0}), - ('const_1', 'const_1_data'), ('const_1_data', 'concat', {'in': 1}), + + ('second_dim_mem_2', 'second_dim_data_mem_2'), + ('second_dim_data_mem_2', 'gather_shape_mem_2', {'in': 1}), + ('crop_batch_data', 'gather_shape_mem_2', {'in': 0}), + ('gather_shape_mem_2', 'gather_shape_data_mem_2'), + ('fill_value_ones_2', 'fill_value_data_ones_2'), + ('fill_value_data_ones_2', 'broadcast_mem_2', {'in': 0}), + ('gather_shape_data_mem_2', 'broadcast_mem_2', {'in': 1}), + ('broadcast_mem_2', 'broadcast_data_mem_2'), + ('broadcast_data_mem_2', 'concat', {'in': 1}), + ('concat', 'concat_data'), ('concat_data', 'memory_out'), ('memory_out', 'memory_out_data'), ('memory_out_data', 'result'), ('concat_data', 'crop_out'), ('crop_out', 'crop_out_data'), - ('crop_out_data', 'equal', {'in': 1}), ('const_1_data', 'equal', {'in': 0}), + ('crop_out_data', 'equal', {'in': 1}), ('broadcast_data_mem_2', 'equal', {'in': 0}), ('equal', 'equal_data'), ('equal_data', 'select', {'in': 0}), - ('const_0', 'const_0_data'), ('const_0_data', 'select', {'in': 2}), + + ('placeholder_data_2', 'shape'), ('shape', 'shape_data'), + ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), + ('crop_batch_dim', 'crop_batch_dim_data'), + ('crop_batch_dim_data', 'crop_batch', {'in': 1}), + ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), + ('crop_batch_data', 'gather_shape', {'in': 0}), ('gather_shape', 'gather_shape_data'), + ('fill_value_ones', 'fill_value_data_ones'), + ('fill_value_data_ones', 'broadcast', {'in': 0}), + ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), + ('broadcast_data', 'select', {'in': 2}), ('select', 'select_out_data'), ('select_out_data', 'memory') ], nodes_with_edges_only=True ) - (flag, resp) = compare_graphs(graph, ref_graph, 'memory') self.assertTrue(flag, resp) # graph contains 2 splices with sum context length 8 on the path to memory, # should be inserted select with memory as counter with length 7 def test_insert_select_3(self): - graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + graph = build_graph({ + 
'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'splice_1': {'kind': 'op', 'op': 'Splice', 'context': np.array([-2, -1, 0, 1, 2])}, 'splice_data_1': {'kind': 'data', 'shape': [1, 65]}, @@ -247,7 +318,7 @@ def test_insert_select_3(self): 'placeholder_data_2': {'kind': 'data', 'shape': [1, 26]}, 'memory': {'kind': 'op', 'op': 'Assign', 'index': 0}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 'placeholder_data_1'), ('placeholder_data_1', 'splice_1'), ('splice_1', 'splice_data_1'), ('splice_data_1', 'splice_2'), ('splice_2', 'splice_data_2'), ('splice_data_2', 'placeholder_2'), ('placeholder_2', 'placeholder_data_2'), @@ -255,8 +326,8 @@ def test_insert_select_3(self): ], nodes_with_edges_only=True) AddSelectBeforeMemoryNodePattern().find_and_replace_pattern(graph) - ref_graph = build_graph({'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'placeholder_1': {'kind': 'op', 'op': None}, + ref_graph = build_graph({ + 'placeholder_1': {'kind': 'op', 'op': 'Parameter'}, 'placeholder_data_1': {'kind': 'data', 'shape': [1, 13]}, 'splice_1': {'kind': 'op', 'op': 'Splice', 'context': np.array([-2, -1, 0, 1, 2])}, 'splice_data_1': {'kind': 'data', 'shape': [1, 65]}, @@ -264,27 +335,45 @@ def test_insert_select_3(self): 'splice_data_2': {'kind': 'data', 'shape': [1, 39]}, 'placeholder_2': {'kind': 'op', 'op': None}, + 'second_dim_mem_1': {'kind': 'op', 'op': 'Const', 'value': int64_array([5])}, + 'second_dim_data_mem_1': {'kind': 'data'}, + 'gather_shape_mem_1': {'kind': 'op', 'op': 'Concat'}, + 'gather_shape_data_mem_1': {'kind': 'data'}, + 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data': {'kind': 'data'}, + 'broadcast_mem_1': {'kind': 'op', 'op': 'Broadcast'}, + 'broadcast_data_mem_1': {'kind': 'data'}, + 'shape': {'kind': 'op', 'op': 'ShapeOf'}, 'shape_data': {'kind': 'data'}, 'crop_batch': {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}, 'crop_batch_data': {'kind': 'data'}, 'crop_batch_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([1])}, 'crop_batch_dim_data': {'kind': 'data'}, - 'second_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([7])}, + 'second_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([5])}, 'second_dim_data': {'kind': 'data'}, 'gather_shape': {'kind': 'op', 'op': 'Concat'}, 'gather_shape_data': {'kind': 'data'}, - 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, - 'fill_value_data': {'kind': 'data'}, + 'fill_value_ones': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data_ones': {'kind': 'data'}, 'broadcast': {'kind': 'op', 'op': 'Broadcast'}, 'broadcast_data': {'kind': 'data'}, - 'memory_in': {'kind': 'op', 'op': 'ReadValue'}, + 'second_dim_mem_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([26])}, + 'second_dim_data_mem_2': {'kind': 'data'}, + 'gather_shape_mem_2': {'kind': 'op', 'op': 'Concat'}, + 'gather_shape_data_mem_2': {'kind': 'data'}, + 'fill_value_ones_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, + 'fill_value_data_ones_2': {'kind': 'data'}, + 'broadcast_mem_2': {'kind': 'op', 'op': 'Broadcast'}, + 'broadcast_data_mem_2': {'kind': 'data'}, + + 'memory_in': {'kind': 'op', 'op': 'ReadValue', 'shape': int64_array([5])}, 'memory_in_data': {'kind': 'data'}, - 'memory_out': {'kind': 'op', 'op': 'Assign'}, + 'memory_out': {'kind': 'op', 'op': 'Assign', 'shape': int64_array([5])}, 'memory_out_data': {'kind': 'data'}, 
'result': {'kind': 'op', 'op': 'Result'}, - 'crop_in': {'kind': 'op', 'op': 'Crop', 'axis': 1, 'offset': 1, 'dim': 6}, + 'crop_in': {'kind': 'op', 'op': 'Crop', 'axis': 1, 'offset': 1, 'dim': 4}, 'crop_in_data': {'kind': 'data'}, 'crop_out': {'kind': 'op', 'op': 'Crop', 'axis': 1, 'offset': 0, 'dim': 1}, 'crop_out_data': {'kind': 'data'}, @@ -294,40 +383,58 @@ def test_insert_select_3(self): 'select_out_data': {'kind': 'data', 'shape': [1, 26]}, 'const_0': {'kind': 'op', 'op': 'Const'}, 'const_0_data': {'kind': 'data'}, - 'const_1': {'kind': 'op', 'op': 'Const'}, - 'const_1_data': {'kind': 'data'}, 'concat': {'kind': 'op', 'op': 'Concat'}, 'concat_data': {'kind': 'data'}, 'placeholder_data_2': {'kind': 'data', 'shape': [1, 26]}, 'memory': {'kind': 'op', 'op': 'Assign', 'index': 0}, }, - [('in_node', 'placeholder_1'), ('placeholder_1', 'placeholder_data_1'), + [('placeholder_1', 'placeholder_data_1'), ('placeholder_data_1', 'splice_1'), ('splice_1', 'splice_data_1'), ('splice_data_1', 'splice_2'), ('splice_2', 'splice_data_2'), ('splice_data_2', 'placeholder_2'), ('placeholder_2', 'placeholder_data_2'), ('placeholder_data_2', 'select', {'in': 1}), - ('placeholder_data_2', 'shape'), ('shape', 'shape_data'), - ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), - ('crop_batch_dim', 'crop_batch_dim_data'), - ('crop_batch_dim_data', 'crop_batch', {'in': 1}), - ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), - ('crop_batch_data', 'gather_shape', {'in': 0}), ('gather_shape', 'gather_shape_data'), - ('fill_value', 'fill_value_data'), ('fill_value_data', 'broadcast', {'in': 0}), - ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), - ('broadcast_data', 'memory_in'), + ('second_dim_mem_1', 'second_dim_data_mem_1'), + ('second_dim_data_mem_1', 'gather_shape_mem_1', {'in': 1}), + ('crop_batch_data', 'gather_shape_mem_1', {'in': 0}), + ('gather_shape_mem_1', 'gather_shape_data_mem_1'), + ('fill_value', 'fill_value_data'), + ('fill_value_data', 'broadcast_mem_1', {'in': 0}), + ('gather_shape_data_mem_1', 'broadcast_mem_1', {'in': 1}), + ('broadcast_mem_1', 'broadcast_data_mem_1'), + ('broadcast_data_mem_1', 'memory_in'), ('memory_in', 'memory_in_data'), ('memory_in_data', 'crop_in'), ('crop_in', 'crop_in_data'), ('crop_in_data', 'concat', {'in': 0}), - ('const_1', 'const_1_data'), ('const_1_data', 'concat', {'in': 1}), + + ('second_dim_mem_2', 'second_dim_data_mem_2'), + ('second_dim_data_mem_2', 'gather_shape_mem_2', {'in': 1}), + ('crop_batch_data', 'gather_shape_mem_2', {'in': 0}), + ('gather_shape_mem_2', 'gather_shape_data_mem_2'), + ('fill_value_ones_2', 'fill_value_data_ones_2'), + ('fill_value_data_ones_2', 'broadcast_mem_2', {'in': 0}), + ('gather_shape_data_mem_2', 'broadcast_mem_2', {'in': 1}), + ('broadcast_mem_2', 'broadcast_data_mem_2'), + ('broadcast_data_mem_2', 'concat', {'in': 1}), + ('concat', 'concat_data'), ('concat_data', 'memory_out'), ('memory_out', 'memory_out_data'), ('memory_out_data', 'result'), ('concat_data', 'crop_out'), ('crop_out', 'crop_out_data'), - ('crop_out_data', 'equal', {'in': 1}), ('const_1_data', 'equal', {'in': 0}), + ('crop_out_data', 'equal', {'in': 1}), ('broadcast_data_mem_2', 'equal', {'in': 0}), ('equal', 'equal_data'), ('equal_data', 'select', {'in': 0}), - ('const_0', 'const_0_data'), ('const_0_data', 'select', {'in': 2}), + + ('placeholder_data_2', 'shape'), ('shape', 'shape_data'), + ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), + ('crop_batch_dim', 
'crop_batch_dim_data'), + ('crop_batch_dim_data', 'crop_batch', {'in': 1}), + ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), + ('crop_batch_data', 'gather_shape', {'in': 0}), ('gather_shape', 'gather_shape_data'), + ('fill_value_ones', 'fill_value_data_ones'), + ('fill_value_data_ones', 'broadcast', {'in': 0}), + ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), + ('broadcast_data', 'select', {'in': 2}), ('select', 'select_out_data'), ('select_out_data', 'memory') diff --git a/model-optimizer/extensions/middle/MakeKaldiConstReshapable.py b/model-optimizer/extensions/middle/MakeKaldiConstReshapable.py new file mode 100644 index 00000000000000..9e42d74c205318 --- /dev/null +++ b/model-optimizer/extensions/middle/MakeKaldiConstReshapable.py @@ -0,0 +1,118 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np + +from mo.front.common.partial_infer.utils import int64_array +from mo.front.tf.graph_utils import create_op_node_with_second_input, create_op_with_const_inputs +from mo.graph.graph import Graph, Port +from mo.middle.replacement import MiddleReplacementPattern +from mo.ops.broadcast import Broadcast +from mo.ops.concat import Concat +from mo.ops.crop import Crop +from mo.ops.shape import Shape + + +def create_const_with_batch_from_input(producer_port: Port, second_dim, value=0, precision=np.float32): + """ + Create const with batch taken from input_out_port and second dimension equals second_dim + :param producer_port: take batch from this port + :param second_dim: second dimension for created constant + :param value: value to initialize constant + :param precision: precision for constant + :return created constant node + """ + graph = producer_port.node.graph + input_name = producer_port.node.soft_get('name', producer_port.node.id) + + shape_of_input = None + for dest in producer_port.get_destinations(): + if dest.node.soft_get('op') == "ShapeOf": + shape_of_input = dest.node + break + + if shape_of_input is None: + shape_of_input = Shape(graph, {'name': input_name + '/Shape'}).create_node() + shape_of_input.in_port(0).connect(producer_port) + + get_batch = None + for dest in shape_of_input.out_port(0).get_destinations(): + if dest.node.soft_get('op') == "Crop" and \ + dest.node.in_port(1).get_source().node.soft_get('value', []) == int64_array([1]): + get_batch = dest.node + break + + if get_batch is None: + get_batch = create_op_node_with_second_input(graph, Crop, int64_array([1]), + {'name': shape_of_input.name + '/Crop', + 'axis': int64_array([0]), 'offset': int64_array([0])}, + shape_of_input) + + mem_shape = None + for dest in get_batch.out_port(0).get_destinations(): + if dest.node.soft_get('op') == "Concat" and \ + dest.node.in_port(1).get_source().node.soft_get('value', []) == int64_array([second_dim]): + mem_shape = dest.node + break + + if mem_shape is None: + mem_shape = create_op_node_with_second_input(graph, Concat, int64_array([second_dim]), + {'name': get_batch.name + '/Concat', 'axis': 0, + 'in_ports_count': 2}, get_batch) + + init_value_prev_lstm_output = None + for dest in mem_shape.out_port(0).get_destinations(): + if dest.node.soft_get('op') == "Broadcast" and \ + dest.node.in_port(1).get_source().node.soft_get('value', []) == np.array([value], dtype=precision): + init_value_prev_lstm_output = dest.node + break + + if init_value_prev_lstm_output is None: + init_value_prev_lstm_output = create_op_with_const_inputs(graph, Broadcast, + {0: 
np.array([value], dtype=precision)}, + {'name': mem_shape.name + '/Broadcast'}) + init_value_prev_lstm_output.in_port(1).connect(mem_shape.out_port(0)) + + return init_value_prev_lstm_output + + +class MakeKaldiConstReshapable(MiddleReplacementPattern): + """ + Add broadcasting of constant nodes based on batch from Parameter node. This approach works only for Kaldi, + because it has the same batch in whole graph due to framework specific. + """ + enabled = True + graph_condition = [lambda graph: graph.graph['fw'] == "kaldi"] + + def run_after(self): + from extensions.middle.InsertSelect import AddSelectBeforeMemoryNodePattern + from extensions.middle.ReplaceMemoryOffsetWithSplice import ReplaceMemoryOffsetWithMemoryNodePattern + from extensions.middle.ReplaceSpliceNodePattern import ReplaceSpliceNodePattern + return [AddSelectBeforeMemoryNodePattern, ReplaceMemoryOffsetWithMemoryNodePattern, + ReplaceSpliceNodePattern] + + def find_and_replace_pattern(self, graph: Graph): + params = graph.get_op_nodes(op="Parameter") + batch = params[0].shape[0] + + # check that all Parameters have the same batch + for p in params: + assert p.shape[0] == batch, \ + "Parameter {} has batch different from the {}".format(p.soft_get('name', p.id), + params[0].soft_get('name', params[0].id)) + + # make constants for initialization of ReadValue reshapable + for read in graph.get_op_nodes(op='ReadValue'): + input_node = read.in_port(0).get_source().node + if input_node.soft_get('op') == "Const": + const_shape = input_node.out_port(0).data.get_shape() + # extra check to be sure that we don't break shapes compatibility in graph + # in Kaldi models we have only 2 dimensions + # and batch should be set the same as we will get from Parameter + # otherwise just skip such node + if len(const_shape) != 2 or const_shape[0] != batch: + continue + new_const = create_const_with_batch_from_input(params[0].out_port(0), + const_shape[1], + value=input_node.value[0], precision=input_node.data_type) + input_node.out_port(0).get_connection().set_source(new_const.out_port(0)) diff --git a/model-optimizer/extensions/middle/MakeKaldiConstReshapable_test.py b/model-optimizer/extensions/middle/MakeKaldiConstReshapable_test.py new file mode 100644 index 00000000000000..db0389829325be --- /dev/null +++ b/model-optimizer/extensions/middle/MakeKaldiConstReshapable_test.py @@ -0,0 +1,104 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import unittest + +import numpy as np + +from extensions.middle.MakeKaldiConstReshapable import MakeKaldiConstReshapable +from mo.front.common.partial_infer.utils import int64_array +from mo.utils.ir_engine.compare_graphs import compare_graphs +from mo.utils.unittest.graph import build_graph, result, regular_op_with_shaped_data, connect + +nodes = { + **regular_op_with_shaped_data('placeholder_1', [1, 13], {'kind': 'op', 'op': 'Parameter', 'shape': [1, 13]}), + **regular_op_with_shaped_data('splice_1', [1, 13], {'kind': 'op', 'op': 'Splice', + 'context': np.array([-2, -1, 0, 1, 2])}), + **regular_op_with_shaped_data('placeholder_2', [1, 26], {'kind': 'op', 'op': None}), + **regular_op_with_shaped_data('memory_in', [1, 5], {'kind': 'op', 'op': 'ReadValue', + 'shape': int64_array([1, 5])}), + **regular_op_with_shaped_data('memory_out', [1, 5], {'kind': 'op', 'op': 'Assign', 'shape': int64_array([1, 5])}), + **result('result'), + **regular_op_with_shaped_data('crop_in', [1, 4], {'kind': 'op', 'op': 'Crop', 'axis': 1, 'offset': 1, 'dim': 4}), + 
**regular_op_with_shaped_data('crop_out', [1, 1], {'kind': 'op', 'op': 'Crop', 'axis': 1, 'offset': 0, 'dim': 1}), + **regular_op_with_shaped_data('equal', [1, 1], {'kind': 'op', 'op': 'Equal'}), + **regular_op_with_shaped_data('select', [1, 26], {'kind': 'op', 'op': 'Select'}), + **regular_op_with_shaped_data('const_0', [1, 1], {'kind': 'op', 'op': 'Const', 'shape': [1, 1], + 'value': [0], 'data_type': np.float32}), + **regular_op_with_shaped_data('const_1', [1, 1], {'kind': 'op', 'op': 'Const', 'shape': [1, 1], + 'value': [0], 'data_type': np.float32}), + **regular_op_with_shaped_data('concat', [1, 5], {'kind': 'op', 'op': 'Concat'}), + **regular_op_with_shaped_data('memory', [1, 26], {'kind': 'op', 'op': 'Assign'}), + + **regular_op_with_shaped_data('shape', None, {'kind': 'op', 'op': 'ShapeOf'}), + **regular_op_with_shaped_data('crop_batch', None, {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}), + **regular_op_with_shaped_data('crop_batch_dim', None, {'kind': 'op', 'op': 'Const', 'shape': [1], + 'value': [1], 'data_type': np.int64}), + **regular_op_with_shaped_data('second_dim', None, {'kind': 'op', 'op': 'Const', 'shape': [1], + 'value': [5], 'data_type': np.int64}), + **regular_op_with_shaped_data('gather_shape', None, {'kind': 'op', 'op': 'Concat'}), + **regular_op_with_shaped_data('fill_value', [1, 5], {'kind': 'op', 'op': 'Const', 'shape': [1, 5], + 'value': np.zeros([1, 5]), 'data_type': np.float32}), + **regular_op_with_shaped_data('fill_value_2', None, {'kind': 'op', 'op': 'Const', 'shape': [1], + 'value': [0], 'data_type': np.float32}), + **regular_op_with_shaped_data('broadcast', [1, 5], {'kind': 'op', 'op': 'Broadcast'}), + + **regular_op_with_shaped_data('fill_value_ones', [1, 26], {'kind': 'op', 'op': 'Const', 'shape': [1, 26], + 'value': np.zeros([1, 26]), 'data_type': np.int64}), + **regular_op_with_shaped_data('fill_value_ones_2', [1, 1], {'kind': 'op', 'op': 'Const', 'shape': [1, 1], + 'value': [1], 'data_type': np.int64}), +} + + +class MakeKaldiConstReshapableTests(unittest.TestCase): + + # graph contains 1 splice with context length 5, should be inserted select with memory as counter with length 5 + def test_reshapable_const(self): + graph = build_graph(nodes, + [*connect('placeholder_1', 'splice_1'), + *connect('splice_1', 'placeholder_2'), + *connect('placeholder_2', '1:select'), + *connect('fill_value', 'memory_in'), + *connect('memory_in', 'crop_in'), + *connect('crop_in', '0:concat'), + *connect('fill_value_ones_2:0', '1:concat'), + *connect('concat', 'memory_out'), + *connect('memory_out', 'result'), + *connect('concat', 'crop_out'), + *connect('crop_out', '1:equal'), + *connect('fill_value_ones_2:0', '0:equal'), + *connect('equal', '0:select'), + *connect('fill_value_ones', '2:select'), + *connect('select', 'memory') + ], + nodes_with_edges_only=True) + graph.strict_mode = False + MakeKaldiConstReshapable().find_and_replace_pattern(graph) + ref_graph = build_graph(nodes, + [*connect('placeholder_1:0', 'splice_1'), + *connect('splice_1', 'placeholder_2'), + *connect('placeholder_2', '1:select'), + *connect('placeholder_1:0', 'shape', skip_data=True), + *connect('shape', '0:crop_batch'), + *connect('crop_batch_dim', '1:crop_batch'), + *connect('second_dim', '1:gather_shape'), + *connect('crop_batch', '0:gather_shape'), + *connect('fill_value_2', '0:broadcast'), + *connect('gather_shape', '1:broadcast'), + *connect('broadcast', 'memory_in'), + *connect('memory_in', 'crop_in'), + *connect('crop_in', '0:concat'), + *connect('fill_value_ones_2', 
'1:concat'), + *connect('concat', 'memory_out'), + *connect('memory_out', 'result'), + *connect('concat', 'crop_out'), + *connect('crop_out', '1:equal'), + *connect('fill_value_ones_2', '0:equal'), + *connect('equal', '0:select'), + *connect('const_0', '2:select'), + *connect('fill_value_ones', '2:select'), + *connect('select', 'memory') + ], nodes_with_edges_only=True) + + (flag, resp) = compare_graphs(graph, ref_graph, 'memory') + self.assertTrue(flag, resp) diff --git a/model-optimizer/extensions/middle/MarkSubgraphsWithCorrectLayout.py b/model-optimizer/extensions/middle/MarkSubgraphsWithCorrectLayout.py index 2ec24fbba034a1..5017d3e4660a9c 100644 --- a/model-optimizer/extensions/middle/MarkSubgraphsWithCorrectLayout.py +++ b/model-optimizer/extensions/middle/MarkSubgraphsWithCorrectLayout.py @@ -3,14 +3,12 @@ import logging as log from collections import deque - from typing import Set from extensions.middle.InsertLayoutPropagationTransposes import InsertLayoutPropagationTranspose, \ mark_as_correct_data_layout, mark_output_as_in_correct_layout, mark_input_as_in_correct_layout from extensions.middle.LayoutChangeForConstantShapePaths import LayoutChangeForConstantShapePaths from extensions.middle.pass_separator import PostMiddleStart -from mo.front.common.partial_infer.utils import int64_array from mo.graph.graph import Graph, Node from mo.graph.perm_inputs import PermuteInputs from mo.graph.port import Port @@ -51,7 +49,8 @@ def get_output_nodes(node: Node): result.append(dest_port.node) return result - def bfs(self, start_nodes: list, visited: set, condition: callable = None, forward: bool = True): + @staticmethod + def bfs(start_nodes: list, visited: set, condition: callable = None, forward: bool = True): """ The function performs BFS starting from selected nodes in forward or backward direction adding nodes by an optional condition @@ -63,7 +62,7 @@ def bfs(self, start_nodes: list, visited: set, condition: callable = None, forwa :return: the list of Nodes visited """ assert visited is not None, 'The "visited" set must be defined' - assert start_nodes is not None and len(start_nodes) != 0, 'The list of start nodes must be specified' + assert start_nodes is not None, 'The list of start nodes must be specified' result = list() d = deque(start_nodes) @@ -72,9 +71,9 @@ def bfs(self, start_nodes: list, visited: set, condition: callable = None, forwa result.append(cur_node) visited.add(cur_node) if forward: - next_nodes = self.get_output_nodes(cur_node) + next_nodes = MarkSubGraphsWithCorrectLayout.get_output_nodes(cur_node) else: - next_nodes = self.get_input_nodes(cur_node) + next_nodes = MarkSubGraphsWithCorrectLayout.get_input_nodes(cur_node) for next_node in next_nodes: if next_node not in visited and (condition is None or condition(next_node)): d.append(next_node) @@ -166,7 +165,7 @@ def insert_permute_inputs_before_dynamic_weights_subgraph(dynamic_subgraphs: Set @staticmethod def walk_up_from_in_ports_to_out_ports(in_ports: Set[Port], out_ports: Set[Port], port_condition=None): - """" + r"""" Returns all intermediate ports and nodes of such a sub-graph: out_ports diff --git a/model-optimizer/extensions/middle/ONNXResize11ToInterpolate.py b/model-optimizer/extensions/middle/ONNXResize11ToInterpolate.py index 956e6d017cd3ed..ffd3c27eebdba6 100644 --- a/model-optimizer/extensions/middle/ONNXResize11ToInterpolate.py +++ b/model-optimizer/extensions/middle/ONNXResize11ToInterpolate.py @@ -2,18 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 import logging as log + import numpy as np -from 
extensions.ops.activation_ops import Floor from extensions.ops.Cast import Cast +from extensions.ops.activation_ops import Floor from extensions.ops.elementwise import Add, Div, Mul from extensions.ops.interpolate import Interpolate from mo.front.common.layout import get_depth_dim, get_height_dim, get_width_dim from mo.front.common.partial_infer.utils import int64_array, float_array from mo.front.tf.graph_utils import create_op_with_const_inputs -from mo.middle.passes.convert_data_type import data_type_str_to_np -from mo.middle.replacement import MiddleReplacementPattern from mo.graph.graph import Graph, Node, rename_nodes +from mo.middle.replacement import MiddleReplacementPattern from mo.ops.const import Const from mo.ops.shape import Shape from mo.ops.strided_slice import StridedSlice @@ -94,10 +94,10 @@ def replace_resize(graph: Graph, resize: Node): {1: float_array([1.0e-5])}, {'name': resize_name + '/Add'}) - input_data_type = data_type_str_to_np(graph.graph['cmd_params'].data_type) + dst_dtype = np.float32 # even if data_type=FP16 use float32 for shape values if num_of_inputs == 3: - cast_shape_to_float = Cast(graph, {'dst_type': input_data_type}).create_node() + cast_shape_to_float = Cast(graph, {'dst_type': dst_dtype}).create_node() mul_node = Mul(graph, {'name': resize_name + '/Mul'}).create_node() shape_of.out_port(0).connect(cast_shape_to_float.in_port(0)) cast_shape_to_float.out_port(0).connect(mul_node.in_port(0)) @@ -119,8 +119,8 @@ def replace_resize(graph: Graph, resize: Node): connection_of_resize_input.get_source().connect(shape_of.in_port(0)) connection_of_scales.get_source().connect(mul_node.in_port(1)) else: - cast_shape_to_float = Cast(graph, {'dst_type': input_data_type}).create_node() - cast_sizes_to_float = Cast(graph, {'dst_type': input_data_type}).create_node() + cast_shape_to_float = Cast(graph, {'dst_type': dst_dtype}).create_node() + cast_sizes_to_float = Cast(graph, {'dst_type': dst_dtype}).create_node() div_node = Div(graph, {'name': resize_name + '/Div'}).create_node() cast_sizes_to_float.out_port(0).connect(div_node.in_port(0)) cast_shape_to_float.out_port(0).connect(div_node.in_port(1)) diff --git a/model-optimizer/extensions/middle/RemoveUselessConcatSplit.py b/model-optimizer/extensions/middle/RemoveUselessConcatSplit.py index 1f16adc63af961..4bce3abfd85e1f 100644 --- a/model-optimizer/extensions/middle/RemoveUselessConcatSplit.py +++ b/model-optimizer/extensions/middle/RemoveUselessConcatSplit.py @@ -6,7 +6,7 @@ class RemoveUselessConcatSplitPattern(MiddleReplacementPattern): - """ + r""" Remove useless construction with concat and split like follows: / / | \ \ br1 br2 .. 
br(n-1)br(n) diff --git a/model-optimizer/extensions/middle/RemoveUselessCrops.py b/model-optimizer/extensions/middle/RemoveUselessCrops.py index 47ae5c6e6104b1..2d37ab1487efe3 100644 --- a/model-optimizer/extensions/middle/RemoveUselessCrops.py +++ b/model-optimizer/extensions/middle/RemoveUselessCrops.py @@ -6,7 +6,7 @@ class RemoveUselessCropsPattern(MiddleReplacementPattern): - """ + r""" Remove useless construction with crops and concat like follows: in_node / / | \ \ diff --git a/model-optimizer/extensions/middle/ReplaceMemoryOffsetWithSplice.py b/model-optimizer/extensions/middle/ReplaceMemoryOffsetWithSplice.py index 2a9a1f52a6c1fd..6a70e5337ed5e5 100644 --- a/model-optimizer/extensions/middle/ReplaceMemoryOffsetWithSplice.py +++ b/model-optimizer/extensions/middle/ReplaceMemoryOffsetWithSplice.py @@ -2,15 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import numpy as np -import logging as log -from extensions.front.kaldi.replace_lstm_node_pattern import create_zero_value_with_batch_from_input from extensions.ops.splice import Splice from mo.front.common.partial_infer.utils import int64_array from mo.graph.graph import Graph, Node from mo.middle.replacement import MiddleReplacementPattern from mo.ops.assign import Assign from mo.ops.concat import Concat +from mo.ops.const import Const from mo.ops.crop import Crop from mo.ops.read_value import ReadValue from mo.ops.result import Result @@ -134,7 +133,9 @@ def replace_pattern(graph: Graph, match: dict): in_shape = input_port.data.get_shape() node_t = abs(node.t) - init_value_memory_out = create_zero_value_with_batch_from_input(input_port, in_shape[1]*node_t) + init_value_memory_out = Const(graph, {'name': 'init_value_' + pair_name, + 'value': np.zeros(int64_array([in_shape[0], in_shape[1]*node_t])), + 'shape': int64_array([in_shape[0], in_shape[1]*node_t])}).create_node() memory_out = ReadValue(graph, {'name': pair_name, 'variable_id': node_name+pair_name}).create_node() init_value_memory_out.out_port(0).connect(memory_out.in_port(0)) @@ -163,14 +164,6 @@ def replace_pattern(graph: Graph, match: dict): memory_in.out_port(0).connect(out.in_port(0)) out_port.get_connection().set_source(memory_out.out_port(0)) - if not graph.graph['cmd_params'].static_shape: - log.error( - "Model can not be translated in a reshape-able way.\n" - "Model Optimizer key static_shape was turned on to prevent related errors.\n" - "There will be no success changing input shapes of the model with the help of " - "InferenceEngine reshape method", extra={'is_warning': True}) - graph.graph['cmd_params'].static_shape = True - graph.remove_node(op_output_id) graph.remove_node(node.id) graph.remove_node(pair_node.id) diff --git a/model-optimizer/extensions/middle/ReplaceSpliceNodePattern.py b/model-optimizer/extensions/middle/ReplaceSpliceNodePattern.py index 69103df850bbd2..92425780b5fdd5 100644 --- a/model-optimizer/extensions/middle/ReplaceSpliceNodePattern.py +++ b/model-optimizer/extensions/middle/ReplaceSpliceNodePattern.py @@ -1,7 +1,9 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from extensions.front.kaldi.replace_lstm_node_pattern import unique_id, create_zero_value_with_batch_from_input +import numpy as np + +from extensions.front.kaldi.replace_lstm_node_pattern import unique_id from extensions.ops.split import VariadicSplit from mo.front.common.partial_infer.utils import int64_array from mo.front.tf.graph_utils import create_op_with_const_inputs @@ -9,13 +11,14 @@ from mo.middle.replacement import 
MiddleReplacementPattern from mo.ops.assign import Assign from mo.ops.concat import Concat +from mo.ops.const import Const from mo.ops.crop import Crop from mo.ops.read_value import ReadValue from mo.ops.result import Result class ReplaceSpliceNodePattern(MiddleReplacementPattern): - """ + r""" This pass decomposes Splice layer to the sequence Slice Concat and Memory layers For example: Let's suppose we have next graph: @@ -93,8 +96,11 @@ def replace_pattern(graph: Graph, match: dict): # create separate splice construction for const_dim memory_pair_id = unique_id('memory_for_const_dim') - init_value_input_memory_const_dim = create_zero_value_with_batch_from_input(split.out_port(1), - memory_size_constdim) + init_value_input_memory_const_dim = Const(graph, {'name': 'init_value_const_dim_in_memory', + 'value': np.zeros(int64_array([in_shape[0], + memory_size_constdim])), + 'shape': int64_array([in_shape[0], + memory_size_constdim])}).create_node() input_memory_const_dim = ReadValue(graph, {'name': 'const_dim_in_memory', 'variable_id': memory_pair_id}).create_node() init_value_input_memory_const_dim.out_port(0).connect(input_memory_const_dim.in_port(0)) @@ -129,14 +135,16 @@ def replace_pattern(graph: Graph, match: dict): concat_const.in_port(1).connect(crop_first.out_port(0)) concat_const.in_port(0).connect(concat_node.out_port(0)) - init_value_input_memory = create_zero_value_with_batch_from_input(split.out_port(0), - memory_size) + init_value_input_memory = Const(graph, {'name': 'init_value_' + node.name, + 'value': np.zeros(int64_array([in_shape[0], memory_size])), + 'shape': int64_array([in_shape[0], memory_size])}).create_node() init_value_input_memory.out_port(0).connect(input_memory.in_port(0)) node.in_port(0).get_connection().set_destination(split.in_port(0)) node.out_port(0).get_connection().set_source(concat_const.out_port(0)) else: - init_value_input_memory = create_zero_value_with_batch_from_input(node.in_port(0).get_source(), - memory_size) + init_value_input_memory = Const(graph, {'name': 'init_value_' + node.name, + 'value': np.zeros(int64_array([in_shape[0], memory_size])), + 'shape': int64_array([in_shape[0], memory_size])}).create_node() init_value_input_memory.out_port(0).connect(input_memory.in_port(0)) node.in_port(0).get_connection().set_destination(concat_node.in_port(1)) node.out_port(0).get_connection().set_source(concat_node.out_port(0)) diff --git a/model-optimizer/extensions/middle/ReplaceSpliceNodePattern_test.py b/model-optimizer/extensions/middle/ReplaceSpliceNodePattern_test.py index 62492fc35c715c..ec5985196a4d27 100644 --- a/model-optimizer/extensions/middle/ReplaceSpliceNodePattern_test.py +++ b/model-optimizer/extensions/middle/ReplaceSpliceNodePattern_test.py @@ -32,20 +32,8 @@ def test_splice(self): ref_graph = build_graph({'in_placeholder': {'kind': 'op', 'op': None}, 'in_node': {'kind': 'data', 'shape': [1, 13]}, - 'shape': {'kind': 'op', 'op': 'ShapeOf'}, - 'shape_data': {'kind': 'data'}, - 'crop_batch': {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}, - 'crop_batch_data': {'kind': 'data'}, - 'crop_batch_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([1])}, - 'crop_batch_dim_data': {'kind': 'data'}, - 'second_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([143])}, - 'second_dim_data': {'kind': 'data'}, - 'gather_shape': {'kind': 'op', 'op': 'Concat'}, - 'gather_shape_data': {'kind': 'data'}, 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, 'fill_value_data': {'kind': 'data'}, - 'broadcast': {'kind': 
'op', 'op': 'Broadcast'}, - 'broadcast_data': {'kind': 'data'}, 'memory_in': {'kind': 'op', 'op': 'ReadValue'}, 'memory_in_data': {'kind': 'data'}, @@ -61,16 +49,7 @@ def test_splice(self): [ ('in_placeholder', 'in_node'), - ('in_node', 'shape'), ('shape', 'shape_data'), - ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), - ('crop_batch_dim', 'crop_batch_dim_data'), - ('crop_batch_dim_data', 'crop_batch', {'in': 1}), - ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), - ('crop_batch_data', 'gather_shape', {'in': 0}), - ('gather_shape', 'gather_shape_data'), - ('fill_value', 'fill_value_data'), ('fill_value_data', 'broadcast', {'in': 0}), - ('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), - ('broadcast_data', 'memory_in'), + ('fill_value', 'fill_value_data'), ('fill_value_data', 'memory_in'), ('memory_in', 'memory_in_data'), ('memory_in_data', 'crop_mem'), @@ -104,20 +83,8 @@ def test_splice_with_constdim(self): 'split_data_0': {'kind': 'data'}, 'split_data_1': {'kind': 'data'}, - 'shape': {'kind': 'op', 'op': 'ShapeOf'}, - 'shape_data': {'kind': 'data'}, - 'crop_batch': {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}, - 'crop_batch_data': {'kind': 'data'}, - 'crop_batch_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([1])}, - 'crop_batch_dim_data': {'kind': 'data'}, - 'second_dim': {'kind': 'op', 'op': 'Const', 'value': int64_array([33])}, - 'second_dim_data': {'kind': 'data'}, - 'gather_shape': {'kind': 'op', 'op': 'Concat'}, - 'gather_shape_data': {'kind': 'data'}, 'fill_value': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, 'fill_value_data': {'kind': 'data'}, - 'broadcast': {'kind': 'op', 'op': 'Broadcast'}, - 'broadcast_data': {'kind': 'data'}, 'memory_in': {'kind': 'op', 'op': 'ReadValue'}, 'memory_in_data': {'kind': 'data'}, @@ -129,21 +96,10 @@ def test_splice_with_constdim(self): 'memory_out_data': {'kind': 'data'}, 'result': {'kind': 'op', 'op': 'Result'}, - 'shape_2': {'kind': 'op', 'op': 'ShapeOf'}, - 'shape_2_data': {'kind': 'data'}, - 'crop_batch_2': {'kind': 'op', 'op': 'Crop', 'offset': int64_array([0])}, - 'crop_batch_2_data': {'kind': 'data'}, - 'crop_batch_dim_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([1])}, - 'crop_batch_dim_2_data': {'kind': 'data'}, - 'second_dim_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([33])}, - 'second_dim_2_data': {'kind': 'data'}, - 'gather_shape_2': {'kind': 'op', 'op': 'Concat'}, - 'gather_shape_2_data': {'kind': 'data'}, + 'fill_value_2': {'kind': 'op', 'op': 'Const', 'value': int64_array([0])}, 'fill_value_2_data': {'kind': 'data'}, - 'broadcast_2': {'kind': 'op', 'op': 'Broadcast'}, - 'broadcast_2_data': {'kind': 'data'}, - +\ 'memory_in_constdims': {'kind': 'op', 'op': 'ReadValue'}, 'memory_in_constdims_data': {'kind': 'data'}, 'crop_mem_constdims': {'kind': 'op', 'op': 'Crop', 'offset': 10, 'dim': 100}, @@ -171,16 +127,7 @@ def test_splice_with_constdim(self): ('split', 'split_data_0', {'out': 0}), ('split', 'split_data_1', {'out': 1}), - ('split_data_0', 'shape'), ('shape', 'shape_data'), - ('shape_data', 'crop_batch'), ('crop_batch', 'crop_batch_data'), - ('crop_batch_dim', 'crop_batch_dim_data'), - ('crop_batch_dim_data', 'crop_batch', {'in': 1}), - ('second_dim', 'second_dim_data'), ('second_dim_data', 'gather_shape', {'in': 1}), - ('crop_batch_data', 'gather_shape', {'in': 0}), - ('gather_shape', 'gather_shape_data'), - ('fill_value', 'fill_value_data'), ('fill_value_data', 'broadcast', {'in': 0}), - 
('gather_shape_data', 'broadcast', {'in': 1}), ('broadcast', 'broadcast_data'), - ('broadcast_data', 'memory_in'), + ('fill_value', 'fill_value_data'), ('fill_value_data', 'memory_in'), ('memory_in', 'memory_in_data'), ('memory_in_data', 'crop_mem'), @@ -192,16 +139,7 @@ def test_splice_with_constdim(self): ('memory_out', 'memory_out_data'), ('memory_out_data', 'result'), - ('split_data_1', 'shape_2'), ('shape_2', 'shape_2_data'), - ('shape_2_data', 'crop_batch_2'), ('crop_batch_2', 'crop_batch_2_data'), - ('crop_batch_dim_2', 'crop_batch_dim_2_data'), - ('crop_batch_dim_2_data', 'crop_batch_2', {'in': 1}), - ('second_dim_2', 'second_dim_2_data'), ('second_dim_2_data', 'gather_shape_2', {'in': 1}), - ('crop_batch_2_data', 'gather_shape_2', {'in': 0}), - ('gather_shape_2', 'gather_shape_2_data'), - ('fill_value_2', 'fill_value_2_data'), ('fill_value_2_data', 'broadcast_2', {'in': 0}), - ('gather_shape_2_data', 'broadcast_2', {'in': 1}), ('broadcast_2', 'broadcast_2_data'), - ('broadcast_2_data', 'memory_in_constdims'), + ('fill_value_2', 'fill_value_2_data'), ('fill_value_2_data', 'memory_in_constdims'), ('memory_in_constdims', 'memory_in_constdims_data'), ('memory_in_constdims_data', 'crop_mem_constdims'), diff --git a/model-optimizer/extensions/middle/StridedSliceNormalizer.py b/model-optimizer/extensions/middle/StridedSliceNormalizer.py index 580cf533c61221..00ab785847e049 100644 --- a/model-optimizer/extensions/middle/StridedSliceNormalizer.py +++ b/model-optimizer/extensions/middle/StridedSliceNormalizer.py @@ -17,7 +17,7 @@ class StridedSliceNormalizer(MiddleReplacementPattern): - """ + r""" StridedSlice is not normal if it cannot be permuted by ApplyPermutations. This normalizer inserts blank colons ':' in slice expression so that it can be correctly permuted from NHWC to NCHW layout. It changes masks and inserts blank begin, end and strides values. diff --git a/model-optimizer/extensions/middle/TensorIteratorInput.py b/model-optimizer/extensions/middle/TensorIteratorInput.py index 48c27c0074e9ce..0b5124e74b7dd4 100644 --- a/model-optimizer/extensions/middle/TensorIteratorInput.py +++ b/model-optimizer/extensions/middle/TensorIteratorInput.py @@ -245,7 +245,7 @@ def replace_pattern(graph: Graph, match: dict): class SmartMatcherInputSlicingWithGather(MiddleReplacementPattern): - """ + r""" The transformation matches a sub-graph where input tensor is consequently sliced along some axis for each time step (or index) inside TensorFlow 1.x while_loop operation. In the original graph StridedSlice with non-constant begin and end attributes performs this slicing. diff --git a/model-optimizer/extensions/middle/split_tdnn_memoryoffset.py b/model-optimizer/extensions/middle/split_tdnn_memoryoffset.py index 63ad5ba9369241..e9762ab515beef 100644 --- a/model-optimizer/extensions/middle/split_tdnn_memoryoffset.py +++ b/model-optimizer/extensions/middle/split_tdnn_memoryoffset.py @@ -32,4 +32,4 @@ def find_and_replace_pattern(self, graph: Graph): paired_node['element_size'] = offset_node['element_size'] # Copy shape from previous node. 
Typically (but not always) for TDNN blocks this is the case else: - paired_node['element_size'] = offset_node.in_port(0).data.get_shape()[1] + paired_node['element_size'] = offset_node.in_port(0).data.get_shape() diff --git a/model-optimizer/extensions/ops/Cast.py b/model-optimizer/extensions/ops/Cast.py index 066ef1cf0d697e..6c80106b3c83c4 100644 --- a/model-optimizer/extensions/ops/Cast.py +++ b/model-optimizer/extensions/ops/Cast.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import logging as log + import numpy as np -from mo.front.common.partial_infer.elemental import copy_shape_infer from mo.graph.graph import Node, Graph from mo.middle.passes.convert_data_type import np_data_type_to_precision, convert_blob, \ np_data_type_to_destination_type, packed_I4, packed_U4 @@ -84,7 +84,7 @@ def custom_type_casting_and_packing(node: Node, value, dst_type): bit_order_little = (padded[:, None] & (1 << np.arange(num_bits)) > 0).astype(np.uint8) bit_order_big = np.flip(bit_order_little, axis=1) bit_order_big_flattened = bit_order_big.flatten() - packed = np.packbits(bit_order_big_flattened, bitorder='big') + packed = np.packbits(bit_order_big_flattened) node.out_node(0)['force_shape'] = data_shape.copy() node.out_node(0)['force_type'] = np_data_type_to_precision(dst_type) diff --git a/model-optimizer/extensions/ops/scatternd.py b/model-optimizer/extensions/ops/scatternd.py index cffb226b268c6b..8917d11cfb8f1f 100644 --- a/model-optimizer/extensions/ops/scatternd.py +++ b/model-optimizer/extensions/ops/scatternd.py @@ -44,14 +44,16 @@ def infer(node: Node): # 1. ranks of both input and indices must be at least 1 assert len(input_shape) >= 1 and len(indices_shape) >= 1, \ 'The node "{}" input and indices ranks must be at least 1'.format(node_name) - + # 2. the last dimension of indices shape must be at most a rank of input assert indices_shape[-1] <= len(input_shape), \ 'The last dimension of indices shape must be at most a rank of input for the node "{}"'.format(node_name) # 3. updates is a tensor of shape indices_shape[:-1] + input_shape[indices_shape[-1]:] + # if expected updates shape is scalar, updates can be tensor with the single element (for example, of shape [1], [[1]], etc.) 
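(As a concrete check of this relaxation, mirroring the inputs8 case added to the tests below: with input_shape = [3] and indices_shape = [1], indices_shape[:-1] is empty and input_shape[indices_shape[-1]:] is empty as well, so the concatenation below yields [] -- a scalar expected updates shape -- and updates supplied as a one-element tensor of shape [1] are now accepted because np.array_equal(updates_shape, np.ones(len(updates_shape))) holds.)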
expected_updates_shape = np.concatenate((indices_shape[:-1], input_shape[indices_shape[-1]:]), axis=0) - assert np.array_equal(updates_shape, expected_updates_shape), \ + assert np.array_equal(updates_shape, expected_updates_shape) or\ + np.array_equal(expected_updates_shape, []) and np.array_equal(updates_shape, np.ones(len(updates_shape))), \ 'The updates shape must be equal to indices_shape[:-1] + input_shape[indices_shape[-1]:] for the node "{}"'.format(node_name) node.out_port(0).data.set_shape(input_shape) diff --git a/model-optimizer/extensions/ops/scatternd_test.py b/model-optimizer/extensions/ops/scatternd_test.py index 41e9ad9410173c..a53b020202ccea 100644 --- a/model-optimizer/extensions/ops/scatternd_test.py +++ b/model-optimizer/extensions/ops/scatternd_test.py @@ -62,6 +62,11 @@ 'updates': {'shape': int64_array([]), 'value': 9}} output7 = int64_array([1, 2, 3, 4, 9, 6, 7, 8]) +inputs8 = {'input': {'shape': int64_array([3]), 'value': int64_array([1, 2, 3])}, + 'indices': {'shape': int64_array([1]), 'value': int64_array([2])}, + 'updates': {'shape': int64_array([1]), 'value': int64_array([9])}} +output8 = int64_array([1, 2, 9]) + class TestScatterNDUpdate(unittest.TestCase): def test_partial_infer1(self): graph = build_graph(nodes_attributes, edges, inputs1) @@ -139,7 +144,7 @@ def test_infer6(self): res_output_value = graph.node['output']['value'] self.assertTrue(np.array_equal(output6, res_output_value), - 'values do not match expected: {} and given: {}'.format(output5, res_output_value)) + 'values do not match expected: {} and given: {}'.format(output6, res_output_value)) def test_infer7_scalar(self): graph = build_graph(nodes_attributes, edges, inputs7) @@ -150,4 +155,15 @@ def test_infer7_scalar(self): res_output_value = graph.node['output']['value'] self.assertTrue(np.array_equal(output7, res_output_value), - 'values do not match expected: {} and given: {}'.format(output5, res_output_value)) + 'values do not match expected: {} and given: {}'.format(output7, res_output_value)) + + def test_infer8(self): + graph = build_graph(nodes_attributes, edges, inputs8) + scatternd_node = Node(graph, 'scatternd_node') + ScatterNDUpdate.infer(scatternd_node) + + # get the result + res_output_value = graph.node['output']['value'] + + self.assertTrue(np.array_equal(output8, res_output_value), + 'values do not match expected: {} and given: {}'.format(output8, res_output_value)) diff --git a/model-optimizer/install_prerequisites/install_prerequisites.sh b/model-optimizer/install_prerequisites/install_prerequisites.sh index 5ae1d9f8986b22..bd2deb23ae50b3 100755 --- a/model-optimizer/install_prerequisites/install_prerequisites.sh +++ b/model-optimizer/install_prerequisites/install_prerequisites.sh @@ -35,7 +35,7 @@ for ((i=1;i <= $#;i++)) { esac } -SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" if [[ -f /etc/centos-release ]]; then DISTRO="centos" diff --git a/model-optimizer/mo/__main__.py b/model-optimizer/mo/__main__.py new file mode 100644 index 00000000000000..1e84a6a65a94d5 --- /dev/null +++ b/model-optimizer/mo/__main__.py @@ -0,0 +1,16 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from mo.utils.versions_checker import check_python_version # pylint: disable=no-name-in-module + +ret_code = check_python_version() +if ret_code: + sys.exit(ret_code) + +from mo.main import main +from mo.utils.cli_parser import get_all_cli_parser # pylint: 
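Side note on the relaxed updates-shape rule above (illustration only, not part of the patch): the constraint and the new single-element exception can be sanity-checked with plain NumPy; the shapes below are made up and mirror the new inputs8 test case.

import numpy as np

input_shape   = np.array([8], dtype=np.int64)      # data being updated
indices_shape = np.array([4, 1], dtype=np.int64)   # four full-index updates
updates_shape = np.array([4], dtype=np.int64)

# rule: updates shape == indices_shape[:-1] + input_shape[indices_shape[-1]:]
expected = np.concatenate((indices_shape[:-1], input_shape[indices_shape[-1]:]))
assert np.array_equal(updates_shape, expected)     # [4] == [4]

# relaxed case: expected shape is empty (scalar), but a single-element updates
# tensor such as shape [1] is now accepted as well
indices_shape = np.array([1], dtype=np.int64)
updates_shape = np.array([1], dtype=np.int64)
expected = np.concatenate((indices_shape[:-1], input_shape[indices_shape[-1]:]))
assert expected.size == 0
assert np.array_equal(updates_shape, np.ones(len(updates_shape)))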
disable=no-name-in-module + +sys.exit(main(get_all_cli_parser(), None)) + diff --git a/model-optimizer/mo/back/ie_ir_ver_2/emitter.py b/model-optimizer/mo/back/ie_ir_ver_2/emitter.py index 0a80a1f102d410..85dce75201bc48 100644 --- a/model-optimizer/mo/back/ie_ir_ver_2/emitter.py +++ b/model-optimizer/mo/back/ie_ir_ver_2/emitter.py @@ -247,7 +247,7 @@ def serialize_node_attributes( unsupported): # the Result op may be marked so it should not appear in the IR. For example, refer to transformation # model-optimizer/extensions/back/TopKNormalizer.py - if isinstance(node, Node) and node.soft_get('result' == 'Result') and node.has_and_set('remove_from_xml'): + if isinstance(node, Node) and node.soft_get('type') == 'Result' and node.has_and_set('keep_output_port'): return try: for s in schema: diff --git a/model-optimizer/mo/front/caffe/loader.py b/model-optimizer/mo/front/caffe/loader.py index 8419acc8d9d718..c2f670505cb91d 100644 --- a/model-optimizer/mo/front/caffe/loader.py +++ b/model-optimizer/mo/front/caffe/loader.py @@ -169,7 +169,7 @@ def caffe_pb_to_nx(graph, proto, model): # Blobs in prototxt model can be reused by inplace layer. # This requires loading of pb layers in order and tracking the latest # layer that writes a particular blob. - blob_producers = {} # maps layer blob name to the layer name and port + blob_producers = {} # maps layer blob name to node id in graph, port and layer name proto_layers = get_layers(proto) model_layers = None if model: @@ -239,7 +239,7 @@ def caffe_pb_to_nx(graph, proto, model): # Input is defined at the top level of proto instead of distinct Input layer graph.add_node(input_name, pb=None, model_pb=None, type='GlobalInput', name=input_name, shape=input_dim, kind='op') - blob_producers[input_name] = (input_name, 0) + blob_producers[input_name] = (input_name, 0, input_name) used_blobs = set() for i, layer in enumerate(proto_layers): @@ -280,19 +280,19 @@ def caffe_pb_to_nx(graph, proto, model): input_dims.append(np.array(list(dims), dtype=np.int64)) input_names.append(layer.name) - layer.name = graph.unique_id(layer.name) - graph.add_node(layer.name, pb=layer, model_pb=model_layer, kind='op', type='Parameter') + node_id = graph.unique_id(layer.name) + graph.add_node(node_id, pb=layer, model_pb=model_layer, kind='op', type='Parameter') # connect inputs based on blob_producers dictionary for dst_port, bottom in enumerate(layer.bottom): - add_edge_caffe(graph, bottom, layer.name, blob_producers, dst_port) + add_edge_caffe(graph, bottom, node_id, blob_producers, dst_port) used_blobs.add(bottom) # update blob producers dictionary by output ports for src_port, top in enumerate(layer.top): if top in blob_producers: - log.debug("Detected reuse of blob {} by layer {}".format(top, layer.name)) - blob_producers[top] = (layer.name, src_port) + log.debug("Detected reuse of blob {} by layer {}".format(top, node_id)) + blob_producers[top] = (node_id, src_port, layer.name) # Tensor names information corresponding to a node is stored on outgoing edges. # As output nodes do not have outgoing edges, fake outputs are required. 
In the following code @@ -320,7 +320,7 @@ def add_edge_caffe(graph: Graph, bottom: str, dst_layer: str, blob_producers: di 'in': dst_port, 'name': bottom, # debug anchor for a framework name, out port and tensor name - 'fw_tensor_debug_info': [(src_layer, src_port, bottom)], + 'fw_tensor_debug_info': [(blob_producers[bottom][2], src_port, bottom)], 'in_attrs': ['in', 'name'], 'out_attrs': ['out', 'name'], 'data_attrs': ['fw_tensor_debug_info'] diff --git a/model-optimizer/mo/front/common/extractors/__init__.py b/model-optimizer/mo/front/common/extractors/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/model-optimizer/mo/front/extractor.py b/model-optimizer/mo/front/extractor.py index c5f7cd4742461a..28d5b0973e438c 100644 --- a/model-optimizer/mo/front/extractor.py +++ b/model-optimizer/mo/front/extractor.py @@ -753,7 +753,7 @@ def add_outputs_identity(graph: Graph, outputs: list, add_edge: callable, params for output in outputs: fake_node_name = graph.unique_id(output) graph.add_node(fake_node_name, name=fake_node_name, identity=True, kind='op', op='Identity', - infer=None, needs_removal=True) + infer=None, needs_removal=True, symbol_dict={'op': 'Identity'}) add_edge(graph, output, fake_node_name, **params) diff --git a/model-optimizer/mo/front/kaldi/loader/loader.py b/model-optimizer/mo/front/kaldi/loader/loader.py index ff07e9e24af71c..3d60677e412e02 100644 --- a/model-optimizer/mo/front/kaldi/loader/loader.py +++ b/model-optimizer/mo/front/kaldi/loader/loader.py @@ -9,7 +9,7 @@ from extensions.ops.elementwise import Mul from extensions.ops.split import AttributedVariadicSplit -from mo.front.common.partial_infer.utils import float_array +from mo.front.common.partial_infer.utils import float_array, int64_array from mo.front.extractor import add_outputs_identity from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder, find_next_component, get_name_from_path, \ find_end_of_component, end_of_nnet_tag, read_binary_integer32_token, get_parameters, read_token_value, \ @@ -214,7 +214,9 @@ def load_kaldi_nnet3_model(graph, file_descr, nnet_name): for o_n_name, params in node.get_outputs(): o_n = Node(graph, o_n_name) if o_n['op'] == 'MemoryOffset': - o_n['parameters']['element_size'] = node['shape'][1] + # don't take batch from Parameter, it will be overwritten + # take only second dimension because we have only 2 dimensions + o_n['parameters']['element_size'] = int64_array([1, node.shape[1]]) load_components(file_descr, graph, component_layer_map) @@ -268,7 +270,7 @@ def load_components(file_descr, graph, component_layer_map=None): for o_n_name, params in node.get_outputs(): o_n = Node(graph, o_n_name) if o_n['op'] == 'MemoryOffset' and dim != 0: - o_n['parameters']['element_size'] = dim + o_n['parameters']['element_size'] = int64_array([1, dim]) else: raise Error("Something wrong with layer {}".format(name)) else: @@ -401,7 +403,7 @@ def read_node(file_descr, graph, component_layer_map, layer_node_map): for o_n_name, params in node.get_outputs(): o_n = Node(graph, o_n_name) if o_n['op'] == 'MemoryOffset': - o_n['parameters']['element_size'] = dim + o_n['parameters']['element_size'] = int64_array([1, dim]) else: raise Error("Unsupported node specifier {}".format(tokens[0])) return True diff --git a/model-optimizer/mo/front/mxnet/extractors/utils.py b/model-optimizer/mo/front/mxnet/extractors/utils.py index b33dbdd68679fd..0727ae82c26ab9 100644 --- a/model-optimizer/mo/front/mxnet/extractors/utils.py +++ 
b/model-optimizer/mo/front/mxnet/extractors/utils.py @@ -101,21 +101,26 @@ def has(self, key): def get_mxnet_node_edges(node: dict, node_id: [int, str], nodes_list: list, index_node_key: dict): edge_list = [] + used_indices = set() for in_port, src_node_id in enumerate(node['inputs']): - src_node = src_node_id[0] - dest_port = src_node_id[1] - edge_attrs = { - 'in': in_port, - 'out': dest_port, - # debug anchor for framework name, out port and tensor name - 'fw_tensor_debug_info': [(index_node_key[src_node], src_node_id[1], nodes_list[src_node]['name'])], - 'in_attrs': ['in'], - 'out_attrs': ['out'], - 'data_attrs': ['fw_tensor_debug_info'] - } - edge = (index_node_key[src_node], index_node_key[node_id], edge_attrs) + edge = create_mxnet_edge(index_node_key[src_node_id[0]], index_node_key[node_id], in_port, src_node_id[1], + nodes_list[src_node_id[0]]['name']) edge_list.append(edge) - return edge_list + used_indices.add(src_node_id[0]) + return edge_list, used_indices + + +def create_mxnet_edge(src_node_id: str, dst_node_id: str, src_port: int, dst_port: int, framework_name: str): + edge_attrs = { + 'in': src_port, + 'out': dst_port, + # debug anchor for framework name, out port and tensor name + 'fw_tensor_debug_info': [(framework_name, dst_port, framework_name)], + 'in_attrs': ['in'], + 'out_attrs': ['out'], + 'data_attrs': ['fw_tensor_debug_info'] + } + return src_node_id, dst_node_id, edge_attrs def get_mxnet_layer_attrs(json_dic: dict): diff --git a/model-optimizer/mo/front/mxnet/loader.py b/model-optimizer/mo/front/mxnet/loader.py index 76e01936d31020..14da8890df227e 100644 --- a/model-optimizer/mo/front/mxnet/loader.py +++ b/model-optimizer/mo/front/mxnet/loader.py @@ -8,8 +8,9 @@ import mxnet as mx import numpy as np +from mo.front.extractor import add_outputs_identity from mo.front.mxnet.extractor import common_mxnet_fields -from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states +from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states, create_mxnet_edge from mo.front.mxnet.nd_to_params import build_params_file from mo.graph.graph import Node, Graph from mo.utils.error import Error @@ -50,7 +51,8 @@ def parse_input_model(input_model): return model_name, iteration_number -def load_symbol_def(input_model_name, input_symbol, input_names: str = '', nd_prefix_name: str = '', pretrained_model_name: str = '', legacy_mxnet_model: bool = False): +def load_symbol_def(input_model_name, input_symbol, input_names: str = '', nd_prefix_name: str = '', + pretrained_model_name: str = '', legacy_mxnet_model: bool = False): if not nd_prefix_name and not pretrained_model_name: # model name always has extension 'param' try: @@ -95,6 +97,7 @@ def symbol2nx(graph, model_nodes, model_params, input_names: str = ''): # as mxnet contain input layers as index of layer, for correct set up edges, we need provide index of layer with name of graph node index_node_keys = {} + fw_name_map = {} for i, node in enumerate(model_nodes): if node['name'] in model_params._arg_params and node['name'] not in input_names: node['value'] = np.array(model_params._arg_params[node['name']].asnumpy(), dtype=np.float32) @@ -106,12 +109,25 @@ def symbol2nx(graph, model_nodes, model_params, input_names: str = ''): graph.add_node(node_name, **symbol_attrs(node)) graph.node[node_name].update(common_mxnet_fields(Node(graph, node_name))) index_node_keys[i] = node_name + fw_name_map[node_name] = node['name'] + used_indices_set = set() for i, attrs in 
enumerate(model_nodes): node = attrs - edges = get_mxnet_node_edges(node, i, list(model_nodes), index_node_keys) + edges, used_indices = get_mxnet_node_edges(node, i, list(model_nodes), index_node_keys) if len(edges) > 0: graph.add_edges_from(edges) + used_indices_set = used_indices_set.union(used_indices) + + output_ids = [index_node_keys[node_id] for node_id in set(range(len(model_nodes))) - used_indices_set] + + # Tensor names information corresponding to a node is stored on outgoing edges. + # As output nodes do not have outgoing edges, fake outputs are required. In the following code + # for each output Identity node is added, and tensor name for the output is kept + # on (output, fake output) edge. After Result nodes adding transformation fake outputs + # are deleted from graph. + add_outputs_identity(graph, output_ids, lambda g, output_id, fake_node_id, fw_name: g.add_edges_from([ + create_mxnet_edge(output_id, fake_node_id, 0, 0, fw_name[output_id])]), {'fw_name': fw_name_map}) return graph diff --git a/model-optimizer/mo/front/onnx/loader.py b/model-optimizer/mo/front/onnx/loader.py index 3f8c2fd831fceb..93fdee5e0e3d8c 100644 --- a/model-optimizer/mo/front/onnx/loader.py +++ b/model-optimizer/mo/front/onnx/loader.py @@ -74,7 +74,8 @@ def protobuf2nx(graph: Graph, pb): # important) for node in graph_pb.node: # create an NX node - id = graph.unique_id(node_id(node)) + fw_name = node_id(node) + id = graph.unique_id(fw_name) graph.add_node(id, pb=node, kind='op') # add incoming edges based on data_nodes_map @@ -109,7 +110,7 @@ def protobuf2nx(graph: Graph, pb): 'out': src_port, 'in': 0, 'name': out, - 'fw_tensor_debug_info': [(id, src_port, out)], + 'fw_tensor_debug_info': [(fw_name, src_port, out)], 'in_attrs': ['in', 'name'], 'out_attrs': ['out', 'name'], 'data_attrs': ['fw_tensor_debug_info'] diff --git a/model-optimizer/mo/front/tf/extractors/utils.py b/model-optimizer/mo/front/tf/extractors/utils.py index 753cd0271afc77..c8fac1de697972 100644 --- a/model-optimizer/mo/front/tf/extractors/utils.py +++ b/model-optimizer/mo/front/tf/extractors/utils.py @@ -55,6 +55,8 @@ def tf_tensor_content(tf_dtype, shape, pb_tensor): raise Error("Data type is unsupported: {}. " + refer_to_faq_msg(50), tf_dtype) + decode_err_msg = 'Failed to parse a tensor with Unicode characters. Note that Inference Engine does not support ' \ + 'string literals, so the string constant should be eliminated from the graph.' if pb_tensor.tensor_content: value = np.array(np.frombuffer(pb_tensor.tensor_content, type_helper[0])) else: @@ -65,16 +67,17 @@ def tf_tensor_content(tf_dtype, shape, pb_tensor): try: value = np.array(type_helper[1](pb_tensor), dtype=type_helper[0]) except UnicodeDecodeError: - log.error( - 'Failed to parse a tensor with Unicode characters. 
Note that Inference Engine does not support ' - 'string literals, so the string constant should be eliminated from the graph.', - extra={'is_warning': True}) + log.error(decode_err_msg, extra={'is_warning': True}) value = np.array(type_helper[1](pb_tensor)) if len(shape) == 0 or shape.prod() == 0: if len(value) == 1: # return scalar if shape is [] otherwise broadcast according to shape - return np.array(value[0], dtype=type_helper[0]) + try: + return np.array(value[0], dtype=type_helper[0]) + except UnicodeDecodeError: + log.error(decode_err_msg, extra={'is_warning': True}) + return np.array(value[0]) else: # no shape, return value as is return value diff --git a/model-optimizer/mo/front/tf/extractors/utils_test.py b/model-optimizer/mo/front/tf/extractors/utils_test.py index 7cbd90a05717f4..8024491fd4cceb 100644 --- a/model-optimizer/mo/front/tf/extractors/utils_test.py +++ b/model-optimizer/mo/front/tf/extractors/utils_test.py @@ -199,3 +199,15 @@ def test_str_decode(self): self.assertEqual([warning_message], cm.output) self.assertEqual(ref_val, result) + def test_str_decode_list(self): + pb_tensor = PB({ + 'dtype': 7, + 'string_val': [b'\377\330\377\377\330\377'], + }) + shape = int64_array([]) + warning_message = 'ERROR:root:Failed to parse a tensor with Unicode characters. Note that Inference Engine ' \ + 'does not support string literals, so the string constant should be eliminated from the ' \ + 'graph.' + with self.assertLogs(log.getLogger(), level="ERROR") as cm: + result = tf_tensor_content(pb_tensor.dtype, shape, pb_tensor) + self.assertEqual([warning_message, warning_message], cm.output) diff --git a/model-optimizer/mo/front/tf/graph_utils.py b/model-optimizer/mo/front/tf/graph_utils.py index 3a38938ee6eafb..bc39da8fcd7128 100644 --- a/model-optimizer/mo/front/tf/graph_utils.py +++ b/model-optimizer/mo/front/tf/graph_utils.py @@ -143,7 +143,7 @@ def add_convolution_to_swap_xy_coordinates(graph: Graph, input_node: Node, coord def add_fake_background_loc(graph: Graph, input_node: Node): - """ + r""" DetectionOutput layer expects that box coordinates contains coordinates of boxes for the "background" class also, but in the TensorFlow\* Object Detection API the tensor contains information about real object classes only. The function copies a slice of the output data of the node 'input_node' and then concats it to the beginning of the diff --git a/model-optimizer/mo/graph/connection.py b/model-optimizer/mo/graph/connection.py index c5d8a82aaa7cc4..e94d3e787fecf8 100644 --- a/model-optimizer/mo/graph/connection.py +++ b/model-optimizer/mo/graph/connection.py @@ -203,8 +203,8 @@ def check_and_remove_edge(): return {}, None if self.destinations and len(self.destinations) > 1: - raise Error("set_destination applicable only for connections that has exactly one destination or \ - when there is no destinations") + raise Error("set_destination applicable only for connections that has exactly one destination or " + "when there is no destinations") if port.type == 'out': raise Error("Wrong port type in set_destination method. 
Should be 'in' but given 'out'") diff --git a/model-optimizer/mo/graph/graph.py b/model-optimizer/mo/graph/graph.py index 657f71f1e30f3f..6fca02b665369b 100644 --- a/model-optimizer/mo/graph/graph.py +++ b/model-optimizer/mo/graph/graph.py @@ -1045,12 +1045,8 @@ def add_opoutput(graph: Graph, node_name: str, port: int, cut: bool = True): if cut and len(node.out_edges()) != 0: opoutput_node = Result(graph).create_node_on_port(node, port, {'name': node_name + '/sink_port_' + str(port)}) else: - tensor_names = None - if node.has_valid('op') and port in node.out_ports(): - tensor_names = node.out_port(port).get_tensor_names() opoutput_node = Result(graph).create_node([(node, port)], {'name': node_name + '/sink_port_' + str(port)}) opoutput_node.in_edge()['data_attrs'] = ['fw_tensor_debug_info'] - opoutput_node.in_edge()['fw_tensor_debug_info'] = [(node_name, port, tensor_names)] log.debug('Sink: {} for node {}'.format(opoutput_node.id, node_name)) log.debug(str(graph.node[opoutput_node.id])) @@ -1125,8 +1121,7 @@ def set_edge_attribute_between_nodes(node1: Node, node2: Node, attr_name: str, n out_port = edge['out'] out_node = node1.out_node(out_port) if out_node.id == node2.id: - if attr_name in edge: - edge[attr_name] = new_value + edge[attr_name] = new_value # All functions below are deprecated and will be removed in next release # Please, use methods from Graph/Node classes instead diff --git a/model-optimizer/mo/ops/memoryoffset.py b/model-optimizer/mo/ops/memoryoffset.py index d543eb3eb8b03e..bcbcdd3f28d7de 100644 --- a/model-optimizer/mo/ops/memoryoffset.py +++ b/model-optimizer/mo/ops/memoryoffset.py @@ -17,17 +17,16 @@ def __init__(self, graph: Graph, attrs: dict): 'pair_name': None, 'splitted': False, 'has_default': False, - 'infer': __class__.infer, + 'infer': self.infer, 'in_ports_count': 1, 'out_ports_count': 1, }, attrs) - @staticmethod def infer(node: Node): if node.has_valid('element_size'): - # element_size should be set by Kaldi loader or by MemoryOffsetAdjustment - node.out_port(0).data.set_shape([1, node['element_size']]) + # element_size should be set by Kaldi loader or MemoryOffsetAdjustment or SplitRecurrentMemoryOffset + node.out_port(0).data.set_shape(node.element_size) else: # for TDNN blocks copy_shape_infer(node) diff --git a/model-optimizer/mo/utils/ir_reader/extenders/__init__.py b/model-optimizer/mo/utils/ir_reader/extenders/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/model-optimizer/mo/utils/ir_reader/extenders/strided_slice_extender.py b/model-optimizer/mo/utils/ir_reader/extenders/strided_slice_extender.py index 6b08f2ab7f2cf8..34fb75136a05f6 100644 --- a/model-optimizer/mo/utils/ir_reader/extenders/strided_slice_extender.py +++ b/model-optimizer/mo/utils/ir_reader/extenders/strided_slice_extender.py @@ -13,7 +13,14 @@ class StridedSlice_extender(Extender): @staticmethod def extend(op: Node): for attr in StridedSlice.get_mask_names(): - Extender.attr_to_list(op, attr) + # We can not use op.has_and_set(attr) here as a condition, because it will return False if begin/end is + # 1D tensor and begin_mask/end_mask is equal to 0 + if op.has(attr) and op[attr] != '': + Extender.attr_to_list(op, attr) + else: + assert attr not in ['begin_mask', 'end_mask'],\ + '{} is not defined for the node {}'.format(attr, op.soft_get('name', op.id)) + op[attr] = int64_array([0]) op.begin_mask = int64_array([1 - i for i in op.begin_mask]) op.end_mask = int64_array([1 - i for i in op.end_mask]) diff --git 
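Side note on the connection.py error-message fix above (illustration only): a backslash line continuation inside a string literal keeps the next line's indentation in the text, which is what the old message did; adjacent string literals concatenate without it.

# old style: the continuation drags the next line's leading spaces into the message
msg = "set_destination applicable only for connections that has exactly one destination or \
      when there is no destinations"
# msg -> "...destination or       when there is no destinations"

# new style: implicit concatenation of adjacent literals keeps a single clean space
msg = ("set_destination applicable only for connections that has exactly one destination or "
       "when there is no destinations")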
a/model-optimizer/mo/utils/ir_reader/layer_to_class.py b/model-optimizer/mo/utils/ir_reader/layer_to_class.py index 36bae0358e740e..67bc917f747ee9 100644 --- a/model-optimizer/mo/utils/ir_reader/layer_to_class.py +++ b/model-optimizer/mo/utils/ir_reader/layer_to_class.py @@ -7,6 +7,7 @@ import numpy as np from extensions.back.TopKNormalizer import TopKNormalizer +from extensions.middle.FakeSplitOutputs import AddFakeOutputsToSplit from extensions.ops.Cast import Cast from extensions.ops.ReduceOps import ReduceOp from extensions.ops.activation_ops import Activation @@ -272,6 +273,8 @@ def copy_input_blobs(op: Node, copy_op: Node): 'Assign': assign_add_output_result, 'TensorIterator': ti_add_edge_attrs, 'TopK': TopKNormalizer.normalize_outputs, + # Call normalize Split outputs for generated IR by ir-reader + 'Split': AddFakeOutputsToSplit.split_normalize_outputs } diff --git a/model-optimizer/mo/utils/ir_reader/restore_graph.py b/model-optimizer/mo/utils/ir_reader/restore_graph.py index 0f92ed61c47a23..9f1c8e0e9e9eb5 100644 --- a/model-optimizer/mo/utils/ir_reader/restore_graph.py +++ b/model-optimizer/mo/utils/ir_reader/restore_graph.py @@ -17,6 +17,7 @@ from mo.utils.ir_engine.ir_engine import IREngine from mo.utils.ir_reader.layer_to_class import copy_graph_with_ops, collect_extenders, collect_ops from mo.utils.utils import get_mo_root_dir +from extensions.back.MarkNodesWithShapeValues import MarkNodesWithShapeValues def restore_graph_from_ir(path_to_xml: str, path_to_bin: str = None) -> (Graph, dict): @@ -64,6 +65,7 @@ def save_restored_graph(graph: Graph, path: str, meta_data, name=None): BlobNormalizer, ConvolutionNormalizer, KaldiRemoveMemoryOutputBackReplacementPattern, + MarkNodesWithShapeValues, ] # We need to run some specific passes from MO back stage. 
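Side note on the layer_to_class.py change above (a sketch with illustrative names, not the actual MO code): when a graph is restored from IR, a small per-type dispatch table applies extra normalization callbacks, and this patch registers Split there so its unconnected output ports get the same fake-Result treatment TopK already receives.

def normalize_topk_outputs(node):
    """Illustrative stand-in for TopKNormalizer.normalize_outputs."""
    pass

def split_normalize_outputs(node):
    """Illustrative stand-in for AddFakeOutputsToSplit.split_normalize_outputs:
    give every declared but unconnected output port a fake Result consumer."""
    pass

postprocessing = {
    'TopK':  normalize_topk_outputs,
    'Split': split_normalize_outputs,   # registered by this change
}

def postprocess(node):
    callback = postprocessing.get(node.get('type'))
    if callback is not None:
        callback(node)

# usage sketch
postprocess({'type': 'Split', 'name': 'split_1'})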
diff --git a/model-optimizer/setup.py b/model-optimizer/setup.py index 4ba80b119ad37f..f77ffb38408c7d 100644 --- a/model-optimizer/setup.py +++ b/model-optimizer/setup.py @@ -15,6 +15,7 @@ from setuptools import setup, find_packages from setuptools.command.install import install from setuptools.command.build_py import build_py +from shutil import copyfile package_name = 'mo' @@ -44,6 +45,11 @@ def run(self): path = os.path.join(self.install_purelib, package_name, name) with open(path, 'wt') as f: f.write('\n'.join(deps)) + # Add version.txt if exists + version_txt = 'version.txt' + if os.path.exists(version_txt): + copyfile(os.path.join(version_txt), + os.path.join(self.install_purelib, package_name, version_txt)) path = os.path.join(self.install_purelib, package_name, '__init__.py') with open(path, 'wt') as f: @@ -81,6 +87,11 @@ def find_package_modules(self, package, package_dir): 'install': InstallCmd, 'build_py': BuildCmd, }, + entry_points={ + 'console_scripts': [ + 'mo = mo.__main__:main', + ], + }, classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License", diff --git a/ngraph/CMakeLists.txt b/ngraph/CMakeLists.txt index 7bfba968d03ad2..fb7b461b2058d9 100644 --- a/ngraph/CMakeLists.txt +++ b/ngraph/CMakeLists.txt @@ -78,6 +78,7 @@ option(NGRAPH_UNIT_TEST_ENABLE "Control the building of unit tests" ON) option(NGRAPH_INTERPRETER_ENABLE "Control the building of the INTERPRETER backend" ON) option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" OFF) option(NGRAPH_ONNX_IMPORT_ENABLE "Enable ONNX importer" OFF) +option(NGRAPH_ONNX_EDITOR_ENABLE "Enable ONNX Editor" OFF) option(NGRAPH_LIB_VERSIONING_ENABLE "Enable shared library versioning" OFF) option(NGRAPH_PYTHON_BUILD_ENABLE "Enable build nGraph python package wheel" OFF) option(NGRAPH_DYNAMIC_COMPONENTS_ENABLE "Enable dynamic loading of components" ON) @@ -91,6 +92,9 @@ option(NGRAPH_USE_PROTOBUF_LITE "Compiles and links with protobuf-lite" OFF) if (NGRAPH_ONNX_IMPORT_ENABLE) option(NGRAPH_USE_SYSTEM_PROTOBUF "Use system provided Protobuf shared object" OFF) endif() +if(NGRAPH_ONNX_EDITOR_ENABLE AND NOT NGRAPH_ONNX_IMPORT_ENABLE) + message(FATAL_ERROR "ONNX Editor compotent requires ONNX Importer. 
Set NGRAPH_ONNX_IMPORT_ENABLE=ON.") +endif() message(STATUS "NGRAPH_ADDRESS_SANITIZER_ENABLE: ${NGRAPH_ADDRESS_SANITIZER_ENABLE}") message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}") @@ -99,9 +103,9 @@ message(STATUS "NGRAPH_EXPORT_TARGETS_ENABLE: ${NGRAPH_EXPORT_TARGETS_EN message(STATUS "NGRAPH_INTERPRETER_ENABLE: ${NGRAPH_INTERPRETER_ENABLE}") message(STATUS "NGRAPH_LIB_VERSIONING_ENABLE: ${NGRAPH_LIB_VERSIONING_ENABLE}") message(STATUS "NGRAPH_ONNX_IMPORT_ENABLE: ${NGRAPH_ONNX_IMPORT_ENABLE}") +message(STATUS "NGRAPH_ONNX_EDITOR_ENABLE: ${NGRAPH_ONNX_EDITOR_ENABLE}") message(STATUS "NGRAPH_PYTHON_BUILD_ENABLE: ${NGRAPH_PYTHON_BUILD_ENABLE}") message(STATUS "NGRAPH_THREAD_SANITIZER_ENABLE: ${NGRAPH_THREAD_SANITIZER_ENABLE}") -message(STATUS "NGRAPH_TOOLS_ENABLE: ${NGRAPH_TOOLS_ENABLE}") message(STATUS "NGRAPH_UB_SANITIZER_ENABLE: ${NGRAPH_UB_SANITIZER_ENABLE}") message(STATUS "NGRAPH_USE_PROTOBUF_LITE: ${NGRAPH_USE_PROTOBUF_LITE}") message(STATUS "NGRAPH_UNIT_TEST_ENABLE: ${NGRAPH_UNIT_TEST_ENABLE}") diff --git a/ngraph/cmake/coverage.cmake b/ngraph/cmake/coverage.cmake index 7880ebc129bf83..7c9238df7b23e4 100644 --- a/ngraph/cmake/coverage.cmake +++ b/ngraph/cmake/coverage.cmake @@ -21,7 +21,8 @@ if (NGRAPH_ONNX_IMPORT_ENABLE) "${NGRAPH_COVERAGE_BASE_DIRECTORY}/ngraph/frontend/onnx_import*") ie_coverage_genhtml(INFO_FILE "onnx_importer" PREFIX "${NGRAPH_COVERAGE_BASE_DIRECTORY}") - +endif() +if (NGRAPH_ONNX_EDITOR_ENABLE) ie_coverage_extract(INPUT "nGraph" OUTPUT "onnx_editor" PATTERNS "${NGRAPH_COVERAGE_BASE_DIRECTORY}/ngraph/frontend/onnx_editor*") diff --git a/ngraph/core/include/ngraph/op/deformable_convolution.hpp b/ngraph/core/include/ngraph/op/deformable_convolution.hpp index cef2cba22367aa..e1a6ad9790b7c6 100644 --- a/ngraph/core/include/ngraph/op/deformable_convolution.hpp +++ b/ngraph/core/include/ngraph/op/deformable_convolution.hpp @@ -26,7 +26,7 @@ namespace ngraph /// /// \param arg Node that produces the input tensor. /// \param deformable_values Node producing the deformable values tensor. - /// \param filters Node producing the filters(kernels) tensor wit OIZYX + /// \param filters Node producing the filters(kernels) tensor with OIZYX /// layout. /// \param strides Convolution strides. 
/// \param pads_begin Amount of padding to be added to the beginning along diff --git a/ngraph/core/include/ngraph/op/deformable_psroi_pooling.hpp b/ngraph/core/include/ngraph/op/deformable_psroi_pooling.hpp index 88c37e817e1713..120ef942086684 100644 --- a/ngraph/core/include/ngraph/op/deformable_psroi_pooling.hpp +++ b/ngraph/core/include/ngraph/op/deformable_psroi_pooling.hpp @@ -20,20 +20,21 @@ namespace ngraph DeformablePSROIPooling() = default; /// \brief Constructs a DeformablePSROIPooling operation /// - /// \param input Input tensor with feature maps - /// \param coords Input tensor describing box consisting - /// of five element tuples - /// \param offsets Input blob with transformation values + /// \param input Input tensor with position sensitive score maps + /// \param coords Input tensor with list of five element tuples + /// describing ROI coordinates + /// \param offsets Input tensor with transformation values /// \param output_dim Pooled output channel number - /// \param group_size Number of groups to encode position-sensitive score maps + /// \param group_size Number of horizontal bins per row to divide ROI area, + /// it defines output width and height /// \param spatial_scale Multiplicative spatial scale factor to translate ROI /// coordinates from their input scale to the scale used when /// pooling /// \param mode Specifies mode for pooling. - /// \param spatial_bins_x Specifies numbers of bins to divide the input feature - /// maps over width - /// \param spatial_bins_y Specifies numbers of bins to divide the input feature - /// maps over height + /// \param spatial_bins_x Specifies numbers of bins to divide ROI single + /// bin over width + /// \param spatial_bins_y Specifies numbers of bins to divide ROI single + /// bin over height /// \param no_trans The flag that specifies whenever third input exists /// and contains transformation (offset) values /// \param trans_std The value that all transformation (offset) values are @@ -84,7 +85,7 @@ namespace ngraph int64_t m_output_dim; float m_spatial_scale; int64_t m_group_size = 1; - std::string m_mode = "bilinear"; + std::string m_mode = "bilinear_deformable"; int64_t m_spatial_bins_x = 1; int64_t m_spatial_bins_y = 1; float m_trans_std = 1.f; diff --git a/ngraph/core/include/ngraph/op/gather.hpp b/ngraph/core/include/ngraph/op/gather.hpp index ce612f0d4e4b29..6a1c096f04fb62 100644 --- a/ngraph/core/include/ngraph/op/gather.hpp +++ b/ngraph/core/include/ngraph/op/gather.hpp @@ -52,5 +52,48 @@ namespace ngraph const HostTensorVector& inputs) const; }; } // namespace v1 + + namespace v7 + { + /// \brief Gather slices from axis of params according to indices + class NGRAPH_API Gather : public Op + { + public: + NGRAPH_RTTI_DECLARATION; + Gather() = default; + + /// \param data The tensor from which slices are gathered + /// \param indices Tensor with indexes to gather + /// \param axis The tensor is a dimension index to gather data from + /// \param batch_dims The number of batch dimension in data and indices tensors + Gather(const Output& data, + const Output& indices, + const Output& axis, + const int64_t batch_dims = 0); + + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + + std::shared_ptr + clone_with_new_inputs(const OutputVector& new_args) const override; + + int64_t get_batch_dims() const; + int64_t get_axis() const; + bool is_axis_set() const; + + bool evaluate_gather(const HostTensorVector& outputs, + const HostTensorVector& inputs) const; + bool 
evaluate(const HostTensorVector& outputs, + const HostTensorVector& inputs) const override; + bool evaluate_lower(const HostTensorVector& outputs) const override; + bool evaluate_upper(const HostTensorVector& outputs) const override; + + bool constant_fold(OutputVector& output_values, + const OutputVector& inputs_values) override; + + private: + int64_t m_batch_dims = 0; + }; + } // namespace v7 } // namespace op } // namespace ngraph diff --git a/ngraph/core/include/ngraph/op/max.hpp b/ngraph/core/include/ngraph/op/max.hpp index 5d83e74f31cfe5..94032212f4c49a 100644 --- a/ngraph/core/include/ngraph/op/max.hpp +++ b/ngraph/core/include/ngraph/op/max.hpp @@ -16,8 +16,7 @@ namespace ngraph class NGRAPH_API ReduceMax : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceMax", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a summation operation. ReduceMax() = default; /// \brief Constructs a summation operation. diff --git a/ngraph/core/include/ngraph/op/min.hpp b/ngraph/core/include/ngraph/op/min.hpp index 78cd5edcbbb031..d78d30725d1854 100644 --- a/ngraph/core/include/ngraph/op/min.hpp +++ b/ngraph/core/include/ngraph/op/min.hpp @@ -16,8 +16,7 @@ namespace ngraph class NGRAPH_API ReduceMin : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceMin", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a summation operation. ReduceMin() = default; /// \brief Constructs a summation operation. diff --git a/ngraph/core/include/ngraph/op/mod.hpp b/ngraph/core/include/ngraph/op/mod.hpp index 73eab9f5f35198..e586b90a42781d 100644 --- a/ngraph/core/include/ngraph/op/mod.hpp +++ b/ngraph/core/include/ngraph/op/mod.hpp @@ -4,11 +4,7 @@ #pragma once -#include "ngraph/node.hpp" -#include "ngraph/op/op.hpp" -#include "ngraph/op/util/fused_op.hpp" - -NGRAPH_SUPPRESS_DEPRECATED_START +#include "ngraph/op/util/binary_elementwise_arithmetic.hpp" namespace ngraph { @@ -18,34 +14,28 @@ namespace ngraph { /// \brief Mod returns an element-wise division reminder with two given tensors applying /// multi-directional broadcast rules. - class NGRAPH_API Mod : public ngraph::op::util::FusedOp + class NGRAPH_API Mod : public util::BinaryElementwiseArithmetic { public: static constexpr NodeTypeInfo type_info{"Mod", 0}; const NodeTypeInfo& get_type_info() const override { return type_info; } - Mod(); /// \brief Constructs a Mod node. 
+ Mod() + : util::BinaryElementwiseArithmetic(AutoBroadcastSpec::NUMPY) + { + } /// /// \param A - Dividend tensor /// \param B - Divisor tensor /// \param auto_broadcast Auto broadcast specification Mod(const Output& A, const Output& B, - const AutoBroadcastSpec& auto_broadcast = AutoBroadcastType::NUMPY); - - bool visit_attributes(AttributeVisitor& visitor) override; - virtual OutputVector decompose_op() const override; + const AutoBroadcastSpec& auto_broadcast = + AutoBroadcastSpec(AutoBroadcastType::NUMPY)); virtual std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - const AutoBroadcastSpec& get_auto_broadcast() const { return m_auto_broadcast; } - - private: - AutoBroadcastSpec m_auto_broadcast; }; - } + } // namespace v1 } } - -NGRAPH_SUPPRESS_DEPRECATED_END diff --git a/ngraph/core/include/ngraph/op/reduce_l1.hpp b/ngraph/core/include/ngraph/op/reduce_l1.hpp index b09c9398be73e5..1329b5d9e22c27 100644 --- a/ngraph/core/include/ngraph/op/reduce_l1.hpp +++ b/ngraph/core/include/ngraph/op/reduce_l1.hpp @@ -19,8 +19,7 @@ namespace ngraph class NGRAPH_API ReduceL1 : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceL1", 4}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a reducet L1-norm operation. ReduceL1() = default; /// \brief Constructs a reduce L1-norm operation. diff --git a/ngraph/core/include/ngraph/op/reduce_l2.hpp b/ngraph/core/include/ngraph/op/reduce_l2.hpp index 3841d359141f56..1daa8697acde7c 100644 --- a/ngraph/core/include/ngraph/op/reduce_l2.hpp +++ b/ngraph/core/include/ngraph/op/reduce_l2.hpp @@ -18,8 +18,7 @@ namespace ngraph class NGRAPH_API ReduceL2 : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceL2", 4}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a reducet L2-norm operation. ReduceL2() = default; /// \brief Constructs a reduce L2-norm operation. diff --git a/ngraph/core/include/ngraph/op/reduce_mean.hpp b/ngraph/core/include/ngraph/op/reduce_mean.hpp index 6eca8555be3fea..9f0f3bdb2629fe 100644 --- a/ngraph/core/include/ngraph/op/reduce_mean.hpp +++ b/ngraph/core/include/ngraph/op/reduce_mean.hpp @@ -16,8 +16,7 @@ namespace ngraph class NGRAPH_API ReduceMean : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceMean", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; ReduceMean() = default; /// \param arg The tensor to be summed. diff --git a/ngraph/core/include/ngraph/op/reduce_prod.hpp b/ngraph/core/include/ngraph/op/reduce_prod.hpp index c54b87a64b9a03..b3904a76da9052 100644 --- a/ngraph/core/include/ngraph/op/reduce_prod.hpp +++ b/ngraph/core/include/ngraph/op/reduce_prod.hpp @@ -18,8 +18,7 @@ namespace ngraph class NGRAPH_API ReduceProd : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceProd", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a product reduction operation. ReduceProd() = default; /// \brief Constructs a product reduction operation. 
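Side note on the Mod rework above (illustration only, assuming the usual opset semantics): Mod keeps the sign of the dividend (remainder of truncated division), while FloorMod, whose reference is fixed further down in floor_mod.hpp, keeps the sign of the divisor. NumPy exposes both flavours:

import numpy as np

x, y = np.int64(-7), np.int64(3)

print(np.fmod(x, y))   # -1 : truncated-division remainder, sign of the dividend (Mod)
print(np.mod(x, y))    #  2 : floored remainder, sign of the divisor (FloorMod)

# the floor_mod.hpp fix: with integer operands C++ computes -7 / 3 as -2 (truncation),
# so std::floor has nothing left to floor and x - y * (-2) yields -1; dividing in
# floating point first gives floor(-2.33...) = -3 and the expected result 2
print(x - y * np.floor(x / np.float64(y)))   # 2.0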
diff --git a/ngraph/core/include/ngraph/op/reduce_sum.hpp b/ngraph/core/include/ngraph/op/reduce_sum.hpp index 8becb286f6387d..2de81ee71fff3d 100644 --- a/ngraph/core/include/ngraph/op/reduce_sum.hpp +++ b/ngraph/core/include/ngraph/op/reduce_sum.hpp @@ -65,8 +65,7 @@ namespace ngraph class NGRAPH_API ReduceSum : public util::ArithmeticReductionKeepDims { public: - static constexpr NodeTypeInfo type_info{"ReduceSum", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a summation operation. ReduceSum() = default; /// \brief Constructs a summation operation. diff --git a/ngraph/core/include/ngraph/op/scatter_nd_update.hpp b/ngraph/core/include/ngraph/op/scatter_nd_update.hpp index ae646d6980abcb..3fccbdee97ae91 100644 --- a/ngraph/core/include/ngraph/op/scatter_nd_update.hpp +++ b/ngraph/core/include/ngraph/op/scatter_nd_update.hpp @@ -32,6 +32,8 @@ namespace ngraph virtual std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + bool evaluate(const HostTensorVector& outputs, + const HostTensorVector& inputs) const override; }; } using v3::ScatterNDUpdate; diff --git a/ngraph/core/include/ngraph/op/squeeze.hpp b/ngraph/core/include/ngraph/op/squeeze.hpp index 58d0259ab3aee2..28cd28afa340c0 100644 --- a/ngraph/core/include/ngraph/op/squeeze.hpp +++ b/ngraph/core/include/ngraph/op/squeeze.hpp @@ -39,6 +39,8 @@ namespace ngraph virtual std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool is_dynamic() const override; }; } using v0::Squeeze; diff --git a/ngraph/core/include/ngraph/op/util/arithmetic_reduction.hpp b/ngraph/core/include/ngraph/op/util/arithmetic_reduction.hpp index acc5e22b9f68a9..893c54664c0046 100644 --- a/ngraph/core/include/ngraph/op/util/arithmetic_reduction.hpp +++ b/ngraph/core/include/ngraph/op/util/arithmetic_reduction.hpp @@ -21,11 +21,6 @@ namespace ngraph /// \brief Constructs an arithmetic reduction operation. ArithmeticReduction(); - /// \brief Constructs an arithmetic reduction operation. - /// - /// \param arg Output that produces the first input tensor. - /// \param reduction_axes The axis positions (0-based) to be eliminated. - ArithmeticReduction(const Output& arg, const AxisSet& reduction_axes); /// \brief Constructs an arithmetic reduction operation. /// /// \param arg Output that produces the first input tensor. @@ -33,6 +28,7 @@ namespace ngraph ArithmeticReduction(const Output& arg, const Output& reduction_axes); public: + NGRAPH_RTTI_DECLARATION; void validate_and_infer_types() override; /// \return true if reduction axes are constant else false. diff --git a/ngraph/core/include/ngraph/op/util/arithmetic_reductions_keep_dims.hpp b/ngraph/core/include/ngraph/op/util/arithmetic_reductions_keep_dims.hpp index 5398bb53394ccc..f92d282ce42c5c 100644 --- a/ngraph/core/include/ngraph/op/util/arithmetic_reductions_keep_dims.hpp +++ b/ngraph/core/include/ngraph/op/util/arithmetic_reductions_keep_dims.hpp @@ -28,6 +28,7 @@ namespace ngraph bool visit_attributes(AttributeVisitor& visitor) override; public: + NGRAPH_RTTI_DECLARATION; void validate_and_infer_types() override; /// \return If set to 1 it holds axes that are used for reduction. 
diff --git a/ngraph/core/include/ngraph/op/util/logical_reduction.hpp b/ngraph/core/include/ngraph/op/util/logical_reduction.hpp index 9508887e4a9d6a..e5d0d95ba38631 100644 --- a/ngraph/core/include/ngraph/op/util/logical_reduction.hpp +++ b/ngraph/core/include/ngraph/op/util/logical_reduction.hpp @@ -32,6 +32,7 @@ namespace ngraph LogicalReduction(const Output& arg, const Output& reduction_axes); public: + NGRAPH_RTTI_DECLARATION; void validate_and_infer_types() override; /// \return true if reduction axes are constant else false. diff --git a/ngraph/core/include/ngraph/op/util/logical_reduction_keep_dims.hpp b/ngraph/core/include/ngraph/op/util/logical_reduction_keep_dims.hpp index e7a5d8ca44830f..340f377f67ff3e 100644 --- a/ngraph/core/include/ngraph/op/util/logical_reduction_keep_dims.hpp +++ b/ngraph/core/include/ngraph/op/util/logical_reduction_keep_dims.hpp @@ -28,6 +28,7 @@ namespace ngraph bool visit_attributes(AttributeVisitor& visitor) override; public: + NGRAPH_RTTI_DECLARATION; void validate_and_infer_types() override; /// \return If set to 1 it holds axes that are used for reduction. diff --git a/ngraph/core/include/ngraph/opsets/opset7_tbl.hpp b/ngraph/core/include/ngraph/opsets/opset7_tbl.hpp index ecdf62a94cf135..8a3d0d6ef9bc5d 100644 --- a/ngraph/core/include/ngraph/opsets/opset7_tbl.hpp +++ b/ngraph/core/include/ngraph/opsets/opset7_tbl.hpp @@ -42,7 +42,7 @@ NGRAPH_OP(ExtractImagePatches, ngraph::op::v3) NGRAPH_OP(FakeQuantize, ngraph::op::v0) NGRAPH_OP(Floor, ngraph::op::v0) NGRAPH_OP(FloorMod, ngraph::op::v1) -NGRAPH_OP(Gather, ngraph::op::v1) +NGRAPH_OP(Gather, ngraph::op::v7) NGRAPH_OP(GatherTree, ngraph::op::v1) NGRAPH_OP(Greater, ngraph::op::v1) NGRAPH_OP(GreaterEqual, ngraph::op::v1) diff --git a/ngraph/core/include/ngraph/util.hpp b/ngraph/core/include/ngraph/util.hpp index 3295eae9ba302f..49605f00e1678f 100644 --- a/ngraph/core/include/ngraph/util.hpp +++ b/ngraph/core/include/ngraph/util.hpp @@ -215,9 +215,15 @@ namespace ngraph NGRAPH_API AxisVector get_default_order(size_t rank); + NGRAPH_API + AxisVector get_default_order(const Rank& rank); + NGRAPH_API AxisVector get_default_order(const Shape& shape); + NGRAPH_API + AxisVector get_default_order(const PartialShape& shape); + // // EnumMask is intended to work with a scoped enum type. It's used to store // a combination of enum values and provides easy access and manipulation diff --git a/ngraph/core/include/ngraph/validation_util.hpp b/ngraph/core/include/ngraph/validation_util.hpp index 659a190ac1e141..bfac7306ccb117 100644 --- a/ngraph/core/include/ngraph/validation_util.hpp +++ b/ngraph/core/include/ngraph/validation_util.hpp @@ -35,6 +35,28 @@ namespace ngraph bool is_window_all_in_padding_allowed, bool ceil_mode = false); + /// \brief Validates input shape ranks and infers convolution forward output shape. + /// + /// \param[in] node Node with convolution operation. + /// \param[in] data_batch_pshape Partial shape of data batch input. + /// \param[in] filters_pshape Partial shape of filters input. + /// \param[in] auto_pad Type of padding. + /// \param strides Strides. + /// \param dilations Dilations. + /// \param pads_begin Pads begin. + /// \param pads_end Pads end. + /// + /// \return Partial shape of the output. 
+ PartialShape + validate_and_infer_convolution_forward_output_shape(const Node* node, + const PartialShape& data_batch_pshape, + const PartialShape& filters_pshape, + const op::PadType auto_pad, + Strides& strides, + Strides& dilations, + CoordinateDiff& pads_begin, + CoordinateDiff& pads_end); + NGRAPH_API PartialShape infer_convolution_forward(const Node* node, const PartialShape& data_batch_shape, diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/binary_convolution.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/binary_convolution.hpp index 2b04fcf6ee1433..daf1780519475f 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/binary_convolution.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/binary_convolution.hpp @@ -127,36 +127,6 @@ namespace ngraph } } - void validate_convolution_parameters(const Shape& in_shape, - const Shape& f_shape, - const Strides& strides, - const Strides& dilations, - const CoordinateDiff& pads_begin, - const CoordinateDiff& pads_end) - { - // this implementation supports 1D, 2D and 3D convolutions - NGRAPH_CHECK(in_shape.size() >= 3 && in_shape.size() <= 5, - "Unsupported input rank: ", - in_shape); - - NGRAPH_CHECK(in_shape.size() == f_shape.size(), - "Incompatible input ranks: ", - in_shape.size(), - " and ", - f_shape.size()); - - const auto spatial_dims = in_shape.size() - 2; - NGRAPH_CHECK(strides.size() == spatial_dims, - "Strides not definied for all and only spatial dimensions"); - - NGRAPH_CHECK(dilations.size() == spatial_dims, - "Dilations not defined for all and only spatial dimensions"); - - NGRAPH_CHECK((pads_begin.size() == pads_end.size()) && - (pads_begin.size() == spatial_dims), - "Pads not defined for all and only spatial dimensions"); - } - template void binary_convolution(const T_IN* in, const T_F* f, @@ -171,7 +141,7 @@ namespace ngraph const float pad_value) { validate_convolution_parameters( - in_shape, f_shape, strides, dilations, pads_begin, pads_end); + in_shape, f_shape, out_shape, strides, dilations, pads_begin, pads_end); // here we are converting all param types to int's to avoid arithmetic issues // (e.g signed + unsigned) in indexes calculation later diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/convolution.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/convolution.hpp index 5643bfd6a843fa..2e52d88753bfcf 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/convolution.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/convolution.hpp @@ -159,6 +159,81 @@ namespace ngraph std::prev(filter_shape.end(), spatial_rank), missing_dims, 1); } } + + void infer_forward_conv_output_shape(const Shape& in_spatial_shape, + const Shape& f_spatial_shape, + Shape& out_spatial_shape, + const Strides& strides, + const Strides& dilations, + const CoordinateDiff& pads_begin, + const CoordinateDiff& pads_end) + { + for (size_t idx = 0; idx < in_spatial_shape.size(); idx++) + { + size_t in_padded_dim = + in_spatial_shape[idx] + pads_begin[idx] + pads_end[idx]; + size_t filter_dilated_dim = dilations[idx] * (f_spatial_shape[idx] - 1) + 1; + size_t out_spatial_dim = + (in_padded_dim - filter_dilated_dim) / strides[idx] + 1; + out_spatial_shape.push_back(out_spatial_dim); + } + } + + void validate_convolution_parameters(const Shape& in_shape, + const Shape& f_shape, + const Shape& out_shape, + const Strides& strides, + const Strides& dilations, + const CoordinateDiff& pads_begin, + const CoordinateDiff& pads_end) + { + // this 
implementation supports 1D, 2D and 3D convolutions + NGRAPH_CHECK(in_shape.size() >= 3 && in_shape.size() <= 5, + "Unsupported input rank: ", + in_shape); + + NGRAPH_CHECK(in_shape.size() == f_shape.size(), + "Incompatible input ranks: ", + in_shape.size(), + " and ", + f_shape.size()); + + NGRAPH_CHECK(in_shape[in_channel_axis] == f_shape[filter_in_ch_axis], + "Incompatible input channels in data batch and filters shapes: ", + in_shape[in_channel_axis], + " and ", + f_shape[filter_in_ch_axis]); + + NGRAPH_CHECK(in_shape.size() == out_shape.size(), + "Incompatible input and output ranks: ", + in_shape.size(), + " and ", + out_shape.size()); + + const auto spatial_dims = in_shape.size() - 2; + NGRAPH_CHECK(strides.size() == spatial_dims, + "Strides not definied for all and only spatial dimensions"); + + NGRAPH_CHECK(dilations.size() == spatial_dims, + "Dilations not defined for all and only spatial dimensions"); + + NGRAPH_CHECK((pads_begin.size() == pads_end.size()) && + (pads_begin.size() == spatial_dims), + "Pads not defined for all and only spatial dimensions"); + + Shape out_spatial_shape{std::next(out_shape.begin(), 2), std::end(out_shape)}; + Shape infered_out_spatial_shape{}; + infer_forward_conv_output_shape( + Shape{std::next(in_shape.begin(), 2), std::end(in_shape)}, + Shape{std::next(f_shape.begin(), 2), std::end(f_shape)}, + infered_out_spatial_shape, + strides, + dilations, + pads_begin, + pads_end); + NGRAPH_CHECK(out_spatial_shape == infered_out_spatial_shape, + "Incorrect output shape provided"); + } } template @@ -169,23 +244,17 @@ namespace ngraph const Shape& f_shape, const Shape& out_shape, const Strides& strides, - const Strides& dilation, + const Strides& dilations, const CoordinateDiff& pads_begin, const CoordinateDiff& pads_end) { - // this implementation supports 1D, 2D and 3D convolutions - NGRAPH_CHECK(in_shape.size() >= 3 && in_shape.size() <= 5, - "Unsupported input rank: ", - in_shape); - - NGRAPH_CHECK(f_shape.size() >= 3 && f_shape.size() <= 5, - "Unsupported kernel rank: ", - f_shape); + validate_convolution_parameters( + in_shape, f_shape, out_shape, strides, dilations, pads_begin, pads_end); // here we are converting all param types to int's to avoid arithmetic issues // (e.g signed + unsigned) in indexes calculation later - ConvolutionParams params{strides, dilation, pads_begin, pads_end}; + ConvolutionParams params{strides, dilations, pads_begin, pads_end}; // here we are extending spatial dimensions to 3D, because we are going to use 3D // convolution implementation to convolve also in 1D & 2D case diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/floor_mod.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/floor_mod.hpp index c7742e65f7f733..3ae7bbebeac6c6 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/floor_mod.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/floor_mod.hpp @@ -26,7 +26,10 @@ namespace ngraph { autobroadcast_binop( arg0, arg1, out, arg0_shape, arg1_shape, broadcast_spec, [](T x, T y) -> T { - return x - y * std::floor(x / y); + // Cast to double is needed for integer input, + // otherwise std::floor will act like std::trunc + const double divisor = static_cast(y); + return x - y * std::floor(x / divisor); }); } } diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/gather.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/gather.hpp index 365bdf9ac2509a..fcdf1c2a122cc1 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/gather.hpp 
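Side note on infer_forward_conv_output_shape above (a quick check with made-up sizes, not part of the patch): the per-dimension computation is the standard forward-convolution formula.

# out = (in + pad_begin + pad_end - dilation * (kernel - 1) - 1) // stride + 1
def conv_out_dim(in_dim, kernel, stride, dilation, pad_begin, pad_end):
    in_padded = in_dim + pad_begin + pad_end
    kernel_dilated = dilation * (kernel - 1) + 1
    return (in_padded - kernel_dilated) // stride + 1

# e.g. a 224x224 input, 7x7 kernel, stride 2, dilation 1, symmetric padding 3
assert conv_out_dim(224, kernel=7, stride=2, dilation=1, pad_begin=3, pad_end=3) == 112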
+++ b/ngraph/core/reference/include/ngraph/runtime/reference/gather.hpp @@ -6,9 +6,7 @@ #include -#include "ngraph/coordinate_range.hpp" -#include "ngraph/coordinate_transform.hpp" -#include "ngraph/runtime/reference/gather_nd.hpp" +#include "ngraph/shape.hpp" #include "utils/span.hpp" namespace ngraph @@ -17,107 +15,54 @@ namespace ngraph { namespace reference { - namespace - { - template - Shape to_shape(const Container& c) - { - return Shape(begin(c), end(c)); - } - - template - std::vector - join(const Container& c1, const Container& c2, const Container& c3) - { - using container_value_type = - typename std::remove_cv::type; - static_assert(std::is_same::value, - "Expect same type in container"); - std::vector ret; - ret.reserve(c1.size() + c2.size() + c3.size()); - std::copy(begin(c1), end(c1), std::back_inserter(ret)); - std::copy(begin(c2), end(c2), std::back_inserter(ret)); - std::copy(begin(c3), end(c3), std::back_inserter(ret)); - return ret; - } - - const auto only_one = [] { return coordinates::index(Shape{1}); }; - } // namespace template - void gather(const T* const params, + void gather(const T* const data, const U* const indices, - T* const out, - const Shape& params_shape, + T* out, + const Shape& data_shape, const Shape& indices_shape, const Shape& out_shape, - size_t axis) + size_t axis, + size_t batch_dims = 0) { - using std::next; - assert(std::memset(out, 0, shape_size(out_shape) * sizeof(T))); - - const auto params_axes_part = span(params_shape).subspan(0, axis); - - NGRAPH_CHECK(params_shape.size() >= axis, "Not enough axes in param_shape."); - - const auto remainder_part_shape = span(params_shape).subspan(axis + 1); - - const auto found_out_shape = - join(params_axes_part, span(indices_shape), remainder_part_shape); - - NGRAPH_CHECK(found_out_shape == out_shape, - "Output shape mismatch with calculations"); - - const auto batch_shape = span(params_shape).subspan(axis); - - const auto batch_size = shape_size(batch_shape); - - const auto copy_size = shape_size(remainder_part_shape); - - const size_t copy_round_in_batch = - indices_shape.size() > 1 - ? shape_size(span(indices_shape.data(), indices_shape.size() - 1)) - : 1; - const size_t round_batch_offset = indices_shape.empty() ? 1 : indices_shape.back(); - - auto dst = out; - - auto gather_range = params_axes_part.empty() - ? 
only_one() - : coordinates::index(to_shape(params_axes_part)); - for (auto i : gather_range) - { - auto batch_index = i.begin_index; - for (size_t batch = 0; batch != i.element_number; - batch_index += i.step, ++batch) + // flattened shapes + int64_t batch_size = shape_size(span(data_shape).subspan(0, batch_dims)); + int64_t outer_size = + shape_size(span(data_shape).subspan(batch_dims, axis - batch_dims)); + int64_t indices_size = shape_size(span(indices_shape).subspan(batch_dims)); + int64_t inner_size = shape_size(span(data_shape).subspan(axis + 1)); + + int64_t batch_data_mul = shape_size(span(data_shape).subspan(batch_dims)); + int64_t batch_out_mul = shape_size(span(out_shape).subspan(batch_dims)); + int64_t batch_indices_mul = shape_size(span(indices_shape).subspan(batch_dims)); + + int64_t axis_size = data_shape[axis]; + int64_t data_offset, out_offset, idx; + + for (int64_t batch = 0; batch < batch_size; batch++) + for (int64_t outer_idx = 0; outer_idx < outer_size; outer_idx++) { - const auto batch_offset = batch_index * batch_size; - assert(batch_offset < shape_size(params_shape)); - for (size_t round = 0; round != copy_round_in_batch; ++round) + data_offset = batch_data_mul * batch + inner_size * axis_size * outer_idx; + out_offset = batch_out_mul * batch + indices_size * inner_size * outer_idx; + for (int64_t i = 0; i < indices_size; i++) { - const U* input_indices = indices + round * round_batch_offset; - const auto indices_no = - indices_shape.empty() ? 1 : indices_shape.back(); - - assert(!batch_shape.empty()); - for (size_t ii = 0; ii != indices_no; ++ii) - { - const auto positive_input_index = - input_indices[ii] < 0 ? batch_shape.front() + input_indices[ii] - : input_indices[ii]; - - const auto src_offset = - batch_offset + copy_size * positive_input_index; - - const auto src_begin = next(params, src_offset); - const auto src_end = next(src_begin, copy_size); - - std::copy(src_begin, src_end, dst); - dst += copy_size; - } + idx = indices[i + batch_indices_mul * batch]; + // clang-format off + // todo: check if bound check is needed + // if (idx >= axis_size || (idx < 0 && -idx >= axis_size)) + // throw std::domain_error{"indices values of Gather exceed size along axis"}; + // clang-format on + if (idx < 0) + idx += axis_size; + + const auto src_begin = std::next(data, data_offset + inner_size * idx); + const auto src_end = std::next(src_begin, inner_size); + const auto out_ptr = std::next(out, out_offset + inner_size * i); + std::copy(src_begin, src_end, out_ptr); } } - } } + } // namespace reference } // namespace runtime } // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution.hpp index 421b81ac09290b..ff56719cd63ae7 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution.hpp @@ -10,6 +10,7 @@ namespace { constexpr size_t filter_group_axis = 0; + constexpr size_t filter_in_ch_axis = 2; constexpr size_t in_batch_axis = 0; constexpr size_t in_channel_axis = 1; constexpr size_t out_batch_axis = 0; @@ -22,6 +23,58 @@ namespace ngraph { namespace reference { + void validate_group_convolution_parameters(const Shape& in_shape, + const Shape& f_shape, + const Shape& out_shape, + const Strides& strides, + const Strides& dilations, + const CoordinateDiff& pads_begin, + const CoordinateDiff& pads_end) + { + // this implementation supports 
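Side note on the rewritten gather reference above (NumPy sketch with made-up shapes, not part of the patch): with data of shape [2, 5, 3], indices of shape [2, 2], axis = 1 and batch_dims = 1, the flattened batch/outer/indices/inner loop amounts to one independent gather per shared batch.

import numpy as np

data = np.arange(2 * 5 * 3).reshape(2, 5, 3)   # [batch, axis, inner]
indices = np.array([[0, 4], [2, -1]])          # negative indices wrap, as in the loop above
axis, batch_dims = 1, 1

# output shape: data_shape[:axis] + indices_shape[batch_dims:] + data_shape[axis + 1:]
out = np.stack([data[b][indices[b]] for b in range(data.shape[0])])
assert out.shape == (2, 2, 3)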
1D, 2D and 3D convolutions + NGRAPH_CHECK(in_shape.size() >= 3 && in_shape.size() <= 5, + "Unsupported input rank: ", + in_shape); + + NGRAPH_CHECK(in_shape.size() + 1 == f_shape.size(), + "Unsupported filter rank: ", + f_shape.size()); + + NGRAPH_CHECK(in_shape.size() == out_shape.size(), + "Incompatible input and output ranks: ", + in_shape.size(), + " and ", + out_shape.size()); + + const size_t groups = f_shape[filter_group_axis]; + const size_t in_channels = in_shape[in_channel_axis]; + NGRAPH_CHECK(in_channels % groups == 0, + "Input channels of data batch input must be multiple of groups"); + const Shape in_group_shape = [&]() { + Shape new_shape{in_shape}; + new_shape[in_channel_axis] /= groups; + return new_shape; + }(); + + const size_t out_channels = out_shape[out_channel_axis]; + NGRAPH_CHECK(out_channels % groups == 0, + "Output channels of output must be multiple of groups"); + const Shape out_group_shape = [&]() { + Shape new_shape{out_shape}; + new_shape[out_channel_axis] /= groups; + return new_shape; + }(); + + const Shape f_group_shape{std::next(f_shape.begin(), 1), std::end(f_shape)}; + validate_convolution_parameters(in_group_shape, + f_group_shape, + out_group_shape, + strides, + dilations, + pads_begin, + pads_end); + } + template = 4 && filter_shape.size() <= 6, - "Unsupported kernel rank: ", - filter_shape); + validate_group_convolution_parameters( + in_shape, filter_shape, out_shape, strides, dilation, pads_begin, pads_end); const size_t group_count = filter_shape[filter_group_axis]; @@ -69,11 +121,6 @@ namespace ngraph }(); const size_t group_out_size = shape_size(group_out_shape); - // TODO: delete in_dilation when Convolution PR (#3922) is merged - // in_dilation parameter is needed only for old implementation (CoordinateTransform - // based) - Strides in_dilation(in_shape.size()); - std::fill(in_dilation.begin(), in_dilation.end(), 1); for (size_t batch_idx = 0; batch_idx < in_shape[in_batch_axis]; ++batch_idx) { group_filter = f; @@ -88,8 +135,7 @@ namespace ngraph strides, dilation, pads_begin, - pads_end, - in_dilation); + pads_end); group_batch += group_batch_size; group_filter += group_filter_size; group_out += group_out_size; diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution_backprop_data.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution_backprop_data.hpp index 9355c80e026640..b70c0d3ed9ab9d 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution_backprop_data.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/group_convolution_backprop_data.hpp @@ -13,6 +13,127 @@ namespace ngraph { namespace reference { + void infer_backward_conv_output_shape(const Shape& in_spatial_shape, + const Shape& f_spatial_shape, + Shape& out_spatial_shape, + const Strides& strides, + const Strides& dilations, + const CoordinateDiff& pads_begin, + const CoordinateDiff& pads_end) + { + for (size_t idx = 0; idx < in_spatial_shape.size(); idx++) + { + size_t in_padded_dim = (in_spatial_shape[idx] - 1) * strides[idx] - + pads_begin[idx] - pads_end[idx]; + size_t filter_dilated_dim = dilations[idx] * (f_spatial_shape[idx] - 1) + 1; + size_t out_spatial_dim = in_padded_dim + filter_dilated_dim; + out_spatial_shape.push_back(out_spatial_dim); + } + } + + void validate_convolution_backprop_data_parameters(const Shape& in_shape, + const Shape& f_shape, + const Shape& out_shape, + const Strides& strides, + const Strides& dilations, + const CoordinateDiff& pads_begin, + const 
CoordinateDiff& pads_end) + { + // this implementation supports 1D, 2D and 3D convolutions + NGRAPH_CHECK(in_shape.size() >= 3 && in_shape.size() <= 5, + "Unsupported input rank: ", + in_shape); + NGRAPH_CHECK(in_shape.size() == f_shape.size(), + "Incompatible input ranks: ", + in_shape.size(), + " and ", + f_shape.size()); + NGRAPH_CHECK(in_shape[in_channel_axis] == f_shape[filter_in_ch_axis], + "Incompatible input channels in data batch and filters shapes: ", + in_shape[in_channel_axis], + " and ", + f_shape[filter_in_ch_axis]); + NGRAPH_CHECK(in_shape.size() == out_shape.size(), + "Incompatible input and output ranks: ", + in_shape.size(), + " and ", + out_shape.size()); + const auto spatial_dims = in_shape.size() - 2; + NGRAPH_CHECK(strides.size() == spatial_dims, + "Strides not definied for all and only spatial dimensions"); + NGRAPH_CHECK(dilations.size() == spatial_dims, + "Dilations not defined for all and only spatial dimensions"); + NGRAPH_CHECK((pads_begin.size() == pads_end.size()) && + (pads_begin.size() == spatial_dims), + "Pads not defined for all and only spatial dimensions"); + + Shape out_spatial_shape{std::next(out_shape.begin(), 2), std::end(out_shape)}; + Shape infered_out_spatial_shape{}; + infer_backward_conv_output_shape( + Shape{std::next(in_shape.begin(), 2), std::end(in_shape)}, + Shape{std::next(f_shape.begin(), 2), std::end(f_shape)}, + infered_out_spatial_shape, + strides, + dilations, + pads_begin, + pads_end); + NGRAPH_CHECK(out_spatial_shape == infered_out_spatial_shape, + "Incorrect output shape provided"); + } + + void validate_group_convolution_backprop_data_parameters( + const Shape& in_shape, + const Shape& f_shape, + const Shape& out_shape, + const Strides& strides, + const Strides& dilations, + const CoordinateDiff& pads_begin, + const CoordinateDiff& pads_end) + { + // this implementation supports 1D, 2D and 3D convolutions + NGRAPH_CHECK(in_shape.size() >= 3 && in_shape.size() <= 5, + "Unsupported input rank: ", + in_shape); + + NGRAPH_CHECK(in_shape.size() + 1 == f_shape.size(), + "Unsupported filter rank: ", + f_shape.size()); + + NGRAPH_CHECK(in_shape.size() == out_shape.size(), + "Incompatible input and output ranks: ", + in_shape.size(), + " and ", + out_shape.size()); + + const size_t groups = f_shape[filter_group_axis]; + const size_t in_channels = in_shape[in_channel_axis]; + NGRAPH_CHECK(in_channels % groups == 0, + "Input channels of data batch input must be multiple of groups"); + const Shape in_group_shape = [&]() { + Shape new_shape{in_shape}; + new_shape[in_channel_axis] /= groups; + return new_shape; + }(); + + const size_t out_channels = out_shape[out_channel_axis]; + NGRAPH_CHECK(out_channels % groups == 0, + "Output channels of output must be multiple of groups"); + const Shape out_group_shape = [&]() { + Shape new_shape{out_shape}; + new_shape[out_channel_axis] /= groups; + return new_shape; + }(); + + const Shape f_group_shape{std::next(f_shape.begin(), 1), std::end(f_shape)}; + validate_convolution_backprop_data_parameters(in_group_shape, + f_group_shape, + out_group_shape, + strides, + dilations, + pads_begin, + pads_end); + } + template axes_vector = std::vector(axes, axes + axes_shape[0]); + for (auto& axis : axes_vector) + { + if (axis < 0) + axis += arg_shape.size(); + } + + std::vector shift_vector = std::vector(arg_shape.size(), 0); + for (size_t i = 0; i < axes_vector.size(); i++) + { + int64_t shift_sum = shift_vector[axes_vector[i]] + shift[i]; + int64_t dim_size = arg_shape[axes_vector[i]]; + // the modulo which 
supports negative values + shift_vector[axes_vector[i]] = (shift_sum % dim_size + dim_size) % dim_size; + } + + size_t last_dim = arg_shape[arg_shape.size() - 1]; + size_t start = 0; + while (start < shape_size(arg_shape)) + { + size_t left_block_size = last_dim - shift_vector[shift_vector.size() - 1]; + size_t p1 = start; + size_t p2 = start + left_block_size; + size_t spanned_shape_size = 1; + for (int dim = arg_shape.size() - 1; dim >= 0; dim--) + { + p1 = shift_pos(p1, shift_vector[dim], spanned_shape_size, arg_shape[dim]); + p2 = shift_pos(p2, shift_vector[dim], spanned_shape_size, arg_shape[dim]); + spanned_shape_size *= arg_shape[dim]; + } + + if (left_block_size > 0) + memcpy(out + p1 * elem_size, + arg + start * elem_size, + left_block_size * elem_size); + + size_t right_block_size = last_dim - left_block_size; + if (right_block_size > 0) + memcpy(out + p2 * elem_size, + arg + (start + left_block_size) * elem_size, + right_block_size * elem_size); + + start += last_dim; + } + } + } + } +} diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/utils/span.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/utils/span.hpp index 8318c26694984f..1cee089e7034be 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/utils/span.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/utils/span.hpp @@ -6,6 +6,7 @@ #include #include +#include #include namespace ngraph @@ -81,9 +82,21 @@ namespace ngraph constexpr Element& front() const noexcept { return *m_data; } constexpr Element& back() const noexcept { return *(m_data + (m_size - 1)); } constexpr Element& operator[](std::size_t idx) const { return *(m_data + idx); } - Element& at(std::size_t idx) const { return *(m_data + idx); } + Element& at(std::size_t idx) const + { + if (idx >= m_size) + { + throw std::out_of_range{"index out of range"}; + } + return *(m_data + idx); + } + + /** + * @brief return sub part of span starting from offset and not greater than size + * + */ Span subspan(std::size_t offset, - std::size_t size = std::numeric_limits::max()) + std::size_t size = std::numeric_limits::max()) const { if (offset > m_size) { @@ -92,6 +105,41 @@ namespace ngraph return {m_data + offset, std::min(size, m_size - offset)}; } + /** + * @brief drop number of elements from front + * + */ + Span& drop_front(std::size_t number_of_elements) + { + if (number_of_elements < m_size) + { + m_data += number_of_elements; + m_size -= number_of_elements; + } + else + { + m_size = 0; + } + return *this; + } + + /** + * @brief drop number of elements from back + * + */ + Span& drop_back(std::size_t number_of_elements) + { + if (number_of_elements < m_size) + { + m_size -= number_of_elements; + } + else + { + m_size = 0; + } + return *this; + } + private: Element* m_data{nullptr}; std::size_t m_size{0}; diff --git a/ngraph/core/src/itt.hpp b/ngraph/core/src/itt.hpp index 8670efc3cd41f2..2c20ae5ae2ba7b 100644 --- a/ngraph/core/src/itt.hpp +++ b/ngraph/core/src/itt.hpp @@ -41,7 +41,8 @@ OV_ITT_DOMAIN(SIMPLE_ngraph_pass); " is disabled!") #define NGRAPH_PASS_CALLBACK(matcher) #else -#define NGRAPH_OP_SCOPE(region) OV_ITT_SCOPED_TASK(ngraph::itt::domains::ngraph_op, #region) +#define NGRAPH_OP_SCOPE(region) \ + OV_ITT_SCOPED_TASK(ngraph::itt::domains::ngraph_op, OV_PP_TOSTRING(region)) #define NGRAPH_PASS_CALLBACK(matcher) #endif diff --git a/ngraph/core/src/op/binary_convolution.cpp b/ngraph/core/src/op/binary_convolution.cpp index d65deafab317fe..328097c66e0fa7 100644 --- 
a/ngraph/core/src/op/binary_convolution.cpp +++ b/ngraph/core/src/op/binary_convolution.cpp @@ -66,111 +66,22 @@ void op::v1::BinaryConvolution::validate_and_infer_types() const PartialShape& filters_pshape = get_input_partial_shape(1); NODE_VALIDATION_CHECK(this, - data_batch_et.is_real(), - "Data batch element type must be float point. Got: ", + data_batch_et.is_real() || data_batch_et.is_integral_number(), + "Data batch element type must be numeric. Got: ", data_batch_et); // TODO: Add NodeValidationCheck to filters et once u1 is supported in nGraph Python API // (#49517) - NODE_VALIDATION_CHECK(this, - data_batch_pshape.rank().compatible(filters_pshape.rank()), - "Shapes for data batch and filters must have same rank. Got: ", - data_batch_pshape, - "and ", - filters_pshape); - - if (m_strides.size() == 0) - { - m_strides = conv_default_strides(this, data_batch_pshape, filters_pshape); - } - - if (m_dilations.size() == 0) - { - m_dilations = conv_default_strides(this, data_batch_pshape, filters_pshape); - } - - if (m_pads_begin.size() == 0) - { - m_pads_begin = conv_default_padding(this, data_batch_pshape, filters_pshape); - } - - if (m_pads_end.size() == 0) - { - m_pads_end = conv_default_padding(this, data_batch_pshape, filters_pshape); - } - - PartialShape result_shape = PartialShape::dynamic(); - if (data_batch_pshape.rank().is_static() || filters_pshape.rank().is_static()) - { - const bool is_data_batch_ps_static = data_batch_pshape.rank().is_static(); - const auto output_ps_rank = - is_data_batch_ps_static ? data_batch_pshape.rank() : filters_pshape.rank(); - const auto num_spatial_dims = output_ps_rank.get_length() - 2; - - NODE_VALIDATION_CHECK(this, - m_strides.size() == num_spatial_dims, - "Strides should be defined for all and only spatial features."); - - NODE_VALIDATION_CHECK(this, - m_dilations.size() == num_spatial_dims, - "Dilations should be defined for all and only spatial features."); - - NODE_VALIDATION_CHECK(this, - m_pads_begin.size() == num_spatial_dims && - m_pads_end.size() == num_spatial_dims, - "Pads should be defined for all and only spatial features."); - - result_shape = std::vector(output_ps_rank.get_length(), Dimension::dynamic()); - if (data_batch_pshape.rank().is_static()) - { - result_shape[0] = data_batch_pshape[0]; // batch size - } - if (filters_pshape.rank().is_static()) - { - result_shape[1] = filters_pshape[0]; // filter channel size - } - if (m_auto_pad == PadType::SAME_UPPER || m_auto_pad == PadType::SAME_LOWER) - { - bool auto_padding_applied = false; - if (filters_pshape.rank().is_static() && filters_pshape.rank().get_length() > 2) - { - m_pads_begin.clear(); - m_pads_end.clear(); - - const PartialShape filter_spatial_shape = [filters_pshape]() { - vector filter_dims{filters_pshape}; - filter_dims.erase(filter_dims.begin(), filter_dims.begin() + 2); // Remove {O,I} - return PartialShape{filter_dims}; - }(); - - if (filter_spatial_shape.is_static()) - { - auto_padding_applied = try_apply_auto_padding(data_batch_pshape, - filter_spatial_shape.to_shape(), - m_strides, - m_dilations, - m_auto_pad, - m_pads_end, - m_pads_begin); - } - } - if (!auto_padding_applied) - { - set_output_type(0, data_batch_et, result_shape); - return; - } - } - - result_shape = infer_convolution_forward(this, - data_batch_pshape, - Strides(num_spatial_dims, 1), - m_pads_begin, - m_pads_end, - filters_pshape, - m_strides, - m_dilations); - } + PartialShape result_shape = + validate_and_infer_convolution_forward_output_shape(this, + data_batch_pshape, + filters_pshape, + 
m_auto_pad, + m_strides, + m_dilations, + m_pads_begin, + m_pads_end); set_output_type(0, data_batch_et, result_shape); } diff --git a/ngraph/core/src/op/clamp.cpp b/ngraph/core/src/op/clamp.cpp index 03849e286c5c3f..d2f9aa920e5800 100644 --- a/ngraph/core/src/op/clamp.cpp +++ b/ngraph/core/src/op/clamp.cpp @@ -31,6 +31,20 @@ namespace clamp bool rc = true; switch (arg->get_element_type()) { + TYPE_CASE(i8) + (arg, + out, + double_to_int(min, ceil_func), + double_to_int(max, floor_func), + count); + break; + TYPE_CASE(i16) + (arg, + out, + double_to_int(min, ceil_func), + double_to_int(max, floor_func), + count); + break; TYPE_CASE(i32) (arg, out, @@ -45,6 +59,20 @@ namespace clamp double_to_int(max, floor_func), count); break; + TYPE_CASE(u8) + (arg, + out, + double_to_int(min, ceil_func), + double_to_int(max, floor_func), + count); + break; + TYPE_CASE(u16) + (arg, + out, + double_to_int(min, ceil_func), + double_to_int(max, floor_func), + count); + break; TYPE_CASE(u32) (arg, out, @@ -61,6 +89,9 @@ namespace clamp break; TYPE_CASE(f16)(arg, out, static_cast(min), static_cast(max), count); break; + TYPE_CASE(bf16) + (arg, out, static_cast(min), static_cast(max), count); + break; TYPE_CASE(f32)(arg, out, static_cast(min), static_cast(max), count); break; default: rc = false; break; @@ -96,9 +127,19 @@ op::Clamp::Clamp(const Output& data, const double min, const double max) void op::Clamp::validate_and_infer_types() { - NODE_VALIDATION_CHECK( - this, m_min < m_max, "The 'min' parameter needs to be less than 'max' for Clamp"); - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + NGRAPH_OP_SCOPE(v0_Clamp_validate_and_infer_types); + const element::Type& input_et = get_input_element_type(0); + NODE_VALIDATION_CHECK(this, + input_et.is_integral_number() || input_et.is_real(), + "Input element type must be numeric. Got: ", + input_et); + NODE_VALIDATION_CHECK(this, + m_min <= m_max, + "Attribute 'min' must be less or equal than 'max'. 
Got: ", + m_min, + " and ", + m_max); + set_output_type(0, input_et, get_input_partial_shape(0)); } shared_ptr op::Clamp::clone_with_new_inputs(const OutputVector& new_args) const diff --git a/ngraph/core/src/op/convolution.cpp b/ngraph/core/src/op/convolution.cpp index e60969e331fcc1..2e831c960ada13 100644 --- a/ngraph/core/src/op/convolution.cpp +++ b/ngraph/core/src/op/convolution.cpp @@ -47,27 +47,11 @@ bool op::v1::Convolution::visit_attributes(AttributeVisitor& visitor) void op::v1::Convolution::validate_and_infer_types() { NGRAPH_OP_SCOPE(v1_Convolution_validate_and_infer_types); - const PartialShape& data_batch_shape = get_input_partial_shape(0); + const PartialShape& data_batch_pshape = get_input_partial_shape(0); element::Type data_batch_et = get_input_element_type(0); - const PartialShape& filters_shape = get_input_partial_shape(1); + const PartialShape& filters_pshape = get_input_partial_shape(1); element::Type filters_et = get_input_element_type(1); - PartialShape result_shape = PartialShape::dynamic(); - if (data_batch_shape.rank().is_static()) - { - result_shape = - std::vector(data_batch_shape.rank().get_length(), Dimension::dynamic()); - - if (data_batch_shape.rank().get_length() > 1) - { - result_shape[0] = data_batch_shape[0]; // batch size - } - if (filters_shape.rank().is_static() && filters_shape.rank().get_length() > 1) - { - result_shape[1] = filters_shape[0]; // filter channel size - } - } - element::Type result_et; NODE_VALIDATION_CHECK( this, @@ -78,59 +62,20 @@ void op::v1::Convolution::validate_and_infer_types() filters_et, ")."); - if (m_strides.size() == 0) - { - m_strides = conv_default_strides(this, data_batch_shape, filters_shape); - } - - if (m_dilations.size() == 0) - { - m_dilations = conv_default_strides(this, data_batch_shape, filters_shape); - } - - if (m_pads_begin.size() == 0 || m_auto_pad == PadType::VALID) - { - m_pads_begin = conv_default_padding(this, data_batch_shape, filters_shape); - } - - if (m_pads_end.size() == 0 || m_auto_pad == PadType::VALID) - { - m_pads_end = conv_default_padding(this, data_batch_shape, filters_shape); - } - - if (m_auto_pad == PadType::SAME_UPPER || m_auto_pad == PadType::SAME_LOWER) - { - bool auto_padding_applied = false; - if (filters_shape.is_static()) - { - m_pads_begin.clear(); - m_pads_end.clear(); - auto filter_shape = filters_shape.to_shape(); - filter_shape.erase(filter_shape.begin(), filter_shape.begin() + 2); // Remove {O,I} - auto_padding_applied = try_apply_auto_padding(data_batch_shape, - filter_shape, - m_strides, - m_dilations, - m_auto_pad, - m_pads_end, - m_pads_begin); - } - if (!auto_padding_applied) - { - set_output_type(0, result_et, result_shape); - return; - } - } - - result_shape = infer_convolution_forward(this, - data_batch_shape, - Strides(m_strides.size(), 1), // dummy data dilations - m_pads_begin, - m_pads_end, - filters_shape, - m_strides, - m_dilations); - + NODE_VALIDATION_CHECK(this, + result_et.is_real() || result_et.is_integral_number(), + "Element types must be numeric. 
Got: ", + result_et); + + PartialShape result_shape = + validate_and_infer_convolution_forward_output_shape(this, + data_batch_pshape, + filters_pshape, + m_auto_pad, + m_strides, + m_dilations, + m_pads_begin, + m_pads_end); set_output_type(0, result_et, result_shape); } diff --git a/ngraph/core/src/op/deformable_psroi_pooling.cpp b/ngraph/core/src/op/deformable_psroi_pooling.cpp index d1220c932e65c2..01fd57610d96b0 100644 --- a/ngraph/core/src/op/deformable_psroi_pooling.cpp +++ b/ngraph/core/src/op/deformable_psroi_pooling.cpp @@ -80,32 +80,34 @@ void op::v1::DeformablePSROIPooling::validate_and_infer_types() const auto& box_coords_pshape = get_input_partial_shape(1); NODE_VALIDATION_CHECK(this, - input_pshape.rank().is_dynamic() || input_pshape.rank().get_length() == 4, - "Feature map input rank must equal to 4 (input rank: ", - input_pshape.rank().get_length(), + input_pshape.rank().compatible(4), + "First input rank must be compatible with 4 (input rank: ", + input_pshape.rank(), ")"); NODE_VALIDATION_CHECK(this, - box_coords_pshape.rank().is_dynamic() || - box_coords_pshape.rank().get_length() == 2, - "Box coordinates input rank must equal to 2 (input rank: ", - box_coords_pshape.rank().get_length(), + box_coords_pshape.rank().compatible(2), + "Second input rank must be compatible with 2 (input rank: ", + box_coords_pshape.rank(), ")"); if (get_input_size() == 3) // offsets input is provided { const auto& offsets_pshape = get_input_partial_shape(2); NODE_VALIDATION_CHECK(this, - offsets_pshape.rank().is_dynamic() || - offsets_pshape.rank().get_length() == 4, - "Offsets input rank must equal to 4 (input rank: ", - offsets_pshape.rank().get_length(), + offsets_pshape.rank().compatible(4), + "Third input rank must be compatible with 4 (input rank: ", + offsets_pshape.rank(), ")"); } + + NODE_VALIDATION_CHECK( + this, m_group_size > 0, "Value of `group_size` attribute has to be greater than 0 "); + int64_t output_rank = 4; std::vector output_dim_vec(output_rank, Dimension::dynamic()); - if (box_coords_pshape[0].is_static()) + if (box_coords_pshape.rank().is_static()) { - output_dim_vec[0] = box_coords_pshape.to_shape()[0]; + output_dim_vec[0] = box_coords_pshape[0]; // Number of ROIs } output_dim_vec[1] = m_output_dim; for (int i = 2; i < output_rank; ++i) diff --git a/ngraph/core/src/op/gather.cpp b/ngraph/core/src/op/gather.cpp index 7e1c5295b79394..df1be923b2e8f3 100644 --- a/ngraph/core/src/op/gather.cpp +++ b/ngraph/core/src/op/gather.cpp @@ -126,24 +126,208 @@ shared_ptr op::v1::Gather::clone_with_new_inputs(const OutputVector& new_a return make_shared(new_args.at(PARAMS), new_args.at(INDICES), new_args.at(AXIS)); } +NGRAPH_RTTI_DEFINITION(op::v7::Gather, "Gather", 7); + +op::v7::Gather::Gather(const Output& data, + const Output& indices, + const Output& axis, + const int64_t batch_dims) + : Op({data, indices, axis}) + , m_batch_dims(batch_dims) +{ + constructor_validate_and_infer_types(); +} + +bool ngraph::op::v7::Gather::visit_attributes(AttributeVisitor& visitor) +{ + NGRAPH_OP_SCOPE(v7_Gather_visit_attributes); + visitor.on_attribute("batch_dims", m_batch_dims); + return true; +} + +void op::v7::Gather::validate_and_infer_types() +{ + NGRAPH_OP_SCOPE(v7_Gather_validate_and_infer_types); + const auto& data_type = get_input_element_type(0); + const auto& indices_type = get_input_element_type(1); + + NODE_VALIDATION_CHECK(this, + indices_type == element::Type_t::i32 || + indices_type == element::Type_t::i64, + "indices must be of int32 or int64 type. 
But instead got: ", + indices_type); + + const auto& data_pshape = get_input_partial_shape(0); + const auto& indices_pshape = get_input_partial_shape(1); + const auto& axis_pshape = get_input_partial_shape(2); + auto data_rank = data_pshape.rank(); + auto indices_rank = indices_pshape.rank(); + auto axis_rank = axis_pshape.rank(); + + if (axis_rank.is_static() && axis_pshape.is_static()) + { + const auto axis_is_scalar = axis_rank.get_length() == 0; + const auto axis_has_one_elem = + axis_rank.get_length() == 1 && axis_pshape[0].get_length() == 1; + NODE_VALIDATION_CHECK( + this, + axis_is_scalar || axis_has_one_elem, + "Axes input must be scalar or have 1 element. But instead got axis_shape = ", + axis_pshape); + } + + int64_t batch_dims = get_batch_dims(); // will not be converted to positive if axis is not set + if (is_axis_set()) + { + int64_t axis = get_axis(); + NODE_VALIDATION_CHECK(this, + batch_dims <= axis, + "The batch_dims <= axis. But instead got: batch_dims = ", + batch_dims, + ", axis = ", + axis); + + if (data_rank.is_static()) + { + NODE_VALIDATION_CHECK(this, + axis >= 0 && axis < data_rank.get_length(), + "The axis must be => 0 and < data_rank. But instead got axis = ", + axis, + " data_rank = ", + data_rank.get_length()); + } + } + + if (indices_rank.is_static() && batch_dims >= 0) + { + NODE_VALIDATION_CHECK( + this, + batch_dims <= indices_rank.get_length(), + "The batch_dims must be <= indices_rank. But instead got: batch_dims = ", + batch_dims, + ", indices_rank = ", + indices_rank.get_length()); + } + + if (data_rank.is_static() && indices_rank.is_static()) + { + if (batch_dims >= 0) + { + auto out_rank = data_rank.get_length() + indices_rank.get_length() - 1 - batch_dims; + PartialShape output_pshape = PartialShape::dynamic(out_rank); + + // implementation of out_shape formula + // data.shape[:batch_dims] + data.shape[batch_dims:axis] + indices.shape[batch_dims:] + + // data.shape[axis + 1:] + int i = 0; + for (; i < batch_dims; i++) + { + NODE_VALIDATION_CHECK(this, + data_pshape[i].compatible(indices_pshape[i]), + "Shapes ", + data_pshape, + " and ", + indices_pshape, + " are not consistent. 
data and indices must have equal or " + "intersecting sizes until batch_dims"); + + output_pshape[i] = data_pshape[i] & indices_pshape[i]; + } + + if (is_axis_set()) + { + int64_t axis = get_axis(); + for (; i < axis; i++) + { + output_pshape[i] = data_pshape[i]; + } + for (; i < axis + indices_rank.get_length() - batch_dims; i++) + { + output_pshape[i] = indices_pshape[batch_dims - axis + i]; + } + for (; i < out_rank; i++) + { + output_pshape[i] = data_pshape[batch_dims + 1 - indices_rank.get_length() + i]; + } + } + + set_output_type(0, data_type, output_pshape); + } + else if (batch_dims < 0) + { + // batch_dims < 0 could be only if axis is not set + // as soon as axis value will arrive negative batch_dims should be resolved + // batch_dims value will be within [0, data_rank] && [0, indices_rank] + int64_t max_rank = data_rank.get_length() + indices_rank.get_length() - 1; + int64_t min_rank = max_rank - max(data_rank.get_length(), indices_rank.get_length()); + + set_output_type(0, data_type, PartialShape::dynamic(Dimension(min_rank, max_rank))); + } + } + else + { + set_output_type(0, data_type, PartialShape::dynamic()); + } +} + +int64_t op::v7::Gather::get_axis() const +{ + const auto& const_op = get_constant_from_source(input_value(2)); + int64_t axis = const_op->cast_vector()[0]; + if (axis < 0) + { + const auto& data_rank = get_input_partial_shape(0).rank(); + if (data_rank.is_static()) + { + axis += data_rank.get_length(); + } + } + return axis; +} + +int64_t op::v7::Gather::get_batch_dims() const +{ + if (m_batch_dims < 0 && is_axis_set()) + return get_axis() + m_batch_dims; + else + return m_batch_dims; +} + +bool op::v7::Gather::is_axis_set() const +{ + const auto& axes_constant = get_constant_from_source(input_value(2)); + if (axes_constant) + return true; + else + return false; +} + +shared_ptr op::v7::Gather::clone_with_new_inputs(const OutputVector& new_args) const +{ + NGRAPH_OP_SCOPE(v7_Gather_clone_with_new_inputs); + check_new_args_count(this, new_args); + return make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_batch_dims); +} + namespace gather { template bool evaluate(const HostTensorPtr& arg0, const HostTensorPtr& arg1, const HostTensorPtr& out, - size_t axis) + size_t axis, + size_t batch_dims) { using T = typename element_type_traits::value_type; Shape params_shape = arg0->get_shape(); Shape indices_shape = arg1->get_shape(); - Shape out_shape(params_shape.size() + indices_shape.size() - 1); + Shape out_shape(params_shape.size() + indices_shape.size() - 1 - batch_dims); uint64_t i = 0; for (; i < axis; i++) { out_shape[i] = params_shape[i]; } - for (uint64_t j = 0; j < indices_shape.size(); i++, j++) + for (uint64_t j = batch_dims; j < indices_shape.size(); i++, j++) { out_shape[i] = indices_shape[j]; } @@ -162,7 +346,8 @@ namespace gather arg0->get_shape(), arg1->get_shape(), out->get_shape(), - axis); + axis, + batch_dims); } else if (arg1->get_element_type() == element::i32) { @@ -172,7 +357,8 @@ namespace gather arg0->get_shape(), arg1->get_shape(), out->get_shape(), - axis); + axis, + batch_dims); } else { @@ -185,19 +371,20 @@ namespace gather bool evaluate_gather(const HostTensorPtr& arg0, const HostTensorPtr& arg1, const HostTensorPtr& out, - size_t axis) + size_t axis, + size_t batch_dims = 0) { bool rc = true; switch (out->get_element_type()) { - NGRAPH_TYPE_CASE(evaluate_gather, i32, arg0, arg1, out, axis); - NGRAPH_TYPE_CASE(evaluate_gather, i64, arg0, arg1, out, axis); - NGRAPH_TYPE_CASE(evaluate_gather, u32, arg0, arg1, out, axis); - 
NGRAPH_TYPE_CASE(evaluate_gather, u64, arg0, arg1, out, axis); - NGRAPH_TYPE_CASE(evaluate_gather, f16, arg0, arg1, out, axis); - NGRAPH_TYPE_CASE(evaluate_gather, f32, arg0, arg1, out, axis); - NGRAPH_TYPE_CASE(evaluate_gather, boolean, arg0, arg1, out, axis); + NGRAPH_TYPE_CASE(evaluate_gather, i32, arg0, arg1, out, axis, batch_dims); + NGRAPH_TYPE_CASE(evaluate_gather, i64, arg0, arg1, out, axis, batch_dims); + NGRAPH_TYPE_CASE(evaluate_gather, u32, arg0, arg1, out, axis, batch_dims); + NGRAPH_TYPE_CASE(evaluate_gather, u64, arg0, arg1, out, axis, batch_dims); + NGRAPH_TYPE_CASE(evaluate_gather, f16, arg0, arg1, out, axis, batch_dims); + NGRAPH_TYPE_CASE(evaluate_gather, f32, arg0, arg1, out, axis, batch_dims); + NGRAPH_TYPE_CASE(evaluate_gather, boolean, arg0, arg1, out, axis, batch_dims); default: rc = false; break; } return rc; @@ -335,3 +522,63 @@ bool op::v1::Gather::constant_fold(OutputVector& output_values, const OutputVect output_values, input_values, get_output_partial_shape(0)); } } + +bool op::v7::Gather::evaluate_gather(const HostTensorVector& outputs, + const HostTensorVector& inputs) const +{ + int64_t axis = 0; + switch (inputs[2]->get_element_type()) + { + case element::Type_t::i32: axis = inputs[2]->get_data_ptr()[0]; break; + case element::Type_t::i64: axis = inputs[2]->get_data_ptr()[0]; break; + default: throw ngraph_error("axis must be of int32 or int64 type."); + } + + if (axis < 0) + { + const auto& input_rank = get_input_partial_shape(0).rank(); + if (input_rank.is_static()) + { + axis += input_rank.get_length(); + } + } + return gather::evaluate_gather(inputs[0], inputs[1], outputs[0], axis, get_batch_dims()); +} + +bool op::v7::Gather::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const +{ + NGRAPH_OP_SCOPE(v7_Gather_evaluate); + NGRAPH_CHECK(this, validate_host_tensor_vector(inputs, 3)); + NGRAPH_CHECK(this, validate_host_tensor_vector(outputs, 1)); + return evaluate_gather(outputs, inputs); +} + +bool op::v7::Gather::evaluate_lower(const HostTensorVector& output_values) const +{ + if (!input_value(1).get_tensor().has_and_set_bound() || + !input_value(2).get_tensor().has_and_set_bound()) + return false; + return default_lower_bound_evaluator(this, output_values); +} + +bool op::v7::Gather::evaluate_upper(const HostTensorVector& output_values) const +{ + if (!input_value(1).get_tensor().has_and_set_bound() || + !input_value(2).get_tensor().has_and_set_bound()) + return false; + return default_upper_bound_evaluator(this, output_values); +} + +bool op::v7::Gather::constant_fold(OutputVector& output_values, const OutputVector& input_values) +{ + // try the regular constant folding just for the Gather node + if (Node::constant_fold(output_values, input_values)) + { + return true; + } + else + { + return gather::cf_gather_with_subgraph( + output_values, input_values, get_output_partial_shape(0)); + } +} diff --git a/ngraph/core/src/op/gather_elements.cpp b/ngraph/core/src/op/gather_elements.cpp index d86a2a97472489..def1721144e6c6 100644 --- a/ngraph/core/src/op/gather_elements.cpp +++ b/ngraph/core/src/op/gather_elements.cpp @@ -87,13 +87,12 @@ void op::v6::GatherElements::validate_and_infer_types() { if (i != axis) { - // if size of the current axis of indices is unknown it will retrieve it from data - // e.g., if data_shape = {4, 4, ?} indices_shape = {1, ?, 5} and axis = 0 + // if size of the current dimension of indices is unknown it will be retrieved from data + // e.g., if data_shape = {4, 4, ?}, indices_shape = {1, ?, 5} and axis 
= 0 // (and if intervals intersect) then output_pshape will be {1, 4, 5} - Dimension curr_dim = data_pshape[i] & indices_pshape[i]; NODE_VALIDATION_CHECK(this, - !curr_dim.get_interval().empty(), + data_pshape[i].compatible(indices_pshape[i]), "Shapes ", data_pshape, " and ", @@ -102,7 +101,7 @@ void op::v6::GatherElements::validate_and_infer_types() "intersecting sizes, except for axis ", m_axis); - output_pshape[i] = curr_dim; + output_pshape[i] = data_pshape[i] & indices_pshape[i]; } } set_output_type(0, data_type, output_pshape); diff --git a/ngraph/core/src/op/group_conv.cpp b/ngraph/core/src/op/group_conv.cpp index 97b16c8b0b51fc..6831ec6dff9ffb 100644 --- a/ngraph/core/src/op/group_conv.cpp +++ b/ngraph/core/src/op/group_conv.cpp @@ -56,16 +56,32 @@ bool ngraph::op::v1::GroupConvolution::visit_attributes(AttributeVisitor& visito return true; } +static Dimension infer_group_from_input_shapes(const PartialShape& data_pshape, + const PartialShape& filters_pshape) +{ + Dimension group_dim = Dimension(); + if (data_pshape.rank().is_static() && data_pshape[1].is_static() && + filters_pshape.rank().is_static() && filters_pshape[2].is_static()) + { + auto n_data_channels = data_pshape[1].get_length(); + auto input_channels = filters_pshape[2].get_length(); + + NGRAPH_CHECK((n_data_channels % input_channels) == 0); + auto groups = n_data_channels / input_channels; + group_dim = Dimension(groups); + } + return group_dim; +} + void op::v1::GroupConvolution::validate_and_infer_types() { NGRAPH_OP_SCOPE(v1_GroupConvolution_validate_and_infer_types); - PartialShape data_batch_shape = get_input_partial_shape(0); - PartialShape filters_shape = get_input_partial_shape(1); + PartialShape data_batch_pshape = get_input_partial_shape(0); + PartialShape filters_pshape = get_input_partial_shape(1); element::Type data_batch_et = get_input_element_type(0); element::Type filters_et = get_input_element_type(1); element::Type result_et; - NODE_VALIDATION_CHECK( this, element::Type::merge(result_et, data_batch_et, filters_et), @@ -75,90 +91,181 @@ void op::v1::GroupConvolution::validate_and_infer_types() filters_et, ")."); - PartialShape result_shape{PartialShape::dynamic()}; + NODE_VALIDATION_CHECK(this, + result_et.is_real() || result_et.is_integral_number(), + "Element type of inputs must be numeric. Got: ", + result_et); - if (data_batch_shape.rank().is_static()) - { - result_shape = - std::vector(data_batch_shape.rank().get_length(), Dimension::dynamic()); - result_shape[0] = data_batch_shape[0]; - } + NODE_VALIDATION_CHECK( + this, + (data_batch_pshape.rank().compatible(5) && filters_pshape.rank().compatible(6)) || + (data_batch_pshape.rank().compatible(4) && filters_pshape.rank().compatible(5)) || + (data_batch_pshape.rank().compatible(3) && filters_pshape.rank().compatible(4)), + "Shapes for data batch and filters do not match. 
(data batch shape: ", + data_batch_pshape, + ", filters shape: ", + filters_pshape, + ")."); - Dimension groups(1); - // we need to adjust filters_shape to reuse helpers for normal convolution - if (filters_shape.rank().is_static() && filters_shape.rank().get_length() > 2) + PartialShape result_shape{PartialShape::dynamic()}; + if (data_batch_pshape.rank().is_static() || filters_pshape.rank().is_static()) { - groups = filters_shape[0]; - filters_shape[1] *= groups; - auto dim_vec = static_cast>(filters_shape); - dim_vec.erase(dim_vec.begin()); - filters_shape = PartialShape(dim_vec); - if (data_batch_shape.rank().is_static()) + const bool is_data_batch_ps_static = data_batch_pshape.rank().is_static(); + const auto output_ps_rank = is_data_batch_ps_static + ? data_batch_pshape.rank().get_length() + : filters_pshape.rank().get_length() - 1; + const auto num_spatial_dims = output_ps_rank - 2; + + if (m_strides.size() == 0) { - result_shape[1] = filters_shape[0]; + m_strides = Strides(num_spatial_dims, 1); } - } - if (data_batch_shape.rank().is_static() && data_batch_shape.rank().get_length() > 2 && - data_batch_shape[1].is_static() && groups.is_static()) - { - data_batch_shape[1] = Dimension(data_batch_shape[1].get_length() / groups.get_length()); - } + if (m_dilations.size() == 0) + { + m_dilations = Strides(num_spatial_dims, 1); + } - if (m_strides.size() == 0) - { - m_strides = conv_default_strides(this, data_batch_shape, filters_shape); - } + if (m_pads_begin.size() == 0 || m_auto_pad == PadType::VALID) + { + m_pads_begin = CoordinateDiff(num_spatial_dims, 0); + } - if (m_dilations.size() == 0) - { - m_dilations = conv_default_strides(this, data_batch_shape, filters_shape); - } + if (m_pads_end.size() == 0 || m_auto_pad == PadType::VALID) + { + m_pads_end = CoordinateDiff(num_spatial_dims, 0); + } - if (m_pads_begin.size() == 0 || m_auto_pad == PadType::VALID) - { - m_pads_begin = conv_default_padding(this, data_batch_shape, filters_shape); - } + NODE_VALIDATION_CHECK(this, + m_strides.size() == num_spatial_dims, + "Strides should be defined for all and only spatial features."); - if (m_pads_end.size() == 0 || m_auto_pad == PadType::VALID) - { - m_pads_end = conv_default_padding(this, data_batch_shape, filters_shape); - } + NODE_VALIDATION_CHECK(this, + m_dilations.size() == num_spatial_dims, + "Dilations should be defined for all and only spatial features."); - if (m_auto_pad == PadType::SAME_UPPER || m_auto_pad == PadType::SAME_LOWER) - { - bool auto_padding_applied = false; - if (filters_shape.is_static()) + NODE_VALIDATION_CHECK(this, + m_pads_begin.size() == num_spatial_dims && + m_pads_end.size() == num_spatial_dims, + "Pads should be defined for all and only spatial features."); + + if (data_batch_pshape.rank().is_static() && filters_pshape.rank().is_static()) { - m_pads_begin.clear(); - m_pads_end.clear(); - auto filters_static_shape = filters_shape.to_shape(); - filters_static_shape.erase(filters_static_shape.begin(), - filters_static_shape.begin() + 2); // Remove {O,I} - auto_padding_applied = try_apply_auto_padding(data_batch_shape, - filters_static_shape, - m_strides, - m_dilations, - m_auto_pad, - m_pads_end, - m_pads_begin); + auto data_in_channels_dim = data_batch_pshape[1]; + if (data_in_channels_dim.is_static()) + { + auto groups_dim = filters_pshape[0]; + if (groups_dim.is_static() && filters_pshape[2].is_static()) + { + NODE_VALIDATION_CHECK( + this, + data_in_channels_dim.get_length() / groups_dim.get_length() == + filters_pshape[2].get_length(), + "Input channels 
dimension of data batch has incompatible value " + "with filter shape."); + } + else if (groups_dim.is_static()) + { + NODE_VALIDATION_CHECK( + this, + data_in_channels_dim.get_length() % groups_dim.get_length() == 0, + "Input channels dimension of data batch not a multiple of group size."); + } + } + } + + result_shape = std::vector(output_ps_rank, Dimension::dynamic()); + if (data_batch_pshape.rank().is_static()) + { + result_shape[0] = data_batch_pshape[0]; // batch size } - if (!auto_padding_applied) + if (filters_pshape.rank().is_static() && filters_pshape.rank().get_length() > 2) { - set_output_type(0, result_et, result_shape); - return; + result_shape[1] = filters_pshape[0] * filters_pshape[1]; } - } + if (m_auto_pad == PadType::SAME_UPPER || m_auto_pad == PadType::SAME_LOWER) + { + bool auto_padding_applied = false; + if (filters_pshape.rank().is_static() && filters_pshape.rank().get_length() > 2) + { + m_pads_begin.clear(); + m_pads_end.clear(); - result_shape = infer_convolution_forward(this, - data_batch_shape, - Strides(m_strides.size(), 1), // dummy data dilations - m_pads_begin, - m_pads_end, - filters_shape, - m_strides, - m_dilations); + const PartialShape filter_spatial_shape = [filters_pshape]() { + vector filter_dims{filters_pshape}; + filter_dims.erase(filter_dims.begin(), + filter_dims.begin() + 3); // Remove {GROUP, C_OUT, C_IN} + return PartialShape{filter_dims}; + }(); + + if (filter_spatial_shape.is_static()) + { + auto_padding_applied = try_apply_auto_padding(data_batch_pshape, + filter_spatial_shape.to_shape(), + m_strides, + m_dilations, + m_auto_pad, + m_pads_end, + m_pads_begin); + } + } + if (!auto_padding_applied) + { + set_output_type(0, result_et, result_shape); + return; + } + } + + // we need to adjust channels input dim to reuse helpers for regular convolution + PartialShape data_batch_ps = [&]() { + auto shape = PartialShape{data_batch_pshape}; + auto groups = filters_pshape.rank().is_static() ? filters_pshape[0] : Dimension(); + if (groups.is_dynamic()) + { + groups = infer_group_from_input_shapes(data_batch_pshape, filters_pshape); + } + if (data_batch_pshape.rank().is_static() && data_batch_pshape.rank().get_length()) + { + if (data_batch_pshape[1].is_static() && groups.is_static()) + { + shape[1] = Dimension(data_batch_pshape[1].get_length() / groups.get_length()); + } + else + { + shape[1] = Dimension(); + } + } + return shape; + }(); + + // we need to adjust filters shape to reuse helpers for regular convolution + PartialShape filters_ps = [&]() { + auto shape = PartialShape{filters_pshape}; + if (shape.rank().is_static() && shape.rank().get_length() > 2) + { + auto groups = filters_pshape.rank().is_static() ? 
filters_pshape[0] : Dimension(); + if (groups.is_dynamic()) + { + groups = infer_group_from_input_shapes(data_batch_pshape, filters_pshape); + } + shape[1] = groups * shape[1]; + vector dim_vec{shape}; + dim_vec.erase(dim_vec.begin()); + shape = PartialShape{dim_vec}; + } + return shape; + }(); + result_shape = + infer_convolution_forward(this, + data_batch_ps, + Strides(m_strides.size(), 1), // dummy data dilations + m_pads_begin, + m_pads_end, + filters_ps, + m_strides, + m_dilations); + } set_output_type(0, result_et, result_shape); } @@ -275,8 +382,8 @@ bool op::v1::GroupConvolutionBackpropData::is_dynamic() const return is_dynamic; } -static Dimension infer_group_from_input_shapes(const PartialShape& data_pshape, - const PartialShape& filters_pshape) +static Dimension infer_backprop_group_from_input_shapes(const PartialShape& data_pshape, + const PartialShape& filters_pshape) { Dimension group_dim = Dimension(); if (data_pshape.rank().is_static() && data_pshape[1].is_static() && @@ -298,33 +405,25 @@ const PartialShape op::v1::GroupConvolutionBackpropData::get_convolution_output_ auto filter_pshape = get_input_partial_shape(1); PartialShape shape; + if (inputs().size() == 3) + { + if (const auto& const_op = get_constant_from_source(input_value(2))) + { + return PartialShape{const_op->get_shape_val()}; + } + } + if (data_pshape.rank().is_static()) { shape = PartialShape{vector(data_pshape.rank().get_length() - 2)}; } - else + else if (filter_pshape.rank().is_static()) { - shape = PartialShape{vector(m_strides.size())}; + shape = PartialShape{vector(filter_pshape.rank().get_length() - 3)}; } - bool is_output_shape_present = inputs().size() == 3; - if (is_output_shape_present) + else { - if (const auto& const_op = get_constant_from_source(input_value(2))) - { - shape = const_op->get_shape_val(); - } - else if (data_pshape.rank().is_static()) - { - shape = PartialShape{vector(data_pshape.rank().get_length() - 2)}; - } - else if (filter_pshape.rank().is_static()) - { - shape = PartialShape{vector(data_pshape.rank().get_length() - 3)}; - } - else - { - shape = PartialShape::dynamic(); - } + shape = PartialShape::dynamic(); } return shape; } @@ -369,10 +468,10 @@ void op::v1::GroupConvolutionBackpropData::infer_conv_backprop_output_spatial_sh void op::v1::GroupConvolutionBackpropData::validate_and_infer_types() { - const auto& data_pshape = get_input_partial_shape(0); + NGRAPH_OP_SCOPE(v1_GroupConvolutionBackpropData_validate_and_infer_types); + const PartialShape& data_pshape = get_input_partial_shape(0); element::Type data_et = get_input_element_type(0); - - const auto& filters_pshape = get_input_partial_shape(1); + const PartialShape& filters_pshape = get_input_partial_shape(1); element::Type filters_et = get_input_element_type(1); element::Type result_et; @@ -385,6 +484,11 @@ void op::v1::GroupConvolutionBackpropData::validate_and_infer_types() filters_et, ")."); + NODE_VALIDATION_CHECK(this, + result_et.is_real() || result_et.is_integral_number(), + "Element type of inputs must be numeric. 
Got: ", + result_et); + NODE_VALIDATION_CHECK( this, (data_pshape.rank().compatible(5) && filters_pshape.rank().compatible(6)) || @@ -396,51 +500,55 @@ void op::v1::GroupConvolutionBackpropData::validate_and_infer_types() filters_pshape, ")."); - if (m_pads_begin.size() == 0) - { - m_pads_begin = conv_default_padding(this, data_pshape, filters_pshape); - } - if (m_pads_end.size() == 0) - { - m_pads_end = conv_default_padding(this, data_pshape, filters_pshape); - } - if (m_output_padding.size() == 0) - { - m_output_padding = conv_default_padding(this, data_pshape, filters_pshape); - } - if (m_strides.size() == 0) - { - m_strides = conv_default_strides(this, data_pshape, filters_pshape); - } - if (m_dilations.size() == 0) + bool is_output_shape_present = inputs().size() == 3; + if (is_output_shape_present) { - m_dilations = conv_default_strides(this, data_pshape, filters_pshape); + const PartialShape& output_shape_pshape = get_input_partial_shape(2); + const element::Type output_shape_et = get_input_element_type(2); + + NODE_VALIDATION_CHECK(this, + output_shape_et.is_integral_number(), + "Element type for output shape should be of integer type ", + "(output_shape element type: ", + output_shape_et, + ")."); + + NODE_VALIDATION_CHECK(this, + output_shape_pshape.rank().compatible(1), + "Spatial shape of output input must be of rank 1 ", + "(output_shape shape: ", + output_shape_pshape, + ")."); } + PartialShape output_spatial_pshape = get_convolution_output_shape(); - if (data_pshape.rank().is_static() && filters_pshape.rank().is_static()) + if (data_pshape.rank().is_static() || filters_pshape.rank().is_static()) { - if (filters_pshape[0].is_static() && filters_pshape[1].is_static() && - data_pshape[1].is_static()) - { - auto groups = filters_pshape[0].get_length(); - auto input_channels = filters_pshape[1].get_length(); - auto n_data_channels = data_pshape[1].get_length(); + const bool is_data_ps_static = data_pshape.rank().is_static(); + const auto output_ps_rank = is_data_ps_static ? 
data_pshape.rank().get_length() + : filters_pshape.rank().get_length() - 1; + const auto num_spatial_dims = output_ps_rank - 2; - NODE_VALIDATION_CHECK(this, - n_data_channels % groups == 0, - "Number of data channels not a multiple of group size."); - NODE_VALIDATION_CHECK(this, - n_data_channels / groups == input_channels, - "Data second dimension has incompatible value " - "with number of input channels."); + if (m_strides.size() == 0) + { + m_strides = Strides(num_spatial_dims, 1); + } + if (m_dilations.size() == 0) + { + m_dilations = Strides(num_spatial_dims, 1); + } + if (m_pads_begin.size() == 0 || m_auto_pad == PadType::VALID) + { + m_pads_begin = CoordinateDiff(num_spatial_dims, 0); + } + if (m_pads_end.size() == 0 || m_auto_pad == PadType::VALID) + { + m_pads_end = CoordinateDiff(num_spatial_dims, 0); + } + if (m_output_padding.size() == 0) + { + m_output_padding = CoordinateDiff(num_spatial_dims, 0); } - - const auto num_spatial_dims = data_pshape.rank().get_length() - 2; - - NODE_VALIDATION_CHECK(this, - m_pads_begin.size() == num_spatial_dims && - m_pads_end.size() == num_spatial_dims, - "Pads should be defined for all and only spatial features."); NODE_VALIDATION_CHECK(this, m_strides.size() == num_spatial_dims, @@ -450,91 +558,102 @@ void op::v1::GroupConvolutionBackpropData::validate_and_infer_types() m_dilations.size() == num_spatial_dims, "Dilations should be defined for all and only spatial features."); + NODE_VALIDATION_CHECK(this, + m_pads_begin.size() == num_spatial_dims && + m_pads_end.size() == num_spatial_dims, + "Pads should be defined for all and only spatial features."); + NODE_VALIDATION_CHECK(this, m_output_padding.size() == num_spatial_dims, "Output padding should be defined for all and only " "spatial features."); - } - bool is_output_shape_present = inputs().size() == 3; - PartialShape output_pshape; + if (data_pshape.rank().is_static() && filters_pshape.rank().is_static()) + { + if (filters_pshape[0].is_static() && filters_pshape[1].is_static() && + data_pshape[1].is_static()) + { + auto groups = filters_pshape[0].get_length(); + auto input_channels = filters_pshape[1].get_length(); + auto n_data_channels = data_pshape[1].get_length(); + + NODE_VALIDATION_CHECK(this, + n_data_channels % groups == 0, + "Number of data channels not a multiple of group size."); + NODE_VALIDATION_CHECK(this, + n_data_channels / groups == input_channels, + "Data second dimension has incompatible value " + "with number of input channels."); + } + } + if (is_output_shape_present && output_spatial_pshape.is_static()) + { + Shape output_shape = output_spatial_pshape.to_shape(); + NODE_VALIDATION_CHECK(this, + output_shape.size() == num_spatial_dims, + "Output shape should be specified only and for " + "all spatial dimensions."); + } + } + + PartialShape result_pshape{PartialShape::dynamic()}; // If output shape is provided, ignore current values for padding begin/end // and infer them. 
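// [editor's note -- illustration only, not part of this patch] A worked example of the
// spatial shape inference performed further below when no explicit output-shape input is
// given, assuming the usual backprop-data formula
//   out = stride * (in - 1) + dilation * (k - 1) + 1 - pads_begin - pads_end + output_padding
// data    {1, 4, 5, 5}    (N = 1, C_IN = 4)
// filters {2, 2, 3, 3, 3} (GROUPS = 2, C_IN / GROUPS = 2, C_OUT / GROUPS = 3, 3x3 kernel)
// strides {2, 2}, dilations {1, 1}, pads_begin {1, 1}, pads_end {1, 1}, output_padding {0, 0}
// spatial: 2 * (5 - 1) + 1 * (3 - 1) + 1 - 1 - 1 + 0 = 9
// result:  {N, GROUPS * (C_OUT / GROUPS), 9, 9} = {1, 6, 9, 9}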
if (is_output_shape_present) { - const auto& output_shape_pshape = get_input_partial_shape(2); - const element::Type output_shape_et = get_input_element_type(2); - - NODE_VALIDATION_CHECK(this, - output_shape_et.is_integral_number(), - "Element type for output shape should be of integer type ", - "(output_shape element type: ", - output_shape_et, - ")."); - - NODE_VALIDATION_CHECK(this, - output_shape_pshape.rank().compatible(1), - "Spatial shape of output input must be of rank 1 ", - "(output_shape shape: ", - output_shape_pshape, - ")."); - - output_pshape = get_convolution_output_shape(); - - if (output_pshape.rank().is_static()) + if (output_spatial_pshape.rank().is_static()) { - vector tmp_output_shape{output_pshape}; if (data_pshape.rank().is_static() && filters_pshape.rank().is_static()) { - const size_t num_spatial_dims = data_pshape.rank().get_length() - 2; - NODE_VALIDATION_CHECK(this, - output_pshape.rank().get_length() == num_spatial_dims, - "Output shape should be specified only and for " - "all spatial dimensions."); + const PartialShape data_spatial_shape = [data_pshape]() { + vector data_dims{data_pshape}; + data_dims.erase(data_dims.begin(), data_dims.begin() + 2); // remove {N, C_IN} + return PartialShape{data_dims}; + }(); + + const PartialShape filters_spatial_shape = [filters_pshape]() { + vector filters_dims{filters_pshape}; + filters_dims.erase(filters_dims.begin(), + filters_dims.begin() + 3); // remove {GROUPS, C_OUT, C_IN} + return PartialShape{filters_dims}; + }(); // If auto_pad has one of following mode we infer paddings. Otherwise in // EXPLICIT auto_pad mode we use what is provided. - if ((output_pshape.is_static() && data_pshape.is_static() && - filters_pshape.is_static()) && - (m_auto_pad == PadType::SAME_UPPER || m_auto_pad == PadType::SAME_LOWER)) + if ((m_auto_pad == PadType::SAME_UPPER || m_auto_pad == PadType::SAME_LOWER) && + (data_spatial_shape.is_static() && filters_spatial_shape.is_static() && + output_spatial_pshape.is_static())) { - const Shape& data_shape = data_pshape.to_shape(); - const Shape& filters_shape = filters_pshape.to_shape(); - - opset1::infer_conv_backprop_auto_padding( - Shape{std::next(data_shape.begin(), 2), std::end(data_shape)}, - Shape{std::next(filters_shape.begin(), 3), std::end(filters_shape)}, - output_pshape.to_shape(), - m_strides, - m_dilations, - m_auto_pad, - m_output_padding, - m_pads_begin, - m_pads_end); + opset1::infer_conv_backprop_auto_padding(data_spatial_shape.to_shape(), + filters_spatial_shape.to_shape(), + output_spatial_pshape.to_shape(), + m_strides, + m_dilations, + m_auto_pad, + m_output_padding, + m_pads_begin, + m_pads_end); } + } - // GROUP * C_OUTPUT + vector output_pshape{output_spatial_pshape}; + // GROUPS * C_OUT + auto n_out_channels = Dimension::dynamic(); + if (filters_pshape.rank().is_static()) + { auto group_dim = filters_pshape[0]; if (!group_dim.is_static()) { - group_dim = infer_group_from_input_shapes(data_pshape, filters_pshape); + group_dim = infer_backprop_group_from_input_shapes(data_pshape, filters_pshape); } - tmp_output_shape.insert(tmp_output_shape.begin(), group_dim * filters_pshape[2]); - // N - tmp_output_shape.insert(tmp_output_shape.begin(), data_pshape[0]); - } - else - { - auto n_out_channels = filters_pshape.rank().is_static() - ? filters_pshape[0] * filters_pshape[2] - : Dimension::dynamic(); - auto batches = - data_pshape.rank().is_static() ? 
data_pshape[0] : Dimension::dynamic(); - tmp_output_shape.insert(tmp_output_shape.begin(), n_out_channels); - tmp_output_shape.insert(tmp_output_shape.begin(), batches); + n_out_channels = group_dim * filters_pshape[2]; } - output_pshape = tmp_output_shape; + output_pshape.insert(output_pshape.begin(), n_out_channels); + // N + auto batches = data_pshape.rank().is_static() ? data_pshape[0] : Dimension::dynamic(); + output_pshape.insert(output_pshape.begin(), batches); + result_pshape = PartialShape{output_pshape}; } set_input_is_relevant_to_shape(2); } @@ -549,53 +668,57 @@ void op::v1::GroupConvolutionBackpropData::validate_and_infer_types() m_pads_end.assign(m_pads_end.size(), 0); } + vector output_pshape; if (data_pshape.rank().is_static() && filters_pshape.rank().is_static()) { - vector data_shape{data_pshape}, filters_shape{filters_pshape}, output_shape; - - infer_conv_backprop_output_spatial_shape( - vector{std::next(data_shape.begin(), 2), std::end(data_shape)}, - vector{std::next(filters_shape.begin(), 3), std::end(filters_shape)}, - m_strides, - m_dilations, - m_pads_begin, - m_pads_end, - m_output_padding, - output_shape); - - // GROUP * C_OUTPUT - auto group_dim = filters_pshape[0]; - if (!group_dim.is_static()) - { - group_dim = infer_group_from_input_shapes(data_pshape, filters_pshape); - } - output_shape.insert(output_shape.begin(), group_dim * filters_shape.at(2)); - // N - output_shape.insert(output_shape.begin(), data_shape.at(0)); - output_pshape = PartialShape{output_shape}; + auto data_spatial_shape = [data_pshape]() { + vector data_dims{data_pshape}; + return vector{std::next(data_dims.begin(), 2), std::end(data_dims)}; + }(); + + auto filters_spatial_shape = [filters_pshape]() { + vector filters_dims{filters_pshape}; + return vector{std::next(filters_dims.begin(), 3), + std::end(filters_dims)}; + }(); + + infer_conv_backprop_output_spatial_shape(data_spatial_shape, + filters_spatial_shape, + m_strides, + m_dilations, + m_pads_begin, + m_pads_end, + m_output_padding, + output_pshape); } else { - if (data_pshape.rank().is_static()) - { - output_pshape = PartialShape::dynamic(data_pshape.rank()); - output_pshape[0] = data_pshape[0]; - } - else if (filters_pshape.rank().is_static()) - { - output_pshape = PartialShape::dynamic(filters_pshape.rank().get_length() - 1); - output_pshape[1] = filters_pshape[0] * filters_pshape[2]; - } - else + output_pshape = vector{output_spatial_pshape}; + } + + if (output_pshape.size()) + { + // GROUPS * C_OUT + auto n_out_channels = Dimension::dynamic(); + if (filters_pshape.rank().is_static()) { - output_pshape = PartialShape::dynamic(); + auto group_dim = filters_pshape[0]; + if (!group_dim.is_static()) + { + group_dim = infer_backprop_group_from_input_shapes(data_pshape, filters_pshape); + } + n_out_channels = group_dim * filters_pshape[2]; } + output_pshape.insert(output_pshape.begin(), n_out_channels); + // N + auto batches = data_pshape.rank().is_static() ? 
data_pshape[0] : Dimension::dynamic(); + output_pshape.insert(output_pshape.begin(), batches); + result_pshape = PartialShape{output_pshape}; } } - set_input_is_relevant_to_shape(0); set_input_is_relevant_to_shape(1); - set_output_type(0, result_et, output_pshape); + set_output_type(0, result_et, result_pshape); } shared_ptr diff --git a/ngraph/core/src/op/max.cpp b/ngraph/core/src/op/max.cpp index 063d55c804fe7a..493810edbac75f 100644 --- a/ngraph/core/src/op/max.cpp +++ b/ngraph/core/src/op/max.cpp @@ -46,7 +46,7 @@ namespace maxop } } -constexpr NodeTypeInfo op::v1::ReduceMax::type_info; +NGRAPH_RTTI_DEFINITION(op::v1::ReduceMax, "ReduceMax", 1, util::ArithmeticReductionKeepDims); op::v1::ReduceMax::ReduceMax(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/min.cpp b/ngraph/core/src/op/min.cpp index 25c41d1766cbdf..300bd8add3cf4e 100644 --- a/ngraph/core/src/op/min.cpp +++ b/ngraph/core/src/op/min.cpp @@ -46,7 +46,7 @@ namespace minop } } // namespace minop -constexpr NodeTypeInfo op::v1::ReduceMin::type_info; +NGRAPH_RTTI_DEFINITION(op::v1::ReduceMin, "ReduceMin", 1, util::ArithmeticReductionKeepDims); op::v1::ReduceMin::ReduceMin(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/mod.cpp b/ngraph/core/src/op/mod.cpp index da329c7789ea82..8f3703c829fbf4 100644 --- a/ngraph/core/src/op/mod.cpp +++ b/ngraph/core/src/op/mod.cpp @@ -4,66 +4,25 @@ #include "ngraph/op/mod.hpp" #include "itt.hpp" -#include "ngraph/attribute_visitor.hpp" -#include "ngraph/builder/make_constant.hpp" -#include "ngraph/op/abs.hpp" -#include "ngraph/op/convert.hpp" -#include "ngraph/op/divide.hpp" -#include "ngraph/op/multiply.hpp" -#include "ngraph/op/sign.hpp" -#include "ngraph/op/subtract.hpp" using namespace std; using namespace ngraph; -NGRAPH_SUPPRESS_DEPRECATED_START +// ------------------------------ v1 ------------------------------------------- constexpr NodeTypeInfo op::v1::Mod::type_info; -op::v1::Mod::Mod() - : FusedOp() - , m_auto_broadcast() -{ -} - -op::v1::Mod::Mod(const Output& A, - const Output& B, +op::v1::Mod::Mod(const Output& arg0, + const Output& arg1, const AutoBroadcastSpec& auto_broadcast) - : FusedOp({A, B}) - , m_auto_broadcast(auto_broadcast) + : BinaryElementwiseArithmetic(arg0, arg1, auto_broadcast) { constructor_validate_and_infer_types(); } -bool ngraph::op::v1::Mod::visit_attributes(AttributeVisitor& visitor) -{ - NGRAPH_OP_SCOPE(v1_Mod_visit_attributes); - visitor.on_attribute("auto_broadcast", m_auto_broadcast); - return true; -} - -OutputVector op::v1::Mod::decompose_op() const -{ - const auto dividend = make_shared(input_value(0)); - const auto dividend_sign = make_shared(input_value(0)); - const auto dividend_et = dividend->get_element_type(); - const auto divisor = make_shared(input_value(1)); - - // truncated(a / b) - auto division = make_shared( - make_shared(dividend, divisor, m_auto_broadcast), ngraph::element::i64); - division = make_shared(division, dividend_et); - // truncated(a / b) * b - const auto multiplication = make_shared(division, divisor, m_auto_broadcast); - // a mod b = a - truncated(a / b) * b - const auto mod = make_shared(dividend, multiplication, m_auto_broadcast); - - // apply sign of dividend - return {make_shared(dividend_sign, mod, m_auto_broadcast)}; -} - shared_ptr op::v1::Mod::clone_with_new_inputs(const OutputVector& new_args) const { NGRAPH_OP_SCOPE(v1_Mod_clone_with_new_inputs); - return make_shared(new_args.at(0), new_args.at(1), m_auto_broadcast); -} + check_new_args_count(this, 
new_args); + return make_shared(new_args.at(0), new_args.at(1), this->get_autob()); +} \ No newline at end of file diff --git a/ngraph/core/src/op/reduce_l1.cpp b/ngraph/core/src/op/reduce_l1.cpp index 39f3b0f48af867..f4c02d6f133a1b 100644 --- a/ngraph/core/src/op/reduce_l1.cpp +++ b/ngraph/core/src/op/reduce_l1.cpp @@ -12,7 +12,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::v4::ReduceL1::type_info; +NGRAPH_RTTI_DEFINITION(op::v4::ReduceL1, "ReduceL1", 4, util::ArithmeticReductionKeepDims); op::v4::ReduceL1::ReduceL1(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/reduce_l2.cpp b/ngraph/core/src/op/reduce_l2.cpp index 567581f43168b0..8c2498f0c3d0b7 100644 --- a/ngraph/core/src/op/reduce_l2.cpp +++ b/ngraph/core/src/op/reduce_l2.cpp @@ -12,7 +12,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::v4::ReduceL2::type_info; +NGRAPH_RTTI_DEFINITION(op::v4::ReduceL2, "ReduceL2", 4, util::ArithmeticReductionKeepDims); op::v4::ReduceL2::ReduceL2(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/reduce_logical_and.cpp b/ngraph/core/src/op/reduce_logical_and.cpp index 90814b94603040..a522131a7d3ae6 100644 --- a/ngraph/core/src/op/reduce_logical_and.cpp +++ b/ngraph/core/src/op/reduce_logical_and.cpp @@ -12,7 +12,10 @@ using namespace ngraph; using namespace std; -NGRAPH_RTTI_DEFINITION(op::v1::ReduceLogicalAnd, "ReduceLogicalAnd", 1); +NGRAPH_RTTI_DEFINITION(op::v1::ReduceLogicalAnd, + "ReduceLogicalAnd", + 1, + util::LogicalReductionKeepDims); op::v1::ReduceLogicalAnd::ReduceLogicalAnd(const Output& data, const Output& reduction_axes, diff --git a/ngraph/core/src/op/reduce_logical_or.cpp b/ngraph/core/src/op/reduce_logical_or.cpp index 4008863580f220..cc09e5c42bf430 100644 --- a/ngraph/core/src/op/reduce_logical_or.cpp +++ b/ngraph/core/src/op/reduce_logical_or.cpp @@ -12,7 +12,10 @@ using namespace ngraph; using namespace std; -NGRAPH_RTTI_DEFINITION(op::v1::ReduceLogicalOr, "ReduceLogicalOr", 1); +NGRAPH_RTTI_DEFINITION(op::v1::ReduceLogicalOr, + "ReduceLogicalOr", + 1, + util::LogicalReductionKeepDims); op::v1::ReduceLogicalOr::ReduceLogicalOr(const Output& data, const Output& reduction_axes, diff --git a/ngraph/core/src/op/reduce_mean.cpp b/ngraph/core/src/op/reduce_mean.cpp index 9036766527e0d8..28331a8e90542d 100644 --- a/ngraph/core/src/op/reduce_mean.cpp +++ b/ngraph/core/src/op/reduce_mean.cpp @@ -13,7 +13,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::v1::ReduceMean::type_info; +NGRAPH_RTTI_DEFINITION(op::v1::ReduceMean, "ReduceMean", 1, util::ArithmeticReductionKeepDims); op::v1::ReduceMean::ReduceMean(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/reduce_prod.cpp b/ngraph/core/src/op/reduce_prod.cpp index 7696d9a7299201..d24ac763f6500d 100644 --- a/ngraph/core/src/op/reduce_prod.cpp +++ b/ngraph/core/src/op/reduce_prod.cpp @@ -13,7 +13,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::v1::ReduceProd::type_info; +NGRAPH_RTTI_DEFINITION(op::v1::ReduceProd, "ReduceProd", 1, util::ArithmeticReductionKeepDims); op::v1::ReduceProd::ReduceProd(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/reduce_sum.cpp b/ngraph/core/src/op/reduce_sum.cpp index cbb64dde3b2e64..935942fe7cfbf5 100644 --- a/ngraph/core/src/op/reduce_sum.cpp +++ b/ngraph/core/src/op/reduce_sum.cpp @@ -13,7 +13,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo 
op::v1::ReduceSum::type_info; +NGRAPH_RTTI_DEFINITION(op::v1::ReduceSum, "ReduceSum", 1, util::ArithmeticReductionKeepDims); op::v1::ReduceSum::ReduceSum(const Output& arg, const Output& reduction_axes, diff --git a/ngraph/core/src/op/scatter_nd_update.cpp b/ngraph/core/src/op/scatter_nd_update.cpp index 0142286981d1d9..2c4716c9c016a8 100644 --- a/ngraph/core/src/op/scatter_nd_update.cpp +++ b/ngraph/core/src/op/scatter_nd_update.cpp @@ -4,6 +4,9 @@ #include "ngraph/op/scatter_nd_update.hpp" #include "itt.hpp" +#include "ngraph/runtime/host_tensor.hpp" +#include "ngraph/runtime/reference/scatter_nd_update.hpp" +#include "ngraph/validation_util.hpp" using namespace std; using namespace ngraph; @@ -18,3 +21,79 @@ shared_ptr op::v3::ScatterNDUpdate::clone_with_new_inputs(const OutputVect new_args.at(op::util::ScatterNDBase::INDICES), new_args.at(op::util::ScatterNDBase::UPDATES)); } + +namespace scatter +{ + template + bool evaluate(const HostTensorPtr& arg0, + const HostTensorPtr& arg1, + const HostTensorPtr& arg2, + const HostTensorPtr& out) + { + using T = typename element_type_traits::value_type; + Shape params_shape = arg0->get_shape(); + Shape indices_shape = arg1->get_shape(); + Shape updates_shape = arg1->get_shape(); + Shape out_shape(params_shape); + out->set_shape(out_shape); + + if (arg1->get_element_type() == element::i64) + { + runtime::reference::scatterNdUpdate(arg0->get_data_ptr(), + arg1->get_data_ptr(), + arg2->get_data_ptr(), + out->get_data_ptr(), + arg0->get_shape(), + arg1->get_shape(), + arg2->get_shape()); + } + else if (arg1->get_element_type() == element::i32) + { + runtime::reference::scatterNdUpdate(arg0->get_data_ptr(), + arg1->get_data_ptr(), + arg2->get_data_ptr(), + out->get_data_ptr(), + arg0->get_shape(), + arg1->get_shape(), + arg2->get_shape()); + } + else + { + throw ngraph_error("Unexpected type"); + } + + return true; + } + + bool evaluate_scatter(const HostTensorPtr& arg0, + const HostTensorPtr& arg1, + const HostTensorPtr& arg2, + const HostTensorPtr& out) + { + bool rc = true; + + switch (out->get_element_type()) + { + NGRAPH_TYPE_CASE(evaluate_scatter, i32, arg0, arg1, arg2, out); + NGRAPH_TYPE_CASE(evaluate_scatter, i64, arg0, arg1, arg2, out); + NGRAPH_TYPE_CASE(evaluate_scatter, u32, arg0, arg1, arg2, out); + NGRAPH_TYPE_CASE(evaluate_scatter, u64, arg0, arg1, arg2, out); + NGRAPH_TYPE_CASE(evaluate_scatter, f16, arg0, arg1, arg2, out); + NGRAPH_TYPE_CASE(evaluate_scatter, f32, arg0, arg1, arg2, out); + NGRAPH_TYPE_CASE(evaluate_scatter, boolean, arg0, arg1, arg2, out); + default: rc = false; break; + } + return rc; + } +} + +bool op::v3::ScatterNDUpdate::evaluate(const HostTensorVector& outputs, + const HostTensorVector& inputs) const +{ + NGRAPH_OP_SCOPE(v3_ScatterNDUpdate_evaluate); + NGRAPH_CHECK(this, !inputs.empty()); + NGRAPH_CHECK(this, validate_host_tensor_vector(inputs, 3)); + NGRAPH_CHECK(this, validate_host_tensor_vector(outputs, 1)); + + return scatter::evaluate_scatter(inputs[0], inputs[1], inputs[2], outputs[0]); +} diff --git a/ngraph/core/src/op/squeeze.cpp b/ngraph/core/src/op/squeeze.cpp index fc62fff85b52b8..f21a048b49f9ce 100644 --- a/ngraph/core/src/op/squeeze.cpp +++ b/ngraph/core/src/op/squeeze.cpp @@ -61,7 +61,7 @@ void op::Squeeze::pre_validate_and_infer_types() normalize_axes(this->description(), axes_constant->cast_vector(), data_rank); // Prepare set of unique axes marked to be removed from input data. 
- vector axes_to_squeeze(data_rank); + vector axes_to_squeeze(data_rank); if (axes_is_empty_constant) { auto data_shape = data.get_shape(); @@ -70,11 +70,11 @@ void op::Squeeze::pre_validate_and_infer_types() { if (data_shape.at(idx) == 1) { - axes_to_squeeze.at(idx) = 1; + axes_to_squeeze.at(idx) = true; } else { - axes_to_squeeze.at(idx) = 0; + axes_to_squeeze.at(idx) = false; } } } @@ -91,14 +91,14 @@ void op::Squeeze::pre_validate_and_infer_types() (data_shape.at(axis) == 1), "provided axis value is invalid. Only axes of size 1 may be removed."); } - axes_to_squeeze.at(axis) = 1; + axes_to_squeeze.at(axis) = true; } } vector output_data_shape; for (uint64_t idx = 0; idx < data_rank; ++idx) { - if (axes_to_squeeze.at(idx) == 0) + if (!axes_to_squeeze.at(idx)) { output_data_shape.push_back(data_partial_shape[idx]); } @@ -252,3 +252,8 @@ bool op::v0::Squeeze::constant_fold(OutputVector& output_values, const OutputVec } return false; } + +bool op::v0::Squeeze::is_dynamic() const +{ + return get_output_partial_shape(0).is_dynamic(); +} diff --git a/ngraph/core/src/op/unsqueeze.cpp b/ngraph/core/src/op/unsqueeze.cpp index 2d0b0aca4e0e6d..b6f849ad2763c4 100644 --- a/ngraph/core/src/op/unsqueeze.cpp +++ b/ngraph/core/src/op/unsqueeze.cpp @@ -45,14 +45,11 @@ void op::v0::Unsqueeze::validate_and_infer_types() // Get value of axes from Constant const auto axes_values = axes_constant->cast_vector(); const auto expanded_rank = data_rank_value + axes_values.size(); - auto axes = normalize_axes(this->description(), axes_values, expanded_rank); - NODE_VALIDATION_CHECK(this, !axes.empty(), "'axes' input is mandatory."); - NODE_VALIDATION_CHECK(this, - axes.size() == set(begin(axes), end(axes)).size(), - "'axes' input has a duplicate axis."); + NODE_VALIDATION_CHECK(this, !axes_values.empty(), "'axes' input is mandatory"); - sort(begin(axes), end(axes), less()); + auto normalized_axes = normalize_axes(this->description(), axes_values, expanded_rank); + set axes(begin(normalized_axes), end(normalized_axes)); vector output_shape{data_partial_shape}; for (auto axis : axes) diff --git a/ngraph/core/src/op/util/arithmetic_reduction.cpp b/ngraph/core/src/op/util/arithmetic_reduction.cpp index 565f78f970d5d4..2861ef5f287a39 100644 --- a/ngraph/core/src/op/util/arithmetic_reduction.cpp +++ b/ngraph/core/src/op/util/arithmetic_reduction.cpp @@ -10,17 +10,9 @@ using namespace std; using namespace ngraph; -op::util::ArithmeticReduction::ArithmeticReduction() {} +NGRAPH_RTTI_DEFINITION(op::util::ArithmeticReduction, "ArithmeticReduction", 0); -op::util::ArithmeticReduction::ArithmeticReduction(const Output& arg, - const AxisSet& reduction_axes) - : Op({arg, - op::Constant::create( - element::i64, Shape{reduction_axes.size()}, reduction_axes.to_vector()) - ->output(0)}) -{ - add_provenance_group_member(input_value(1).get_node_shared_ptr()); -} +op::util::ArithmeticReduction::ArithmeticReduction() {} op::util::ArithmeticReduction::ArithmeticReduction(const Output& arg, const Output& reduction_axes) diff --git a/ngraph/core/src/op/util/arithmetic_reductions_keep_dims.cpp b/ngraph/core/src/op/util/arithmetic_reductions_keep_dims.cpp index 97cdc05bd8101d..67670c55f1500c 100644 --- a/ngraph/core/src/op/util/arithmetic_reductions_keep_dims.cpp +++ b/ngraph/core/src/op/util/arithmetic_reductions_keep_dims.cpp @@ -11,6 +11,8 @@ using namespace std; using namespace ngraph; +NGRAPH_RTTI_DEFINITION(op::util::ArithmeticReductionKeepDims, "ArithmeticReductionKeepDims", 0); + 
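// Registering the KeepDims bases with their own RTTI (and listing them as parents of the
// concrete reductions above) lets transformations match any arithmetic reduction through the
// common base class instead of enumerating ReduceSum, ReduceMean, ReduceMax and friends.
// A minimal sketch, assuming the Parameter, Constant and ReduceSum headers are included;
// the helper function itself is illustrative only:
static void reduction_keep_dims_sketch()
{
    auto data = make_shared<op::Parameter>(element::f32, Shape{2, 3, 4});
    auto axes = op::Constant::create(element::i64, Shape{1}, {1});
    shared_ptr<Node> node = make_shared<op::v1::ReduceSum>(data, axes, true);

    // is_type/as_type walk the type_info parent chain, so this branch also matches
    // ReduceMean, ReduceProd, ReduceMin, ReduceMax, ReduceL1 and ReduceL2.
    if (is_type<op::util::ArithmeticReductionKeepDims>(node.get()))
    {
        const bool keep_dims =
            as_type<op::util::ArithmeticReductionKeepDims>(node.get())->get_keep_dims();
        (void)keep_dims;
    }
}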
op::util::ArithmeticReductionKeepDims::ArithmeticReductionKeepDims( const ngraph::Output& arg, const ngraph::Output& reduction_axes, diff --git a/ngraph/core/src/op/util/broadcast_base.cpp b/ngraph/core/src/op/util/broadcast_base.cpp index d24aa008298e7f..38519c9df6ede5 100644 --- a/ngraph/core/src/op/util/broadcast_base.cpp +++ b/ngraph/core/src/op/util/broadcast_base.cpp @@ -147,7 +147,8 @@ void op::util::BroadcastBase::validate_target_shape_none(const PartialShape& arg if (arg_shape.rank().get_length() > 0) { NODE_VALIDATION_CHECK(this, - target_shape[axes_mapping_val[i]].compatible(arg_shape[i]), + target_shape[axes_mapping_val[i]].compatible(arg_shape[i]) || + arg_shape[i].compatible(1), "Broadcast target[axes_mapping[", i, "]]", @@ -575,4 +576,4 @@ bool op::util::BroadcastBase::evaluate_upper(const HostTensorVector& output_valu (get_input_size() > 2 && !input_value(2).get_tensor().has_and_set_bound())) return false; return default_upper_bound_evaluator(this, output_values); -} \ No newline at end of file +} diff --git a/ngraph/core/src/op/util/logical_reduction.cpp b/ngraph/core/src/op/util/logical_reduction.cpp index 627692eea4b51c..698dbc32c50e66 100644 --- a/ngraph/core/src/op/util/logical_reduction.cpp +++ b/ngraph/core/src/op/util/logical_reduction.cpp @@ -10,6 +10,8 @@ using namespace std; using namespace ngraph; +NGRAPH_RTTI_DEFINITION(op::util::LogicalReduction, "LogicalReduction", 1); + op::util::LogicalReduction::LogicalReduction() {} op::util::LogicalReduction::LogicalReduction(const Output& arg, const AxisSet& reduction_axes) diff --git a/ngraph/core/src/op/util/logical_reduction_keep_dims.cpp b/ngraph/core/src/op/util/logical_reduction_keep_dims.cpp index f19d87e187247a..9c4ae46c0553fc 100644 --- a/ngraph/core/src/op/util/logical_reduction_keep_dims.cpp +++ b/ngraph/core/src/op/util/logical_reduction_keep_dims.cpp @@ -11,6 +11,8 @@ using namespace std; using namespace ngraph; +NGRAPH_RTTI_DEFINITION(op::util::LogicalReductionKeepDims, "LogicalReductionKeepDims", 1); + op::util::LogicalReductionKeepDims::LogicalReductionKeepDims( const ngraph::Output& arg, const ngraph::Output& reduction_axes, diff --git a/ngraph/core/src/op/util/scatter_nd_base.cpp b/ngraph/core/src/op/util/scatter_nd_base.cpp index 449de80cf206ae..9d91891c58f073 100644 --- a/ngraph/core/src/op/util/scatter_nd_base.cpp +++ b/ngraph/core/src/op/util/scatter_nd_base.cpp @@ -40,6 +40,10 @@ void op::util::ScatterNDBase::validate_and_infer_types() const PartialShape& indices_shape = get_input_partial_shape(INDICES); const PartialShape& updates_shape = get_input_partial_shape(UPDATES); + const auto& inputs_rank = inputs_shape.rank(); + const auto& indices_rank = indices_shape.rank(); + const auto& updates_rank = updates_shape.rank(); + NODE_VALIDATION_CHECK(this, indices_et == element::i32 || indices_et == element::i64, "Indices element type must be i64 or i32"); @@ -48,47 +52,47 @@ void op::util::ScatterNDBase::validate_and_infer_types() this, updates_et == inputs_et, "Updates element type must be the same as inputs"); NODE_VALIDATION_CHECK(this, - indices_shape.rank().is_dynamic() || - indices_shape.rank().get_length() >= 1, + indices_rank.is_dynamic() || indices_rank.get_length() >= 1, "Indices rank is expected to be at least 1"); NODE_VALIDATION_CHECK(this, - inputs_shape.rank().is_dynamic() || indices_shape.rank().is_dynamic() || - indices_shape[indices_shape.rank().get_length() - 1].get_length() <= - inputs_shape.rank().get_length(), + inputs_rank.is_dynamic() || indices_rank.is_dynamic() || + 
indices_shape[indices_rank.get_length() - 1].get_length() <= + inputs_rank.get_length(), "Last dimension of indices can be at most the rank of inputs"); - NODE_VALIDATION_CHECK( - this, - inputs_shape.rank().is_dynamic() || indices_shape.rank().is_dynamic() || - updates_shape.rank().is_dynamic() || - updates_shape.rank().get_length() == - indices_shape.rank().get_length() + inputs_shape.rank().get_length() - - indices_shape[indices_shape.rank().get_length() - 1].get_length() - 1, - "Rank of updates must be rank of inputs + rank of indices - last dimension of indices " - "- 1"); - - bool compatible = true; - if (inputs_shape.is_static() && indices_shape.is_static() && updates_shape.is_static()) + if (inputs_rank.is_static() && indices_rank.is_static() && updates_rank.is_static()) { - size_t indices_rank = indices_shape.rank().get_length(); - size_t updates_rank = updates_shape.rank().get_length(); - for (size_t i = 0; i < indices_rank - 1; i++) - { - compatible = compatible && updates_shape[i].same_scheme(indices_shape[i]); - NODE_VALIDATION_CHECK( - this, - compatible, - "updates_shape[0:indices_rank-1] shape must be indices_shape[:-1]"); - } - size_t j = indices_shape[indices_rank - 1].get_length(); - for (size_t i = indices_rank - 1; i < updates_rank; i++, j++) + auto expected_updates_rank = indices_rank.get_length() + inputs_rank.get_length() - + indices_shape[indices_rank.get_length() - 1].get_length() - 1; + // If expected updates rank is 0D it also can be a tensor with one element + NODE_VALIDATION_CHECK( + this, + updates_rank.get_length() == expected_updates_rank || expected_updates_rank == 0, + "Rank of updates must be rank of inputs + rank of indices - last dimension of indices " + "- 1"); + + bool compatible = true; + if (inputs_shape.is_static() && indices_shape.is_static() && updates_shape.is_static()) { - compatible = compatible && updates_shape[i].same_scheme(inputs_shape[j]); - NODE_VALIDATION_CHECK( - this, - compatible, - "updates_shape[indices_rank-1:] shape must be input_shape[indices_shape[-1]:]"); + size_t static_indices_rank = indices_rank.get_length(); + for (size_t i = 0; i < static_indices_rank - 1; i++) + { + compatible = compatible && updates_shape[i].same_scheme(indices_shape[i]); + NODE_VALIDATION_CHECK( + this, + compatible, + "updates_shape[0:indices_rank-1] shape must be indices_shape[:-1]"); + } + size_t j = indices_shape[static_indices_rank - 1].get_length(); + for (size_t i = static_indices_rank - 1; i < expected_updates_rank; i++, j++) + { + compatible = compatible && updates_shape[i].same_scheme(inputs_shape[j]); + NODE_VALIDATION_CHECK( + this, + compatible, + "updates_shape[indices_rank-1:] shape must be input_shape[indices_shape[-1]:]"); + } } } diff --git a/ngraph/core/src/opsets/opset.cpp b/ngraph/core/src/opsets/opset.cpp index a59ca4c37262c0..ea09eec98c1487 100644 --- a/ngraph/core/src/opsets/opset.cpp +++ b/ngraph/core/src/opsets/opset.cpp @@ -34,133 +34,84 @@ ngraph::Node* ngraph::OpSet::create_insensitive(const std::string& name) const const ngraph::OpSet& ngraph::get_opset1() { - static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset1_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } const ngraph::OpSet& ngraph::get_opset2() { 
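// The hand-rolled double-checked locking in each get_opsetN() below is replaced with
// std::call_once, which guarantees the one-time registration even with concurrent first
// callers. The same pattern in isolation, a minimal sketch independent of the NGRAPH_OP
// table macros (the opset contents are illustrative):
//
//   const OpSet& get_example_opset()
//   {
//       static OpSet opset;
//       static std::once_flag flag;
//       std::call_once(flag, [&]() {
//           opset.insert<op::v1::Add>();      // registration body runs exactly once
//           opset.insert<op::v1::Multiply>();
//       });
//       return opset;
//   }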
- static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset2_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } const ngraph::OpSet& ngraph::get_opset3() { - static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset3_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } const ngraph::OpSet& ngraph::get_opset4() { - static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset4_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } const ngraph::OpSet& ngraph::get_opset5() { - static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset5_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } const ngraph::OpSet& ngraph::get_opset6() { - static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset6_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } const ngraph::OpSet& ngraph::get_opset7() { - static std::mutex init_mutex; - static bool opset_is_initialized = false; static OpSet opset; - if (!opset_is_initialized) - { - std::lock_guard guard(init_mutex); - if (!opset_is_initialized) - { + static std::once_flag flag; + std::call_once(flag, [&]() { #define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); #include "ngraph/opsets/opset7_tbl.hpp" #undef NGRAPH_OP - opset_is_initialized = true; - } - } + }); return opset; } diff --git a/ngraph/core/src/partial_shape.cpp b/ngraph/core/src/partial_shape.cpp index cb1fea2de3e4cb..35929dc9e8d246 100644 --- a/ngraph/core/src/partial_shape.cpp +++ b/ngraph/core/src/partial_shape.cpp @@ -484,7 +484,7 @@ const std::vector& ngraph::AttributeAdapter::get( { for (size_t i = 0; i < m_ref.rank().get_length(); ++i) { - auto& elt = m_ref[i]; + const auto& elt = static_cast(m_ref)[i]; m_buffer.push_back(elt.is_dynamic() ? 
-1 : elt.get_length()); } } diff --git a/ngraph/core/src/pass/graph_rewrite.cpp b/ngraph/core/src/pass/graph_rewrite.cpp index f8804532bfdacb..75798fcec8e51b 100644 --- a/ngraph/core/src/pass/graph_rewrite.cpp +++ b/ngraph/core/src/pass/graph_rewrite.cpp @@ -15,6 +15,7 @@ #include "ngraph/log.hpp" #include "ngraph/op/util/sub_graph_base.hpp" #include "ngraph/pass/graph_rewrite.hpp" +#include "perf_counters.hpp" using namespace std; using namespace ngraph; @@ -55,6 +56,21 @@ NGRAPH_RTTI_DEFINITION(ngraph::pass::GraphRewrite, "ngraph::pass::GraphRewrite", NGRAPH_RTTI_DEFINITION(ngraph::pass::MatcherPass, "ngraph::pass::MatcherPass", 0); +namespace ngraph +{ + namespace pass + { + namespace + { + PerfCounters& perf_counters() + { + static PerfCounters counters; + return counters; + } + } // namespace + } // namespace pass +} // namespace ngraph + bool pass::GraphRewrite::run_on_function(shared_ptr f) { OV_ITT_SCOPED_TASK(itt::domains::nGraph, "pass::GraphRewrite::run_on_function"); @@ -394,7 +410,7 @@ void ngraph::pass::MatcherPass::register_matcher(const std::shared_ptr node) { - OV_ITT_SCOPED_TASK(itt::domains::nGraph, "ngraph::pass::MatcherPass::apply"); + OV_ITT_SCOPED_TASK(itt::domains::nGraph, pass::perf_counters()[get_type_info()]); m_new_nodes.clear(); if (m_handler) return m_handler(node); diff --git a/ngraph/core/src/pass/manager.cpp b/ngraph/core/src/pass/manager.cpp index a3c376355278d1..aefcb51d77d47a 100644 --- a/ngraph/core/src/pass/manager.cpp +++ b/ngraph/core/src/pass/manager.cpp @@ -20,6 +20,7 @@ #include "ngraph/pass/pass.hpp" #include "ngraph/pass/visualize_tree.hpp" #include "ngraph/util.hpp" +#include "perf_counters.hpp" using namespace std; using namespace ngraph; @@ -30,32 +31,6 @@ namespace ngraph { namespace { - class PerfCounters - { - PerfCounters(PerfCounters const&) = delete; - PerfCounters& operator=(PerfCounters const&) = delete; - - public: - PerfCounters() = default; - - openvino::itt::handle_t operator[](::ngraph::Node::type_info_t const& type_inf) - { - std::lock_guard guard(m_mutex); - auto it = m_counters.find(&type_inf); - if (it != m_counters.end()) - return it->second; - return m_counters[&type_inf] = openvino::itt::handle(type_inf.name); - } - - private: - using key = ::ngraph::Node::type_info_t const*; - using value = openvino::itt::handle_t; - using counters_map = std::unordered_map; - - std::mutex m_mutex; - counters_map m_counters; - }; - PerfCounters& perf_counters() { static PerfCounters counters; diff --git a/ngraph/core/src/pass/perf_counters.cpp b/ngraph/core/src/pass/perf_counters.cpp new file mode 100644 index 00000000000000..9fb3a81c22656b --- /dev/null +++ b/ngraph/core/src/pass/perf_counters.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "perf_counters.hpp" + +namespace ngraph +{ + namespace pass + { + openvino::itt::handle_t + PerfCounters::operator[](::ngraph::Node::type_info_t const& type_inf) + { + std::lock_guard guard(m_mutex); + auto it = m_counters.find(&type_inf); + if (it != m_counters.end()) + return it->second; + return m_counters[&type_inf] = openvino::itt::handle(type_inf.name); + } + } +} diff --git a/ngraph/core/src/pass/perf_counters.hpp b/ngraph/core/src/pass/perf_counters.hpp new file mode 100644 index 00000000000000..5d1cef265c0fef --- /dev/null +++ b/ngraph/core/src/pass/perf_counters.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include +#include + +#include +#include + 
+namespace ngraph +{ + namespace pass + { + class PerfCounters + { + PerfCounters(PerfCounters const&) = delete; + PerfCounters& operator=(PerfCounters const&) = delete; + + public: + PerfCounters() = default; + + openvino::itt::handle_t operator[](::ngraph::Node::type_info_t const& type_inf); + + private: + using key = ::ngraph::Node::type_info_t const*; + using value = openvino::itt::handle_t; + using counters_map = std::unordered_map; + + std::mutex m_mutex; + counters_map m_counters; + }; + } +} diff --git a/ngraph/core/src/util.cpp b/ngraph/core/src/util.cpp index 93d815e1ed3d14..c6e87cb6cadf61 100644 --- a/ngraph/core/src/util.cpp +++ b/ngraph/core/src/util.cpp @@ -401,6 +401,11 @@ AxisVector ngraph::get_default_order(const Shape& shape) return get_default_order(shape.size()); } +AxisVector ngraph::get_default_order(const PartialShape& shape) +{ + return get_default_order(shape.rank()); +} + AxisVector ngraph::get_default_order(size_t rank) { AxisVector default_order(rank); @@ -408,6 +413,15 @@ AxisVector ngraph::get_default_order(size_t rank) return default_order; } +AxisVector ngraph::get_default_order(const Rank& rank) +{ + NGRAPH_CHECK(rank.is_static(), "Can not calculate default order for dynamic rank"); + + AxisVector default_order(rank.get_length()); + std::iota(begin(default_order), end(default_order), 0); + return default_order; +} + void ngraph::parse_version_string( std::string version, size_t& major, size_t& minor, size_t& patch, string& extra) { diff --git a/ngraph/core/src/validation_util.cpp b/ngraph/core/src/validation_util.cpp index 3f1baf8bf7a077..e37f05c5285392 100644 --- a/ngraph/core/src/validation_util.cpp +++ b/ngraph/core/src/validation_util.cpp @@ -224,6 +224,115 @@ PartialShape ngraph::infer_windowed_reduction_output_shape(const Node* node, return output_shape; } +PartialShape ngraph::validate_and_infer_convolution_forward_output_shape( + const Node* node, + const PartialShape& data_batch_pshape, + const PartialShape& filters_pshape, + const op::PadType auto_pad, + Strides& strides, + Strides& dilations, + CoordinateDiff& pads_begin, + CoordinateDiff& pads_end) +{ + Rank result_ps_rank; + NODE_VALIDATION_CHECK( + node, + Rank::merge(result_ps_rank, data_batch_pshape.rank(), filters_pshape.rank()), + "Data batch and filters inputs must have same rank. 
Got: ", + data_batch_pshape, + " and ", + filters_pshape); + + PartialShape result_shape = PartialShape::dynamic(); + if (result_ps_rank.is_static()) + { + const auto num_spatial_dims = result_ps_rank.get_length() - 2; + if (strides.size() == 0) + { + strides = Strides(num_spatial_dims, 1); + } + + if (dilations.size() == 0) + { + dilations = Strides(num_spatial_dims, 1); + } + + if (pads_begin.size() == 0 || auto_pad == op::PadType::VALID) + { + pads_begin = CoordinateDiff(num_spatial_dims, 0); + } + + if (pads_end.size() == 0 || auto_pad == op::PadType::VALID) + { + pads_end = CoordinateDiff(num_spatial_dims, 0); + } + + NODE_VALIDATION_CHECK(node, + strides.size() == num_spatial_dims, + "Strides should be defined for all and only spatial features."); + + NODE_VALIDATION_CHECK(node, + dilations.size() == num_spatial_dims, + "Dilations should be defined for all and only spatial features."); + + NODE_VALIDATION_CHECK(node, + pads_begin.size() == num_spatial_dims && + pads_end.size() == num_spatial_dims, + "Pads should be defined for all and only spatial features."); + + result_shape = PartialShape::dynamic(result_ps_rank); + if (data_batch_pshape.rank().is_static()) + { + result_shape[0] = data_batch_pshape[0]; // batch size + } + if (filters_pshape.rank().is_static()) + { + result_shape[1] = filters_pshape[0]; // filter channel size + } + if (auto_pad == op::PadType::SAME_UPPER || auto_pad == op::PadType::SAME_LOWER) + { + bool auto_padding_applied = false; + if (filters_pshape.rank().is_static() && filters_pshape.rank().get_length() > 2) + { + pads_begin.clear(); + pads_end.clear(); + + const PartialShape filter_spatial_shape = [filters_pshape]() { + vector filter_dims{filters_pshape}; + filter_dims.erase(filter_dims.begin(), + filter_dims.begin() + 2); // Remove {C_OUT, C_IN} + return PartialShape{filter_dims}; + }(); + + if (filter_spatial_shape.is_static()) + { + auto_padding_applied = try_apply_auto_padding(data_batch_pshape, + filter_spatial_shape.to_shape(), + strides, + dilations, + auto_pad, + pads_end, + pads_begin); + } + } + if (!auto_padding_applied) + { + return result_shape; + } + } + result_shape = + infer_convolution_forward(node, + data_batch_pshape, + Strides(num_spatial_dims, 1), // dummy data dilations + pads_begin, + pads_end, + filters_pshape, + strides, + dilations); + } + return result_shape; +} + // // Infers the output batch shape and element type for convolution fprop. 
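// A minimal sketch of how an op could use validate_and_infer_convolution_forward_output_shape
// from its validate_and_infer_types(); the member name m_auto_pad and the surrounding op
// context are illustrative, only the helper's signature above comes from this file. Empty
// strides, dilations and pads are filled with per-spatial-dimension defaults, and
// SAME_UPPER/SAME_LOWER auto padding is resolved before the shape is inferred:
//
//   Strides strides{};                       // empty -> defaulted to 1 for every spatial dim
//   Strides dilations{};                     // empty -> defaulted to 1 for every spatial dim
//   CoordinateDiff pads_begin{}, pads_end{}; // empty or PadType::VALID -> defaulted to 0
//   PartialShape output_shape = validate_and_infer_convolution_forward_output_shape(
//       this,                                // node being validated
//       get_input_partial_shape(0),          // data batch, e.g. {N, C_IN, H, W}
//       get_input_partial_shape(1),          // filters,    e.g. {C_OUT, C_IN, kH, kW}
//       m_auto_pad,
//       strides,
//       dilations,
//       pads_begin,
//       pads_end);
//   set_output_type(0, get_input_element_type(0), output_shape);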
// diff --git a/ngraph/frontend/CMakeLists.txt b/ngraph/frontend/CMakeLists.txt index fbb38167302769..3e21b4b50171ec 100644 --- a/ngraph/frontend/CMakeLists.txt +++ b/ngraph/frontend/CMakeLists.txt @@ -5,5 +5,8 @@ if (NGRAPH_ONNX_IMPORT_ENABLE) add_subdirectory(onnx_common) add_subdirectory(onnx_import) +endif() + +if (NGRAPH_ONNX_EDITOR_ENABLE) add_subdirectory(onnx_editor) endif() diff --git a/ngraph/frontend/onnx_common/src/utils.cpp b/ngraph/frontend/onnx_common/src/utils.cpp index 5c63e0430b1d14..998f5f4daa8386 100644 --- a/ngraph/frontend/onnx_common/src/utils.cpp +++ b/ngraph/frontend/onnx_common/src/utils.cpp @@ -29,6 +29,7 @@ namespace ngraph case ONNX_NAMESPACE::TensorProto_DataType_UINT16: return sizeof(uint16_t); case ONNX_NAMESPACE::TensorProto_DataType_UINT32: return sizeof(uint32_t); case ONNX_NAMESPACE::TensorProto_DataType_UINT64: return sizeof(uint64_t); + case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16: return sizeof(uint16_t); } #ifdef NGRAPH_USE_PROTOBUF_LITE throw ngraph_error("unsupported element type"); diff --git a/ngraph/frontend/onnx_editor/CMakeLists.txt b/ngraph/frontend/onnx_editor/CMakeLists.txt index 0714b1e02be332..d893f40a4faac2 100644 --- a/ngraph/frontend/onnx_editor/CMakeLists.txt +++ b/ngraph/frontend/onnx_editor/CMakeLists.txt @@ -22,7 +22,7 @@ add_library(ngraph::onnx_editor ALIAS ${TARGET_NAME}) # TODO Add handling ie_faster_build -target_link_libraries(${TARGET_NAME} PRIVATE onnx_common +target_link_libraries(${TARGET_NAME} PRIVATE onnx_common onnx_importer PUBLIC ngraph) set(ONNX_EDITOR_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/ngraph/frontend/onnx_editor/include/onnx_editor/editor.hpp b/ngraph/frontend/onnx_editor/include/onnx_editor/editor.hpp index 72a6ececff3aec..465890cab110ba 100644 --- a/ngraph/frontend/onnx_editor/include/onnx_editor/editor.hpp +++ b/ngraph/frontend/onnx_editor/include/onnx_editor/editor.hpp @@ -8,6 +8,7 @@ #include #include +#include "ngraph/function.hpp" #include "ngraph/op/constant.hpp" #include "ngraph/partial_shape.hpp" #include "ngraph/type/element_type.hpp" @@ -86,15 +87,12 @@ namespace ngraph void set_input_values( const std::map>& input_values); - /// \brief Returns a non-const reference to the underlying ModelProto object, possibly - /// modified by the editor's API calls - /// - /// \return A reference to ONNX ModelProto object containing the in-memory model - ONNX_NAMESPACE::ModelProto& model() const; - /// \brief Returns a serialized ONNX model, possibly modified by the editor. std::string model_string() const; + /// \brief Converts an edited ONNX model to an nGraph Function representation. + std::shared_ptr get_function() const; + /// \brief Returns a list of all inputs of the in-memory model, including initializers. 
/// The returned value might depend on the previous operations executed on an /// instance of the model editor, in particular the subgraph extraction which diff --git a/ngraph/frontend/onnx_editor/src/editor.cpp b/ngraph/frontend/onnx_editor/src/editor.cpp index ad60dd6c702b40..d4b24300beac39 100644 --- a/ngraph/frontend/onnx_editor/src/editor.cpp +++ b/ngraph/frontend/onnx_editor/src/editor.cpp @@ -11,6 +11,7 @@ #include "onnx_common/parser.hpp" #include "onnx_common/utils.hpp" #include "onnx_editor/editor.hpp" +#include "onnx_import/utils/onnx_internal.hpp" using namespace ngraph; @@ -217,11 +218,6 @@ onnx_editor::ONNXModelEditor::ONNXModelEditor(const std::string& model_path) { } -ONNX_NAMESPACE::ModelProto& onnx_editor::ONNXModelEditor::model() const -{ - return m_pimpl->m_model_proto; -} - const std::string& onnx_editor::ONNXModelEditor::model_path() const { return m_model_path; @@ -330,6 +326,11 @@ std::string onnx_editor::ONNXModelEditor::model_string() const return m_pimpl->m_model_proto.SerializeAsString(); } +std::shared_ptr onnx_editor::ONNXModelEditor::get_function() const +{ + return onnx_import::detail::import_onnx_model(m_pimpl->m_model_proto, m_model_path); +} + void onnx_editor::ONNXModelEditor::set_input_values( const std::map>& input_values) { diff --git a/ngraph/frontend/onnx_import/include/onnx_import/utils/onnx_internal.hpp b/ngraph/frontend/onnx_import/include/onnx_import/utils/onnx_internal.hpp new file mode 100644 index 00000000000000..58554bd3c99234 --- /dev/null +++ b/ngraph/frontend/onnx_import/include/onnx_import/utils/onnx_internal.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "ngraph/function.hpp" +#include "onnx_import/utils/onnx_importer_visibility.hpp" + +namespace ONNX_NAMESPACE +{ + class ModelProto; +} + +namespace ngraph +{ + namespace onnx_import + { + namespace detail + { + /// \brief Imports and converts an serialized ONNX model from a ModelProto + /// to an nGraph Function representation. + /// + /// \note The function can be used only internally by OV components! + /// Passing ModelProto between componets which use different protobuf + /// library can cause segfaults. If stream parsing fails or the ONNX model + /// contains unsupported ops, the function throws an ngraph_error exception. + /// + /// \param[in] model_proto Reference to a GraphProto object. + /// \param[in] model_path The path to the imported onnx model. + /// It is required if the imported model uses data saved in + /// external files. + /// + /// \return An nGraph function that represents a single output from the created + /// graph. 
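/// \par Example
/// The ONNX editor is the intended caller: ONNXModelEditor::get_function() forwards its
/// in-memory ModelProto to this function. A usage sketch from the caller's side (the model
/// path is illustrative):
/// \code
/// onnx_editor::ONNXModelEditor editor{"model.onnx"};
/// std::shared_ptr<ngraph::Function> function = editor.get_function();
/// // internally: onnx_import::detail::import_onnx_model(model_proto, model_path)
/// \endcode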
+ ONNX_IMPORTER_API + std::shared_ptr import_onnx_model(ONNX_NAMESPACE::ModelProto& model_proto, + const std::string& model_path); + } // namespace detail + } // namespace onnx_import +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/core/transform.hpp b/ngraph/frontend/onnx_import/src/core/transform.hpp index f2e213a3f94f23..526b0654e2b7d9 100644 --- a/ngraph/frontend/onnx_import/src/core/transform.hpp +++ b/ngraph/frontend/onnx_import/src/core/transform.hpp @@ -42,6 +42,7 @@ namespace ngraph void expand_onnx_functions(ONNX_NAMESPACE::ModelProto& model_proto); static const std::vector legacy_ops_to_fixup = { + "DeformableConv2D", "DetectionOutput", "ExperimentalDetectronDetectionOutput", "ExperimentalDetectronGenerateProposalsSingleImage", diff --git a/ngraph/frontend/onnx_import/src/onnx.cpp b/ngraph/frontend/onnx_import/src/onnx.cpp index 3fbd799ab4170d..09f6623611d4eb 100644 --- a/ngraph/frontend/onnx_import/src/onnx.cpp +++ b/ngraph/frontend/onnx_import/src/onnx.cpp @@ -4,47 +4,18 @@ #include #include +#include -#include "core/graph.hpp" -#include "core/model.hpp" -#include "core/transform.hpp" #include "ngraph/except.hpp" #include "onnx_common/parser.hpp" #include "onnx_import/onnx.hpp" +#include "onnx_import/utils/onnx_internal.hpp" #include "ops_bridge.hpp" namespace ngraph { namespace onnx_import { - namespace detail - { - std::shared_ptr - convert_to_ng_function(const ONNX_NAMESPACE::ModelProto& model_proto) - { - Model model{model_proto}; - Graph graph{model_proto.graph(), model}; - auto function = std::make_shared( - graph.get_ng_outputs(), graph.get_ng_parameters(), graph.get_name()); - for (std::size_t i{0}; i < function->get_output_size(); ++i) - { - function->get_output_op(i)->set_friendly_name( - graph.get_outputs().at(i).get_name()); - } - return function; - } - - std::shared_ptr import_onnx_model(ONNX_NAMESPACE::ModelProto& model_proto, - const std::string& model_path) - { - transform::expand_onnx_functions(model_proto); - transform::fixup_legacy_operators(model_proto); - transform::update_external_data_paths(model_proto, model_path); - - return detail::convert_to_ng_function(model_proto); - } - } // namespace detail - std::shared_ptr import_onnx_model(std::istream& stream, const std::string& model_path) { diff --git a/ngraph/frontend/onnx_import/src/op/conv.cpp b/ngraph/frontend/onnx_import/src/op/conv.cpp index 4d503dc55be0ac..0b5eed22a83770 100644 --- a/ngraph/frontend/onnx_import/src/op/conv.cpp +++ b/ngraph/frontend/onnx_import/src/op/conv.cpp @@ -71,25 +71,9 @@ namespace ngraph const auto conv_shape = std::make_shared(ng_conv); const auto conv_rank = std::make_shared(conv_shape); - // Prepare tail shape (rank = conv.rank - 2): [1, 1, 1, 1, ... ] - const auto one_const = - default_opset::Constant::create(element::i64, Shape{1}, {1}); - const auto two_const = - default_opset::Constant::create(element::i64, Shape{1}, {2}); - const auto tail_shape_rank = - std::make_shared(conv_rank, two_const); - const auto tail_shape = - std::make_shared(one_const, tail_shape_rank); - - // Construct new bias shape: [1, C, 1, 1, ... 
] - const auto C_dim = std::make_shared(bias); - const auto bias_shape = std::make_shared( - OutputVector{one_const, C_dim, tail_shape}, 0); - - const auto reshaped_bias = - std::make_shared(bias, bias_shape, false); - - return {std::make_shared(ng_conv, reshaped_bias)}; + return {std::make_shared( + ng_conv, + reshape::reshape_channel_shaped_node_to_nchw(bias, conv_rank))}; } } // namespace diff --git a/ngraph/frontend/onnx_import/src/op/instance_norm.cpp b/ngraph/frontend/onnx_import/src/op/instance_norm.cpp index 069b9c3f7622f7..70e98e4db23306 100644 --- a/ngraph/frontend/onnx_import/src/op/instance_norm.cpp +++ b/ngraph/frontend/onnx_import/src/op/instance_norm.cpp @@ -18,6 +18,7 @@ #include "ngraph/partial_shape.hpp" #include "op/instance_norm.hpp" #include "utils/common.hpp" +#include "utils/reshape.hpp" namespace ngraph { @@ -84,22 +85,15 @@ namespace ngraph auto mvn = std::make_shared( data, reduction_axes, true, epsilon, ngraph::op::MVNEpsMode::INSIDE_SQRT); - const auto data_shape_node = std::make_shared(data); - - // Broadcast preserving channel dimension - scale = std::make_shared( - scale, - data_shape_node, - std::make_shared(element::i64, Shape{1}, 1)); - bias = std::make_shared( - bias, - data_shape_node, - std::make_shared(element::i64, Shape{1}, 1)); + const auto mvn_shape = std::make_shared(mvn); + const auto mvn_rank = std::make_shared(mvn_shape); // scale * mvn + bias std::shared_ptr result = - std::make_shared(mvn, scale); - result = std::make_shared(result, bias); + std::make_shared( + mvn, reshape::reshape_channel_shaped_node_to_nchw(scale, mvn_rank)); + result = std::make_shared( + result, reshape::reshape_channel_shaped_node_to_nchw(bias, mvn_rank)); return {result}; } diff --git a/ngraph/frontend/onnx_import/src/op/log_softmax.cpp b/ngraph/frontend/onnx_import/src/op/log_softmax.cpp index 43a797c2441dcd..88ce01299741aa 100644 --- a/ngraph/frontend/onnx_import/src/op/log_softmax.cpp +++ b/ngraph/frontend/onnx_import/src/op/log_softmax.cpp @@ -19,15 +19,7 @@ namespace ngraph const int64_t axis) { const auto coerced_data = ngraph::builder::opset1::flatten(data, axis); - - const auto axis_1 = default_opset::Constant::create(element::i64, Shape{1}, {1}); - const auto max = - std::make_shared(coerced_data, axis_1, true); - - const auto data_minus_max = - std::make_shared(coerced_data, max); - - const auto result = std::make_shared(data_minus_max, 1); + const auto result = std::make_shared(coerced_data, 1); const auto data_shape = std::make_shared(data); return std::make_shared(result, data_shape, false); } diff --git a/ngraph/frontend/onnx_import/src/op/mod.cpp b/ngraph/frontend/onnx_import/src/op/mod.cpp index 2ffa2283850ad1..a74ed9904bb568 100644 --- a/ngraph/frontend/onnx_import/src/op/mod.cpp +++ b/ngraph/frontend/onnx_import/src/op/mod.cpp @@ -25,10 +25,25 @@ namespace ngraph Output divisor{node.get_ng_inputs().at(1)}; std::int64_t fmod = node.get_attribute_value("fmod", 0); - CHECK_VALID_NODE( - node, fmod == 1, "Only 'fmod=1' mode is supported for mod operator."); - - return {std::make_shared(dividend, divisor)}; + OutputVector output; + if (fmod == 1) + { + output = {std::make_shared(dividend, divisor)}; + } + else if (fmod == 0) + { + NGRAPH_CHECK(dividend.get_element_type().is_integral() && + divisor.get_element_type().is_integral(), + "If the input type is floating point, then `fmod` attribute " + "must be set to 1."); + output = {std::make_shared(dividend, divisor)}; + } + else + { + throw ngraph_error( + "Unsupported value of 'fmod' attribute (should be: 0 
or 1)"); + } + return output; } } // namespace set_1 diff --git a/ngraph/frontend/onnx_import/src/op/non_max_suppression.cpp b/ngraph/frontend/onnx_import/src/op/non_max_suppression.cpp index 7b06e8e7b81356..2c5da242582e1b 100644 --- a/ngraph/frontend/onnx_import/src/op/non_max_suppression.cpp +++ b/ngraph/frontend/onnx_import/src/op/non_max_suppression.cpp @@ -4,6 +4,7 @@ #include +#include "core/null_node.hpp" #include "default_opset.hpp" #include "exceptions.hpp" #include "ngraph/op/non_max_suppression.hpp" @@ -21,6 +22,7 @@ namespace ngraph { OutputVector non_max_suppression(const Node& node) { + using ngraph::op::is_null; // TODO: this op will not be tested until at least // a reference implementation is added @@ -29,7 +31,7 @@ namespace ngraph const Output scores = ng_inputs.at(1); Output max_output_boxes_per_class; - if (ng_inputs.size() > 2) + if (ng_inputs.size() > 2 && !is_null(ng_inputs.at(2))) { max_output_boxes_per_class = ngraph::onnx_import::reshape::interpret_as_scalar(ng_inputs.at(2)); @@ -41,7 +43,7 @@ namespace ngraph } Output iou_threshold; - if (ng_inputs.size() > 3) + if (ng_inputs.size() > 3 && !is_null(ng_inputs.at(3))) { iou_threshold = ngraph::onnx_import::reshape::interpret_as_scalar(ng_inputs.at(3)); @@ -53,7 +55,7 @@ namespace ngraph } Output score_threshold; - if (ng_inputs.size() > 4) + if (ng_inputs.size() > 4 && !is_null(ng_inputs.at(4))) { score_threshold = ngraph::onnx_import::reshape::interpret_as_scalar(ng_inputs.at(4)); diff --git a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/deformable_conv_2d.cpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/deformable_conv_2d.cpp new file mode 100644 index 00000000000000..f8f81a11efa36a --- /dev/null +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/deformable_conv_2d.cpp @@ -0,0 +1,51 @@ +//***************************************************************************** +// Copyright 2017-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include "op/org.openvinotoolkit/deformable_conv_2d.hpp" +#include "default_opset.hpp" +#include "ngraph/node.hpp" +#include "ngraph/op/deformable_convolution.hpp" +#include "utils/convpool.hpp" + +namespace ngraph +{ + namespace onnx_import + { + OutputVector op::set_1::deformable_conv_2d(const Node& node) + { + const OutputVector& inputs = node.get_ng_inputs(); + const auto strides = convpool::get_strides(node); + const auto dilations = convpool::get_dilations(node); + const auto paddings = convpool::get_pads(node); + + const auto group = node.get_attribute_value("group", 1); + const auto deformable_groups = + node.get_attribute_value("deformable_groups", 1); + const auto auto_pad_type = convpool::get_auto_pad(node); + + return {std::make_shared(inputs.at(0), + inputs.at(1), + inputs.at(2), + strides, + paddings.first, + paddings.second, + dilations, + auto_pad_type, + group, + deformable_groups)}; + } + } // namespace onnx_import +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/deformable_conv_2d.hpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/deformable_conv_2d.hpp new file mode 100644 index 00000000000000..9f0b7552cb6aac --- /dev/null +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/deformable_conv_2d.hpp @@ -0,0 +1,38 @@ +//***************************************************************************** +// Copyright 2017-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#pragma once + +#include "ngraph/node.hpp" +#include "onnx_import/core/node.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace op + { + namespace set_1 + { + OutputVector deformable_conv_2d(const Node& node); + + } // namespace set_1 + + } // namespace op + + } // namespace onnx_import + +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/group_norm.cpp b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/group_norm.cpp index d2ec4dc8acb1be..3e91c6ca4c1f99 100644 --- a/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/group_norm.cpp +++ b/ngraph/frontend/onnx_import/src/op/org.openvinotoolkit/group_norm.cpp @@ -84,15 +84,35 @@ namespace ngraph std::shared_ptr result = std::make_shared(mvn, data_shape_node, true); - const auto& rank = data.get_partial_shape().rank(); - NGRAPH_CHECK(rank.is_static()); - auto data_rank_size = rank.get_length(); - - result = std::make_shared( - result, - reshape::reshape_channel_shaped_node_to_nchw(scale, data_rank_size)); - result = std::make_shared( - result, reshape::reshape_channel_shaped_node_to_nchw(bias, data_rank_size)); + const auto& scale_shape = scale.get_partial_shape(); + NGRAPH_CHECK(scale_shape.rank().is_static()); + auto scale_rank = scale_shape.rank().get_length(); + + const auto& bias_shape = bias.get_partial_shape(); + NGRAPH_CHECK(bias_shape.rank().is_static()); + auto bias_rank = bias_shape.rank().get_length(); + + const auto data_rank = + std::make_shared(data_shape_node); + + if (scale_rank == 1) + { + result = std::make_shared( + result, reshape::reshape_channel_shaped_node_to_nchw(scale, data_rank)); + } + else + { + result = std::make_shared(result, scale); + } + if (bias_rank == 1) + { + result = std::make_shared( + result, reshape::reshape_channel_shaped_node_to_nchw(bias, data_rank)); + } + else + { + result = std::make_shared(result, bias); + } return {result}; } diff --git a/ngraph/frontend/onnx_import/src/op/pad.cpp b/ngraph/frontend/onnx_import/src/op/pad.cpp index 4811cd6d1620ed..1c01f64182b707 100644 --- a/ngraph/frontend/onnx_import/src/op/pad.cpp +++ b/ngraph/frontend/onnx_import/src/op/pad.cpp @@ -119,14 +119,10 @@ namespace ngraph } else { - auto axis = - default_opset::Constant::create(element::i64, ngraph::Shape{}, {0}); OutputVector padding = builder::opset1::split(pads, 2, 0); - padding_begin = - std::make_shared(padding.at(0), element::i64); - padding_end = - std::make_shared(padding.at(1), element::i64); + padding_begin = padding.at(0); + padding_end = padding.at(1); } const std::string mode = diff --git a/ngraph/frontend/onnx_import/src/op/range.cpp b/ngraph/frontend/onnx_import/src/op/range.cpp index ffffdb73a1e65d..7b97cf15d860da 100644 --- a/ngraph/frontend/onnx_import/src/op/range.cpp +++ b/ngraph/frontend/onnx_import/src/op/range.cpp @@ -21,7 +21,7 @@ namespace ngraph const Output stop{node.get_ng_inputs().at(1)}; const Output step{node.get_ng_inputs().at(2)}; return {std::make_shared( - start, stop, step, node.get_ng_inputs().at(0).get_element_type())}; + start, stop, step, start.get_element_type())}; } } // namespace set_1 diff --git a/ngraph/frontend/onnx_import/src/op/reshape.cpp b/ngraph/frontend/onnx_import/src/op/reshape.cpp index f32adad1bd661b..be94db877fd2de 100644 --- a/ngraph/frontend/onnx_import/src/op/reshape.cpp +++ b/ngraph/frontend/onnx_import/src/op/reshape.cpp @@ -27,7 +27,7 @@ namespace ngraph const auto data = ng_inputs.at(0); Output pattern; 
- + bool special_zero = true; // Since opset 5 the target shape is provided as input if (ng_inputs.size() == 2) { @@ -38,11 +38,14 @@ namespace ngraph const auto output_shape = node.get_attribute_value>("shape", {}); + // Added in onnx reshape version 14 + special_zero = !node.get_attribute_value("allowzero", 0); + pattern = default_opset::Constant::create( element::i64, Shape{output_shape.size()}, output_shape); } - return {std::make_shared(data, pattern, true)}; + return {std::make_shared(data, pattern, special_zero)}; } } // namespace set_1 diff --git a/ngraph/frontend/onnx_import/src/op/slice.cpp b/ngraph/frontend/onnx_import/src/op/slice.cpp index c796b08647cf5f..8029e10373c29a 100644 --- a/ngraph/frontend/onnx_import/src/op/slice.cpp +++ b/ngraph/frontend/onnx_import/src/op/slice.cpp @@ -6,6 +6,7 @@ #include #include +#include "core/null_node.hpp" #include "default_opset.hpp" #include "exceptions.hpp" #include "ngraph/node.hpp" @@ -167,6 +168,8 @@ namespace ngraph { OutputVector slice(const Node& node) { + using ngraph::op::is_null; + OutputVector inputs{node.get_ng_inputs()}; const auto data = inputs.at(0); const auto data_rank = data.get_partial_shape().rank(); @@ -176,7 +179,7 @@ namespace ngraph // Slice is calculated over all axes as default Output axes; - if (inputs.size() >= 4) // axes input provided + if (inputs.size() >= 4 && !is_null(inputs.at(3))) // axes input provided { axes = inputs.at(3); CHECK_VALID_NODE(node, @@ -202,12 +205,12 @@ namespace ngraph std::vector axes_vec = get_normalized_axes_vector(node, data_rank, raw_axes_vec); - const uint64_t slice_indices_length = + const size_t slice_indices_length = *std::max_element(std::begin(axes_vec), std::end(axes_vec)) + 1; const auto begin_end_mask = axes_to_mask(axes_vec, slice_indices_length); Output steps; - if (inputs.size() == 5) // steps input provided + if (inputs.size() == 5 && !is_null(inputs.at(4))) // steps input provided { steps = inputs.at(4); } @@ -260,7 +263,7 @@ namespace ngraph std::vector normalized_axes = get_normalized_axes_vector(node, data_rank, axes); - const uint64_t slice_indices_length = + const size_t slice_indices_length = *std::max_element(std::begin(normalized_axes), std::end(normalized_axes)) + 1; const auto begin_end_mask = axes_to_mask(normalized_axes, slice_indices_length); diff --git a/ngraph/frontend/onnx_import/src/ops_bridge.cpp b/ngraph/frontend/onnx_import/src/ops_bridge.cpp index b54af15b26e452..e2107c8ade35da 100644 --- a/ngraph/frontend/onnx_import/src/ops_bridge.cpp +++ b/ngraph/frontend/onnx_import/src/ops_bridge.cpp @@ -133,6 +133,7 @@ #include "op/xor.hpp" #include "ops_bridge.hpp" +#include "op/org.openvinotoolkit/deformable_conv_2d.hpp" #include "op/org.openvinotoolkit/detection_output.hpp" #include "op/org.openvinotoolkit/experimental_detectron/detection_output.hpp" #include "op/org.openvinotoolkit/experimental_detectron/generate_proposals_single_image.hpp" @@ -461,6 +462,8 @@ namespace ngraph REGISTER_OPERATOR("Xor", 1, logical_xor); // custom OPs + REGISTER_OPERATOR_WITH_DOMAIN( + OPENVINO_ONNX_DOMAIN, "DeformableConv2D", 1, deformable_conv_2d); REGISTER_OPERATOR_WITH_DOMAIN( OPENVINO_ONNX_DOMAIN, "DetectionOutput", 1, detection_output); REGISTER_OPERATOR_WITH_DOMAIN(OPENVINO_ONNX_DOMAIN, diff --git a/ngraph/frontend/onnx_import/src/utils/common.cpp b/ngraph/frontend/onnx_import/src/utils/common.cpp index fd3d387000d625..67431ac8c52887 100644 --- a/ngraph/frontend/onnx_import/src/utils/common.cpp +++ b/ngraph/frontend/onnx_import/src/utils/common.cpp @@ -31,6 +31,7 @@ 
namespace ngraph case ONNX_NAMESPACE::TensorProto_DataType_UINT32: return element::u32; case ONNX_NAMESPACE::TensorProto_DataType_UINT64: return element::u64; case ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED: return element::dynamic; + case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16: return element::bf16; } #ifdef NGRAPH_USE_PROTOBUF_LITE throw ngraph_error("unsupported element type"); diff --git a/ngraph/frontend/onnx_import/src/utils/onnx_internal.cpp b/ngraph/frontend/onnx_import/src/utils/onnx_internal.cpp new file mode 100644 index 00000000000000..00544c1fabfc2b --- /dev/null +++ b/ngraph/frontend/onnx_import/src/utils/onnx_internal.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "core/graph.hpp" +#include "core/model.hpp" +#include "core/transform.hpp" +#include "onnx_import/utils/onnx_internal.hpp" + +namespace ngraph +{ + namespace onnx_import + { + namespace detail + { + std::shared_ptr + convert_to_ng_function(const ONNX_NAMESPACE::ModelProto& model_proto) + { + Model model{model_proto}; + Graph graph{model_proto.graph(), model}; + auto function = std::make_shared( + graph.get_ng_outputs(), graph.get_ng_parameters(), graph.get_name()); + for (std::size_t i{0}; i < function->get_output_size(); ++i) + { + function->get_output_op(i)->set_friendly_name( + graph.get_outputs().at(i).get_name()); + } + return function; + } + + std::shared_ptr import_onnx_model(ONNX_NAMESPACE::ModelProto& model_proto, + const std::string& model_path) + { + transform::expand_onnx_functions(model_proto); + transform::fixup_legacy_operators(model_proto); + transform::update_external_data_paths(model_proto, model_path); + + return detail::convert_to_ng_function(model_proto); + } + } // namespace detail + } // namespace onnx_import +} // namespace ngraph diff --git a/ngraph/frontend/onnx_import/src/utils/reshape.cpp b/ngraph/frontend/onnx_import/src/utils/reshape.cpp index 8a017542c9b746..daa6d4cd5ea8a1 100644 --- a/ngraph/frontend/onnx_import/src/utils/reshape.cpp +++ b/ngraph/frontend/onnx_import/src/utils/reshape.cpp @@ -104,21 +104,22 @@ namespace ngraph Output reshape_channel_shaped_node_to_nchw(const Output& node, - size_t expected_rank) + const Output& expected_rank) { - const auto& rank = node.get_partial_shape().rank(); - NGRAPH_CHECK(rank.is_static()); - size_t node_rank = rank.get_length(); - if (node_rank == 1) - { - // reshape the node with shape {C} to {1, C, 1, 1, ..., 1} - std::vector reshape_pattern_values(expected_rank, 1U); - reshape_pattern_values[1] = node.get_shape().front(); - const auto reshape_pattern = default_opset::Constant::create( - element::u64, Shape{reshape_pattern_values.size()}, reshape_pattern_values); - return std::make_shared(node, reshape_pattern, false); - } - return node; + // Prepare tail shape (rank = conv.rank - 2): [1, 1, 1, 1, ... ] + const auto one_const = default_opset::Constant::create(element::i64, Shape{1}, {1}); + const auto two_const = default_opset::Constant::create(element::i64, Shape{1}, {2}); + const auto tail_shape_rank = + std::make_shared(expected_rank, two_const); + const auto tail_shape = + std::make_shared(one_const, tail_shape_rank); + + // Construct new bias shape: [1, C, 1, 1, ... 
] + const auto C_dim = std::make_shared(node); + const auto new_shape = std::make_shared( + OutputVector{one_const, C_dim, tail_shape}, 0); + + return std::make_shared(node, new_shape, false); } } // namespace reshape diff --git a/ngraph/frontend/onnx_import/src/utils/reshape.hpp b/ngraph/frontend/onnx_import/src/utils/reshape.hpp index 74b71d8cb9ca61..d1bd4d7f74ddaa 100644 --- a/ngraph/frontend/onnx_import/src/utils/reshape.hpp +++ b/ngraph/frontend/onnx_import/src/utils/reshape.hpp @@ -63,7 +63,7 @@ namespace ngraph /// Output reshape_channel_shaped_node_to_nchw(const Output& node, - size_t expected_rank); + const Output& expected_rank); } // namespace reshape } // namespace onnx_import diff --git a/ngraph/python/requirements_test.txt b/ngraph/python/requirements_test.txt index 16d93ebbeea19a..6f5b802c21ecff 100644 --- a/ngraph/python/requirements_test.txt +++ b/ngraph/python/requirements_test.txt @@ -1,10 +1,10 @@ flake8==3.9.0 flake8-comprehensions==3.3.0 -flake8-docstrings==1.5.0 +flake8-docstrings==1.6.0 flake8-quotes==3.2.0 onnx==1.8.1 pydocstyle==5.1.1 pytest==6.1.2 retrying==1.3.3 -tox==3.22.0 -wheel==0.35.1 +tox==3.23.0 +wheel==0.36.2 diff --git a/ngraph/python/src/ngraph/__init__.py b/ngraph/python/src/ngraph/__init__.py index 25bba07cae0392..c9343497b24151 100644 --- a/ngraph/python/src/ngraph/__init__.py +++ b/ngraph/python/src/ngraph/__init__.py @@ -16,155 +16,155 @@ from ngraph.helpers import function_from_cnn from ngraph.helpers import function_to_cnn -from ngraph.opset6 import absolute -from ngraph.opset6 import absolute as abs -from ngraph.opset6 import acos -from ngraph.opset6 import acosh -from ngraph.opset6 import add -from ngraph.opset6 import asin -from ngraph.opset6 import asinh -from ngraph.opset6 import assign -from ngraph.opset6 import atan -from ngraph.opset6 import atanh -from ngraph.opset6 import avg_pool -from ngraph.opset6 import batch_norm_inference -from ngraph.opset6 import batch_to_space -from ngraph.opset6 import binary_convolution -from ngraph.opset6 import broadcast -from ngraph.opset6 import bucketize -from ngraph.opset6 import ceiling -from ngraph.opset6 import ceiling as ceil -from ngraph.opset6 import clamp -from ngraph.opset6 import concat -from ngraph.opset6 import constant -from ngraph.opset6 import convert -from ngraph.opset6 import convert_like -from ngraph.opset6 import convolution -from ngraph.opset6 import convolution_backprop_data -from ngraph.opset6 import cos -from ngraph.opset6 import cosh -from ngraph.opset6 import ctc_greedy_decoder -from ngraph.opset6 import ctc_greedy_decoder_seq_len -from ngraph.opset6 import ctc_loss -from ngraph.opset6 import cum_sum -from ngraph.opset6 import cum_sum as cumsum -from ngraph.opset6 import deformable_convolution -from ngraph.opset6 import deformable_psroi_pooling -from ngraph.opset6 import depth_to_space -from ngraph.opset6 import detection_output -from ngraph.opset6 import divide -from ngraph.opset6 import elu -from ngraph.opset6 import embedding_bag_offsets_sum -from ngraph.opset6 import embedding_bag_packed_sum -from ngraph.opset6 import embedding_segments_sum -from ngraph.opset6 import extract_image_patches -from ngraph.opset6 import equal -from ngraph.opset6 import erf -from ngraph.opset6 import exp -from ngraph.opset6 import fake_quantize -from ngraph.opset6 import floor -from ngraph.opset6 import floor_mod -from ngraph.opset6 import gather -from ngraph.opset6 import gather_elements -from ngraph.opset6 import gather_nd -from ngraph.opset6 import gather_tree -from ngraph.opset6 import gelu 
-from ngraph.opset6 import greater -from ngraph.opset6 import greater_equal -from ngraph.opset6 import grn -from ngraph.opset6 import group_convolution -from ngraph.opset6 import group_convolution_backprop_data -from ngraph.opset6 import gru_cell -from ngraph.opset6 import gru_sequence -from ngraph.opset6 import hard_sigmoid -from ngraph.opset6 import hsigmoid -from ngraph.opset6 import hswish -from ngraph.opset6 import interpolate -from ngraph.opset6 import less -from ngraph.opset6 import less_equal -from ngraph.opset6 import log -from ngraph.opset6 import logical_and -from ngraph.opset6 import logical_not -from ngraph.opset6 import logical_or -from ngraph.opset6 import logical_xor -from ngraph.opset6 import log_softmax -from ngraph.opset6 import loop -from ngraph.opset6 import lrn -from ngraph.opset6 import lstm_cell -from ngraph.opset6 import lstm_sequence -from ngraph.opset6 import matmul -from ngraph.opset6 import max_pool -from ngraph.opset6 import maximum -from ngraph.opset6 import minimum -from ngraph.opset6 import mish -from ngraph.opset6 import mod -from ngraph.opset6 import multiply -from ngraph.opset6 import mvn -from ngraph.opset6 import negative -from ngraph.opset6 import non_max_suppression -from ngraph.opset6 import non_zero -from ngraph.opset6 import normalize_l2 -from ngraph.opset6 import not_equal -from ngraph.opset6 import one_hot -from ngraph.opset6 import pad -from ngraph.opset6 import parameter -from ngraph.opset6 import power -from ngraph.opset6 import prelu -from ngraph.opset6 import prior_box -from ngraph.opset6 import prior_box_clustered -from ngraph.opset6 import psroi_pooling -from ngraph.opset6 import proposal -from ngraph.opset6 import range -from ngraph.opset6 import read_value -from ngraph.opset6 import reduce_l1 -from ngraph.opset6 import reduce_l2 -from ngraph.opset6 import reduce_logical_and -from ngraph.opset6 import reduce_logical_or -from ngraph.opset6 import reduce_max -from ngraph.opset6 import reduce_mean -from ngraph.opset6 import reduce_min -from ngraph.opset6 import reduce_prod -from ngraph.opset6 import reduce_sum -from ngraph.opset6 import region_yolo -from ngraph.opset6 import reorg_yolo -from ngraph.opset6 import relu -from ngraph.opset6 import reshape -from ngraph.opset6 import result -from ngraph.opset6 import reverse_sequence -from ngraph.opset6 import rnn_cell -from ngraph.opset6 import rnn_sequence -from ngraph.opset6 import roi_align -from ngraph.opset6 import roi_pooling -from ngraph.opset6 import round -from ngraph.opset6 import scatter_elements_update -from ngraph.opset6 import scatter_update -from ngraph.opset6 import select -from ngraph.opset6 import selu -from ngraph.opset6 import shape_of -from ngraph.opset6 import shuffle_channels -from ngraph.opset6 import sigmoid -from ngraph.opset6 import sign -from ngraph.opset6 import sin -from ngraph.opset6 import sinh -from ngraph.opset6 import softmax -from ngraph.opset6 import softplus -from ngraph.opset6 import space_to_batch -from ngraph.opset6 import space_to_depth -from ngraph.opset6 import split -from ngraph.opset6 import sqrt -from ngraph.opset6 import squared_difference -from ngraph.opset6 import squeeze -from ngraph.opset6 import strided_slice -from ngraph.opset6 import subtract -from ngraph.opset6 import swish -from ngraph.opset6 import tan -from ngraph.opset6 import tanh -from ngraph.opset6 import tensor_iterator -from ngraph.opset6 import tile -from ngraph.opset6 import topk -from ngraph.opset6 import transpose -from ngraph.opset6 import unsqueeze -from ngraph.opset6 import 
variadic_split +from ngraph.opset7 import absolute +from ngraph.opset7 import absolute as abs +from ngraph.opset7 import acos +from ngraph.opset7 import acosh +from ngraph.opset7 import add +from ngraph.opset7 import asin +from ngraph.opset7 import asinh +from ngraph.opset7 import assign +from ngraph.opset7 import atan +from ngraph.opset7 import atanh +from ngraph.opset7 import avg_pool +from ngraph.opset7 import batch_norm_inference +from ngraph.opset7 import batch_to_space +from ngraph.opset7 import binary_convolution +from ngraph.opset7 import broadcast +from ngraph.opset7 import bucketize +from ngraph.opset7 import ceiling +from ngraph.opset7 import ceiling as ceil +from ngraph.opset7 import clamp +from ngraph.opset7 import concat +from ngraph.opset7 import constant +from ngraph.opset7 import convert +from ngraph.opset7 import convert_like +from ngraph.opset7 import convolution +from ngraph.opset7 import convolution_backprop_data +from ngraph.opset7 import cos +from ngraph.opset7 import cosh +from ngraph.opset7 import ctc_greedy_decoder +from ngraph.opset7 import ctc_greedy_decoder_seq_len +from ngraph.opset7 import ctc_loss +from ngraph.opset7 import cum_sum +from ngraph.opset7 import cum_sum as cumsum +from ngraph.opset7 import deformable_convolution +from ngraph.opset7 import deformable_psroi_pooling +from ngraph.opset7 import depth_to_space +from ngraph.opset7 import detection_output +from ngraph.opset7 import divide +from ngraph.opset7 import elu +from ngraph.opset7 import embedding_bag_offsets_sum +from ngraph.opset7 import embedding_bag_packed_sum +from ngraph.opset7 import embedding_segments_sum +from ngraph.opset7 import extract_image_patches +from ngraph.opset7 import equal +from ngraph.opset7 import erf +from ngraph.opset7 import exp +from ngraph.opset7 import fake_quantize +from ngraph.opset7 import floor +from ngraph.opset7 import floor_mod +from ngraph.opset7 import gather +from ngraph.opset7 import gather_elements +from ngraph.opset7 import gather_nd +from ngraph.opset7 import gather_tree +from ngraph.opset7 import gelu +from ngraph.opset7 import greater +from ngraph.opset7 import greater_equal +from ngraph.opset7 import grn +from ngraph.opset7 import group_convolution +from ngraph.opset7 import group_convolution_backprop_data +from ngraph.opset7 import gru_cell +from ngraph.opset7 import gru_sequence +from ngraph.opset7 import hard_sigmoid +from ngraph.opset7 import hsigmoid +from ngraph.opset7 import hswish +from ngraph.opset7 import interpolate +from ngraph.opset7 import less +from ngraph.opset7 import less_equal +from ngraph.opset7 import log +from ngraph.opset7 import logical_and +from ngraph.opset7 import logical_not +from ngraph.opset7 import logical_or +from ngraph.opset7 import logical_xor +from ngraph.opset7 import log_softmax +from ngraph.opset7 import loop +from ngraph.opset7 import lrn +from ngraph.opset7 import lstm_cell +from ngraph.opset7 import lstm_sequence +from ngraph.opset7 import matmul +from ngraph.opset7 import max_pool +from ngraph.opset7 import maximum +from ngraph.opset7 import minimum +from ngraph.opset7 import mish +from ngraph.opset7 import mod +from ngraph.opset7 import multiply +from ngraph.opset7 import mvn +from ngraph.opset7 import negative +from ngraph.opset7 import non_max_suppression +from ngraph.opset7 import non_zero +from ngraph.opset7 import normalize_l2 +from ngraph.opset7 import not_equal +from ngraph.opset7 import one_hot +from ngraph.opset7 import pad +from ngraph.opset7 import parameter +from ngraph.opset7 import power +from 
ngraph.opset7 import prelu +from ngraph.opset7 import prior_box +from ngraph.opset7 import prior_box_clustered +from ngraph.opset7 import psroi_pooling +from ngraph.opset7 import proposal +from ngraph.opset7 import range +from ngraph.opset7 import read_value +from ngraph.opset7 import reduce_l1 +from ngraph.opset7 import reduce_l2 +from ngraph.opset7 import reduce_logical_and +from ngraph.opset7 import reduce_logical_or +from ngraph.opset7 import reduce_max +from ngraph.opset7 import reduce_mean +from ngraph.opset7 import reduce_min +from ngraph.opset7 import reduce_prod +from ngraph.opset7 import reduce_sum +from ngraph.opset7 import region_yolo +from ngraph.opset7 import reorg_yolo +from ngraph.opset7 import relu +from ngraph.opset7 import reshape +from ngraph.opset7 import result +from ngraph.opset7 import reverse_sequence +from ngraph.opset7 import rnn_cell +from ngraph.opset7 import rnn_sequence +from ngraph.opset7 import roi_align +from ngraph.opset7 import roi_pooling +from ngraph.opset7 import round +from ngraph.opset7 import scatter_elements_update +from ngraph.opset7 import scatter_update +from ngraph.opset7 import select +from ngraph.opset7 import selu +from ngraph.opset7 import shape_of +from ngraph.opset7 import shuffle_channels +from ngraph.opset7 import sigmoid +from ngraph.opset7 import sign +from ngraph.opset7 import sin +from ngraph.opset7 import sinh +from ngraph.opset7 import softmax +from ngraph.opset7 import softplus +from ngraph.opset7 import space_to_batch +from ngraph.opset7 import space_to_depth +from ngraph.opset7 import split +from ngraph.opset7 import sqrt +from ngraph.opset7 import squared_difference +from ngraph.opset7 import squeeze +from ngraph.opset7 import strided_slice +from ngraph.opset7 import subtract +from ngraph.opset7 import swish +from ngraph.opset7 import tan +from ngraph.opset7 import tanh +from ngraph.opset7 import tensor_iterator +from ngraph.opset7 import tile +from ngraph.opset7 import topk +from ngraph.opset7 import transpose +from ngraph.opset7 import unsqueeze +from ngraph.opset7 import variadic_split # Extend Node class to support binary operators diff --git a/ngraph/python/src/ngraph/utils/types.py b/ngraph/python/src/ngraph/utils/types.py index 4e3b2f631328c8..b40ec700a5373e 100644 --- a/ngraph/python/src/ngraph/utils/types.py +++ b/ngraph/python/src/ngraph/utils/types.py @@ -34,11 +34,11 @@ (NgraphType.u16, np.uint16), (NgraphType.u32, np.uint32), (NgraphType.u64, np.uint64), + (NgraphType.bf16, np.uint16), ] ngraph_to_numpy_types_str_map = [ ("boolean", np.bool), - # ('bf16', ???), ("f16", np.float16), ("f32", np.float32), ("f64", np.float64), diff --git a/ngraph/python/src/pyngraph/types/element_type.cpp b/ngraph/python/src/pyngraph/types/element_type.cpp index 7ae833a4fa57a5..db1bac50ed03a9 100644 --- a/ngraph/python/src/pyngraph/types/element_type.cpp +++ b/ngraph/python/src/pyngraph/types/element_type.cpp @@ -28,6 +28,7 @@ void regclass_pyngraph_Type(py::module m) type.attr("u16") = ngraph::element::u16; type.attr("u32") = ngraph::element::u32; type.attr("u64") = ngraph::element::u64; + type.attr("bf16") = ngraph::element::bf16; type.def("__repr__", [](const ngraph::element::Type& self) { std::string bitwidth = std::to_string(self.bitwidth()); diff --git a/ngraph/python/src/pyngraph/types/element_type.hpp b/ngraph/python/src/pyngraph/types/element_type.hpp index 94a67165c20f14..763b08e51c9ab5 100644 --- a/ngraph/python/src/pyngraph/types/element_type.hpp +++ b/ngraph/python/src/pyngraph/types/element_type.hpp @@ -20,3 +20,4 @@ 
void regclass_pyngraph_UInt8(py::module m); // void regclass_pyngraph_UInt16(py::module m); void regclass_pyngraph_UInt32(py::module m); void regclass_pyngraph_UInt64(py::module m); +void regclass_pyngraph_BFloat16(py::module m); diff --git a/ngraph/python/tests/__init__.py b/ngraph/python/tests/__init__.py index b76cf6a7672394..65b7040f679041 100644 --- a/ngraph/python/tests/__init__.py +++ b/ngraph/python/tests/__init__.py @@ -81,17 +81,8 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): xfail_issue_38708 = xfail_test(reason="RuntimeError: While validating ONNX node '': " "Axes input must be constant") xfail_issue_38710 = xfail_test(reason="RuntimeError: roi has zero dimension which is not allowed") -xfail_issue_38712 = xfail_test(reason="RuntimeError: Check '(fmod == 1) " - "While validating ONNX node '': " - "Only 'fmod=1' mode is supported for mod operator.") xfail_issue_38713 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations:" "ai.onnx.preview.training.Momentum") -xfail_issue_38714 = xfail_test(reason="RuntimeError: While validating ONNX node ''" - "Check 'element::Type::merge(element_type, element_type," - "node->get_input_element_type(i))' " - "While validating node 'v1:: (sizes[0]:i64{4}," - "Convert_29306[0]:f32{4}) -> (dynamic?)' with friendly_name '':" - "Argument element types are inconsistent.") xfail_issue_43742 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations:" "If") xfail_issue_45457 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v5::Loop" @@ -127,15 +118,12 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): xfail_issue_44957 = xfail_test(reason="E Unsupported dynamic op: NonZero") xfail_issue_44958 = xfail_test(reason="E Unsupported dynamic op: Interpolate") xfail_issue_44965 = xfail_test(reason="E RuntimeError: value info has no element") -xfail_issue_44967 = xfail_test(reason="E RuntimeError: unsupported element type: BFLOAT16") xfail_issue_44968 = xfail_test(reason="E Unsupported dynamic op: Squeeze") xfail_issue_44970 = xfail_test(reason="Assertion error") xfail_issue_44976 = xfail_test(reason="E RuntimeError: Quantize layer with name:" "FakeQuantize_xxx has non const input on 1 port") xfail_issue_46762 = xfail_test(reason="Incorrect result of Minimum op if uint data type is used") xfail_issue_46765 = xfail_test(reason="select_last_index attribute is not supported by ArgMin and ArgMax") -xfail_issue_47317 = xfail_test(reason="RuntimeError: While validating ONNX node '': " - "Check shape_size(axes_shape) == input_rank' failed") xfail_issue_47323 = xfail_test(reason="RuntimeError: The plugin does not support FP64") xfail_issue_47337 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v1::OneHot") xfail_issue_33593 = xfail_test(reason="Current implementation of MaxPool doesn't support indices output") @@ -173,4 +161,5 @@ def xfail_test(reason="Mark the test as expected to fail", strict=True): xfail_issue_49752 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v1::Pad") xfail_issue_49753 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v1::StridedSlice") xfail_issue_49754 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v1::TopKIE") -xfail_issue_49913 = xfail_test(reason="CPU supports Gelu with tanh mode only") +xfail_issue_52463 = xfail_test(reason="test_operator_add_size1_singleton_broadcast_cpu - " + "Not equal to tolerance") diff --git a/ngraph/python/tests/runtime.py 
b/ngraph/python/tests/runtime.py index 20035fb66b71a8..16b79b85b5b2fe 100644 --- a/ngraph/python/tests/runtime.py +++ b/ngraph/python/tests/runtime.py @@ -121,6 +121,17 @@ def _get_ie_output_blob_buffer(self, output_blobs: Dict[str, Blob], ng_result: r out_name = self._get_ie_output_blob_name(output_blobs, ng_result) return output_blobs[out_name].buffer + def convert_buffers(self, source_buffers, target_dtypes): + converted_buffers = [] + for i in range(len(source_buffers)): + target_dtype = target_dtypes[i] + # custom conversion for bf16 + if self.results[i].get_output_element_type(0) == Type.bf16: + converted_buffers.append((source_buffers[i].view(np.uint32) >> 16).astype(np.uint16)) + else: + converted_buffers.append(source_buffers[i].astype(target_dtype)) + return converted_buffers + def __call__(self, *input_values: NumericData) -> List[NumericData]: """Run computation on input values and return result.""" # Input validation @@ -173,6 +184,5 @@ def __call__(self, *input_values: NumericData) -> List[NumericData]: # Since OV overwrite result data type we have to convert results to the original one. original_dtypes = [get_dtype(result.get_output_element_type(0)) for result in self.results] - converted_buffers = [buffer.astype(original_dtype) for buffer, original_dtype in - zip(result_buffers, original_dtypes)] + converted_buffers = self.convert_buffers(result_buffers, original_dtypes) return converted_buffers diff --git a/ngraph/python/tests/test_ngraph/test_ops_unary.py b/ngraph/python/tests/test_ngraph/test_ops_unary.py index f10f77c9030b87..bd8da79988a708 100644 --- a/ngraph/python/tests/test_ngraph/test_ops_unary.py +++ b/ngraph/python/tests/test_ngraph/test_ops_unary.py @@ -8,7 +8,7 @@ from ngraph.impl import Shape, Type from tests.runtime import get_runtime from tests.test_ngraph.util import run_op_node -from tests import xfail_issue_44970, xfail_issue_49913 +from tests import xfail_issue_44970 @pytest.mark.parametrize( @@ -176,7 +176,6 @@ def test_hsigmoid(): assert node.get_output_element_type(0) == Type.f32 -@xfail_issue_49913 def test_gelu_operator_with_parameters(): runtime = get_runtime() @@ -190,10 +189,9 @@ def test_gelu_operator_with_parameters(): result = computation(data_value) expected = np.array([[-1.6391277e-06, 8.4134471e-01], [-4.5500278e-02, 2.9959502]], dtype=np.float32) - assert np.allclose(result, expected) + assert np.allclose(result, expected, 1e-6, 1e-6) -@xfail_issue_49913 def test_gelu_operator_with_array(): runtime = get_runtime() @@ -204,7 +202,7 @@ def test_gelu_operator_with_array(): result = computation() expected = np.array([[-1.6391277e-06, 8.4134471e-01], [-4.5500278e-02, 2.9959502]], dtype=np.float32) - assert np.allclose(result, expected) + assert np.allclose(result, expected, 1e-6, 1e-6) def test_gelu_tanh_operator_with_parameters(): diff --git a/ngraph/python/tests/test_onnx/test_backend.py b/ngraph/python/tests/test_onnx/test_backend.py index 5efde34aa8bccf..aa136fd1525ce0 100644 --- a/ngraph/python/tests/test_onnx/test_backend.py +++ b/ngraph/python/tests/test_onnx/test_backend.py @@ -26,9 +26,7 @@ xfail_issue_38701, xfail_issue_38706, xfail_issue_38708, - xfail_issue_38712, xfail_issue_38713, - xfail_issue_38714, xfail_issue_38722, xfail_issue_38723, xfail_issue_38724, @@ -50,14 +48,12 @@ xfail_issue_44957, xfail_issue_44958, xfail_issue_44965, - xfail_issue_44967, xfail_issue_44968, xfail_issue_44976, xfail_issue_45180, xfail_issue_45344, xfail_issue_46762, xfail_issue_46765, - xfail_issue_47317, xfail_issue_47323, xfail_issue_47337, 
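(Illustrative aside on the bf16 branch added to convert_buffers above: the plugin hands back f32 data for bf16 results, and the upper 16 bits of an IEEE f32 value are exactly its truncated bfloat16 bit pattern, which is why the buffer is viewed as uint32, shifted right by 16 and kept as np.uint16 — numpy has no native bfloat16 dtype, matching the bf16 -> np.uint16 entry added to types.py. A self-contained numpy sketch with made-up sample values:)

import numpy as np

f32 = np.array([1.0, -2.5, 3.1415927], dtype=np.float32)

# f32 -> bf16 bit pattern (truncation, as in convert_buffers)
bf16_bits = (f32.view(np.uint32) >> 16).astype(np.uint16)

# bf16 bit pattern -> f32 again (the low 16 mantissa bits are lost)
restored = (bf16_bits.astype(np.uint32) << 16).view(np.float32)

print(bf16_bits)   # [16256 49184 16457]
print(restored)    # [ 1.       -2.5       3.140625]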
xfail_issue_48052, @@ -65,7 +61,8 @@ xfail_issue_49750, xfail_issue_49752, xfail_issue_49753, - xfail_issue_49754) + xfail_issue_49754, + xfail_issue_52463) def expect_fail(test_case_path, xfail): # type: (str) -> None @@ -156,7 +153,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None (xfail_issue_39662, "OnnxBackendNodeModelTest.test_nonmaxsuppression_two_classes_cpu", "OnnxBackendNodeModelTest.test_scatter_elements_with_negative_indices_cpu", - "OnnxBackendNodeModelTest.test_constantofshape_int_shape_zero_cpu", "OnnxBackendNodeModelTest.test_gather_negative_indices_cpu"), (xfail_issue_49753, "OnnxBackendNodeModelTest.test_slice_default_axes_cpu"), @@ -190,9 +186,8 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_argmin_no_keepdims_random_select_last_index_cpu"), (xfail_issue_38091, "OnnxBackendNodeModelTest.test_gather_negative_indices_cpu"), - (xfail_issue_47317, - "OnnxBackendPyTorchOperatorModelTest.test_operator_add_size1_broadcast_cpu", - "OnnxBackendPyTorchOperatorModelTest.test_operator_add_size1_singleton_broadcast_cpu",), + (xfail_issue_52463, + "OnnxBackendPyTorchOperatorModelTest.test_operator_add_size1_singleton_broadcast_cpu"), (xfail_issue_47323, "OnnxBackendPyTorchOperatorModelTest.test_operator_add_broadcast_cpu", "OnnxBackendPyTorchOperatorModelTest.test_operator_addconstant_cpu", @@ -272,25 +267,14 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_resize_upsample_sizes_nearest_ceil_half_pixel_cpu", "OnnxBackendNodeModelTest.test_resize_upsample_sizes_cubic_cpu", "OnnxBackendNodeModelTest.test_resize_downsample_sizes_linear_pytorch_half_pixel_cpu", - "OnnxBackendNodeModelTest.test_resize_downsample_sizes_nearest_cpu"), + "OnnxBackendNodeModelTest.test_resize_downsample_sizes_nearest_cpu", + "OnnxBackendNodeModelTest.test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn_cpu"), (xfail_issue_33581, "OnnxBackendNodeModelTest.test_gather_elements_negative_indices_cpu"), - (xfail_issue_38712, - "OnnxBackendNodeModelTest.test_mod_mixed_sign_int16_cpu", - "OnnxBackendNodeModelTest.test_mod_uint8_cpu", - "OnnxBackendNodeModelTest.test_mod_uint64_cpu", - "OnnxBackendNodeModelTest.test_mod_uint32_cpu", - "OnnxBackendNodeModelTest.test_mod_uint16_cpu", - "OnnxBackendNodeModelTest.test_mod_mixed_sign_int8_cpu", - "OnnxBackendNodeModelTest.test_mod_mixed_sign_int64_cpu", - "OnnxBackendNodeModelTest.test_mod_broadcast_cpu", - "OnnxBackendNodeModelTest.test_mod_mixed_sign_int32_cpu"), (xfail_issue_38713, "OnnxBackendNodeModelTest.test_momentum_cpu", "OnnxBackendNodeModelTest.test_nesterov_momentum_cpu", "OnnxBackendNodeModelTest.test_momentum_multiple_cpu"), - (xfail_issue_38714, - "OnnxBackendNodeModelTest.test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn_cpu"), (xfail_issue_47337, "OnnxBackendNodeModelTest.test_onehot_without_axis_cpu", "OnnxBackendNodeModelTest.test_onehot_with_negative_axis_cpu", @@ -397,9 +381,6 @@ def expect_fail(test_case_path, xfail): # type: (str) -> None "OnnxBackendNodeModelTest.test_loop13_seq_cpu", "OnnxBackendNodeModelTest.test_sequence_insert_at_back_cpu", "OnnxBackendNodeModelTest.test_sequence_insert_at_front_cpu",), - (xfail_issue_44967, - "OnnxBackendNodeModelTest.test_cast_BFLOAT16_to_FLOAT_cpu", - "OnnxBackendNodeModelTest.test_cast_FLOAT_to_BFLOAT16_cpu",), (xfail_issue_44968, "OnnxBackendNodeModelTest.test_squeeze_cpu", "OnnxBackendNodeModelTest.test_squeeze_negative_axes_cpu",), diff --git a/ngraph/python/tox.ini 
b/ngraph/python/tox.ini index aa75efab6272f6..e0ccc85785ef60 100644 --- a/ngraph/python/tox.ini +++ b/ngraph/python/tox.ini @@ -13,7 +13,7 @@ deps = setenv = NGRAPH_BACKEND = {env:NGRAPH_BACKEND:"CPU"} PYTHONPATH = {env:PYTHONPATH} - ngraph_DIR = {env:NGRAPH_CPP_BUILD_PATH} + ngraph_DIR = {env:ngraph_DIR} passenv = http_proxy https_proxy @@ -24,6 +24,11 @@ commands= flake8 --ignore=D100,D101,D102,D103,D104,D105,D107,W503 tests/ # ignore lack of docs in tests mypy --config-file=tox.ini {posargs:src/} pytest --backend={env:NGRAPH_BACKEND} tests -v -k 'not _cuda' --ignore=tests/test_onnx/test_zoo_models.py + +[testenv:zoo_models] +commands= + {envbindir}/python setup.py bdist_wheel + {envbindir}/pip install --no-index --pre --find-links=dist/ ngraph-core pytest --backend={env:NGRAPH_BACKEND} tests/test_onnx/test_zoo_models.py -v -n 4 -k 'not _cuda' --model_zoo_xfail [testenv:devenv] diff --git a/ngraph/test/CMakeLists.txt b/ngraph/test/CMakeLists.txt index baefe1d01b36df..6c270fcce2e507 100644 --- a/ngraph/test/CMakeLists.txt +++ b/ngraph/test/CMakeLists.txt @@ -60,6 +60,7 @@ set(SRC op.cpp op_eval/binary_convolution.cpp op_eval/bucketize.cpp + op_eval/clamp.cpp op_eval/floor_mod.cpp op_eval/gelu.cpp op_eval/hsigmoid.cpp @@ -89,6 +90,7 @@ set(SRC provenance.cpp replace_node.cpp shape.cpp + span.cpp specialize_function.cpp tensor.cpp type_prop/assign.cpp @@ -147,6 +149,7 @@ set(SRC type_prop/max_pool.cpp type_prop/minimum.cpp type_prop/mish.cpp + type_prop/mod.cpp type_prop/mvn.cpp type_prop/non_max_suppression.cpp type_prop/non_zero.cpp @@ -313,6 +316,7 @@ set(MULTI_TEST_SRC backend/convert_like.in.cpp backend/convolution.in.cpp backend/binary_convolution.in.cpp + backend/clamp.in.cpp backend/cos.in.cpp backend/cosh.in.cpp backend/ctc_greedy_decoder.in.cpp @@ -347,6 +351,7 @@ set(MULTI_TEST_SRC backend/maximum.in.cpp backend/max_pool.in.cpp backend/minimum.in.cpp + backend/mod.in.cpp backend/multiple_backends.in.cpp backend/multiple_result.in.cpp backend/multiply.in.cpp @@ -377,7 +382,9 @@ set(MULTI_TEST_SRC backend/reverse_sequence.in.cpp backend/reverse.in.cpp backend/roi_pooling.in.cpp + backend/roll.in.cpp backend/round.in.cpp + backend/scatter_nd_update.in.cpp backend/select.in.cpp backend/shape_of.in.cpp backend/sigmoid.in.cpp @@ -410,15 +417,18 @@ if (NGRAPH_ONNX_IMPORT_ENABLE AND NOT NGRAPH_USE_PROTOBUF_LITE) onnx/onnx_import_provenance.in.cpp onnx/onnx_import_reshape.in.cpp onnx/onnx_import_rnn.in.cpp - onnx/onnx_import_quant.in.cpp - onnx/onnx_test_utils.in.cpp) + onnx/onnx_import_quant.in.cpp) list(APPEND SRC onnx/onnx_import_exceptions.cpp onnx/onnx_import_library.cpp - onnx/onnx_editor.cpp onnx/onnx_tensor_names.cpp) endif() +if (NGRAPH_ONNX_EDITOR_ENABLE) + list(APPEND SRC onnx/onnx_editor.cpp) + list(APPEND MULTI_TEST_SRC onnx/onnx_test_utils.in.cpp) +endif() + add_clang_format_target(unit-test_clang FOR_SOURCES ${SRC} ${MULTI_TEST_SRC}) foreach(BACKEND_NAME ${ACTIVE_BACKEND_LIST}) @@ -492,7 +502,11 @@ endif() target_link_libraries(unit-test PRIVATE ie_backend) if (NGRAPH_ONNX_IMPORT_ENABLE) - target_link_libraries(unit-test PRIVATE onnx_importer onnx_editor) + target_link_libraries(unit-test PRIVATE onnx_importer) +endif() + +if (NGRAPH_ONNX_EDITOR_ENABLE) + target_link_libraries(unit-test PRIVATE onnx_editor) endif() if (NGRAPH_INTERPRETER_ENABLE) diff --git a/ngraph/test/backend/broadcast.in.cpp b/ngraph/test/backend/broadcast.in.cpp index 7c65594062f7d4..3d74952ab774e0 100644 --- a/ngraph/test/backend/broadcast.in.cpp +++ b/ngraph/test/backend/broadcast.in.cpp @@ -456,6 
+456,22 @@ NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_3d_stride_2) broadcast_test_helper(shape_a, shape_r, axis); } +NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_3d_diffrent_rank) +{ + Shape shape_a{3, 1}; + Shape shape_r{2, 3, 3}; + AxisSet axis{1, 2}; + broadcast_test_helper(shape_a, shape_r, axis); +} + +NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_4d_same_rank) +{ + Shape shape_a{2, 3, 1, 1}; + Shape shape_r{2, 3, 4, 5}; + AxisSet axis{0, 1, 2, 3}; + broadcast_test_helper(shape_a, shape_r, axis); +} + NGRAPH_TEST(${BACKEND_NAME}, broadcast_matrix_0) { Shape shape_a{2, 2}; diff --git a/ngraph/test/backend/clamp.in.cpp b/ngraph/test/backend/clamp.in.cpp new file mode 100644 index 00000000000000..8f985f02af1597 --- /dev/null +++ b/ngraph/test/backend/clamp.in.cpp @@ -0,0 +1,418 @@ +//***************************************************************************** +// Copyright 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +namespace +{ + template + void clamp_test(const element::Type& type, + const PartialShape& dynamic_shape, + const Shape& static_shape, + const std::vector& input, + double min, + double max, + const std::vector& output) + { + auto data = make_shared(type, dynamic_shape); + auto clamp = make_shared(data, min, max); + auto function = make_shared(clamp, ParameterVector{data}); + + auto test_case = test::TestCase(function); + test_case.template add_input(static_shape, input); + test_case.template add_expected_output(static_shape, output); + return test_case.run(); + } +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_integral) +{ + Shape in_shape{6}; + element::Type et = element::i32; + + float min = 0.4; // ceiled to 1 + float max = 5.6; // floored to 5 + + auto input = make_shared(et, in_shape); + auto clamp = make_shared(input, min, max); + auto f = make_shared(clamp, ParameterVector{input}); + + vector in_vec{-1, 3, -10, 20, 6, 2}; + vector out_vec{1, 3, 1, 5, 5, 2}; + + auto test_case = test::TestCase(f); + test_case.add_input(in_shape, in_vec); + test_case.add_expected_output(in_shape, out_vec); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_integral_negative) +{ + Shape in_shape{6}; + element::Type et = element::i32; + + float min = -5.6; // ceiled to -5 + float max = -0.4; // floored to -1 + + auto input = make_shared(et, in_shape); + auto clamp = make_shared(input, min, max); + auto f = make_shared(clamp, ParameterVector{input}); + + vector in_vec{-6, 1, -2, 0, -1, 2}; + vector out_vec{-5, -1, -2, -1, -1, -1}; + + auto test_case = test::TestCase(f); + test_case.add_input(in_shape, in_vec); + test_case.add_expected_output(in_shape, 
out_vec); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_float) +{ + auto type = element::f32; + typedef float ctype; + + auto sshape = Shape{5, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; + + // static shape + clamp_test(type, + sshape, + sshape, + {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, + 0.2, + 0.6, + {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); + + clamp_test(type, + sshape, + sshape, + input, + 10.0, + 20.0, + {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); + + clamp_test(type, + sshape, + sshape, + input, + 10.0, + pinf, + {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); + + clamp_test(type, + sshape, + sshape, + input, + ninf, + 20.0, + {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_int8) +{ + auto type = element::i8; + typedef int8_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_int16) +{ + auto type = element::i16; + typedef int16_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_int32) +{ + auto type = element::i32; + typedef int32_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_int64) +{ + auto type = element::i64; + typedef int64_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 
20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_uint8) +{ + auto type = element::u8; + typedef uint8_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_uint16) +{ + auto type = element::u16; + typedef uint16_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_uint32) +{ + auto type = element::u32; + typedef uint32_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_uint64) +{ + auto type = element::u64; + typedef uint64_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (32 - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // static shape + clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 
10, 10, 11, 19, 20, 21}); + clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_float16) +{ + auto type = element::f16; + typedef float16 ctype; + + auto sshape = Shape{5, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; + + // static shape + clamp_test(type, + sshape, + sshape, + {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, + 0.2, + 0.6, + {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); + + clamp_test(type, + sshape, + sshape, + input, + 10.0, + 20.0, + {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); + + clamp_test(type, + sshape, + sshape, + input, + 10.0, + pinf, + {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); + + clamp_test(type, + sshape, + sshape, + input, + ninf, + 20.0, + {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); +} + +NGRAPH_TEST(${BACKEND_NAME}, clamp_bfloat16) +{ + auto type = element::bf16; + typedef bfloat16 ctype; + + auto sshape = Shape{5, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; + + // static shape + clamp_test(type, + sshape, + sshape, + {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, + 0.2, + 0.6, + {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); + + clamp_test(type, + sshape, + sshape, + input, + 10.0, + 20.0, + {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); + + clamp_test(type, + sshape, + sshape, + input, + 10.0, + pinf, + {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); + + clamp_test(type, + sshape, + sshape, + input, + ninf, + 20.0, + {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); +} diff --git a/ngraph/test/backend/fused_op.in.cpp b/ngraph/test/backend/fused_op.in.cpp index 2fc8f0a059b8b2..ff70b22e1b10d9 100644 --- a/ngraph/test/backend/fused_op.in.cpp +++ b/ngraph/test/backend/fused_op.in.cpp @@ -446,616 +446,6 @@ NGRAPH_TEST(${BACKEND_NAME}, DISABLED_normalize_across_chw_4d_max_bias) test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 1); } -namespace -{ - template - void clamp_test(const element::Type& type, - const PartialShape& dynamic_shape, - const Shape& static_shape, - const std::vector& input, - double min, - double max, - const std::vector& output) - { - auto data = make_shared(type, dynamic_shape); - auto clamp = make_shared(data, min, max); - auto function = make_shared(clamp, ParameterVector{data}); - - auto test_case = test::TestCase(function); - test_case.template add_input(static_shape, input); - test_case.template add_expected_output(static_shape, output); - return test_case.run(); - } -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_double) -{ - auto type = element::f64; - typedef double ctype; - - auto sshape = Shape{5, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 
19.999999, 20.0, 20.000001}; - - // static shape - clamp_test(type, - sshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test(type, - sshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - // dynamic shape - clamp_test( - type, - dshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test( - type, - dshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_float) -{ - auto type = element::f32; - typedef float ctype; - - auto sshape = Shape{5, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; - - // static shape - clamp_test(type, - sshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test(type, - sshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - // dynamic shape - clamp_test( - type, - dshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test( - type, - dshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_int8) -{ - auto type = element::i8; - typedef int8_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, 
sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_int16) -{ - auto type = element::i16; - typedef int16_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_int32) -{ - auto type = element::i32; - typedef int32_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_int64) -{ - auto type = element::i64; - typedef int64_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_uint8) -{ - auto type = element::u8; - typedef uint8_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - // TODO: Fix CPU DEX / MLIR correctness 
bug: using signed comparison for unsigned ints - // auto max = numeric_limits::max(); - // auto pinf = numeric_limits::infinity(); - ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; - auto pinf = static_cast(max); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_uint16) -{ - auto type = element::u16; - typedef uint16_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints - // auto max = numeric_limits::max(); - // auto pinf = numeric_limits::infinity(); - ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; - auto pinf = static_cast(max); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_uint32) -{ - auto type = element::u32; - typedef uint32_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints - // auto max = numeric_limits::max(); - // auto pinf = numeric_limits::infinity(); - ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; - auto pinf = static_cast(max); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_uint64) -{ - auto type = element::u64; - typedef uint64_t ctype; - - auto sshape = Shape{4, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison 
for unsigned ints - // auto max = numeric_limits::max(); - // auto pinf = numeric_limits::infinity(); - ctype max = (static_cast(1) << (32 - 1)) - 1; - auto pinf = static_cast(max); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, 9, 10, 11, 19, 20, 21}; - - // static shape - clamp_test(type, sshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test(type, sshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test(type, sshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); - - // dynamic shape - clamp_test( - type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); - clamp_test( - type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); - clamp_test( - type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_float16) -{ - auto type = element::f16; - typedef float16 ctype; - - auto sshape = Shape{5, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; - - // static shape - clamp_test(type, - sshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test(type, - sshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - // dynamic shape - clamp_test( - type, - dshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test( - type, - dshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); -} - -NGRAPH_TEST(${BACKEND_NAME}, fused_clamp_bfloat16) -{ - auto type = element::bf16; - typedef bfloat16 ctype; - - auto sshape = Shape{5, 2}; - auto dshape = PartialShape::dynamic(); - - auto min = numeric_limits::min(); - auto max = numeric_limits::max(); - auto pinf = numeric_limits::infinity(); - auto ninf = -numeric_limits::infinity(); - - vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; - - // static shape - clamp_test(type, - sshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test(type, - sshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test(type, - sshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 
10.000001, 19.999999, 20.0, 20.0}); - - // dynamic shape - clamp_test( - type, - dshape, - sshape, - {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, - 0.2, - 0.6, - {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - 20.0, - {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); - - clamp_test( - type, - dshape, - sshape, - input, - 10.0, - pinf, - {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); - - clamp_test( - type, - dshape, - sshape, - input, - ninf, - 20.0, - {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); -} - NGRAPH_TEST(${BACKEND_NAME}, mvn_mean_normalization) { Shape data_shape{1, 2, 5}; @@ -1366,7 +756,17 @@ NGRAPH_TEST(${BACKEND_NAME}, squeeze_dynamic) { const auto data_param = make_shared(element::f32, Shape{1, 4, 1, 1, 2}); const auto axes_param = make_shared(element::i64, Shape{2}); - EXPECT_THROW(make_shared(data_param, axes_param), CheckFailure); + + const auto squeeze = make_shared(data_param, axes_param); + + const auto function = make_shared(NodeVector{squeeze}, ParameterVector{data_param, axes_param}); + auto test_case = test::TestCase(function); + + const auto data = vector{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + test_case.add_input(data); + test_case.add_input(vector{0, 2}); + test_case.add_expected_output(Shape{4, 1, 2}, data); + test_case.run(); } // TODO: Issue: 37534 diff --git a/ngraph/test/backend/gather.in.cpp b/ngraph/test/backend/gather.in.cpp index dc0f07ff07a5f8..31a77d7d3168e2 100644 --- a/ngraph/test/backend/gather.in.cpp +++ b/ngraph/test/backend/gather.in.cpp @@ -450,11 +450,19 @@ NGRAPH_TEST(${BACKEND_NAME}, gather_axis_0_int32) auto A = op::Constant::create(element::i64, Shape{}, {0}); auto G = make_shared(P, I, A); auto f = make_shared(G, ParameterVector{P, I}); - + // clang-format off auto test_case = test::TestCase(f); - test_case.add_input({10, 11, 20, 21, 30, 31}); - test_case.add_input({0, 1, 1, 2}); - test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.add_input({10, 11, + 20, 21, + 30, 31}); + test_case.add_input({0, 1, + 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, + 20, 21, + + 20, 21, + 30, 31}); + // clang-format on test_case.run(MIN_FLOAT_TOLERANCE_BITS); } @@ -548,7 +556,560 @@ NGRAPH_TEST(${BACKEND_NAME}, gather_axis_0_uint64) test_case.run(MIN_FLOAT_TOLERANCE_BITS); } -NGRAPH_TEST(${BACKEND_NAME}, gather_axis_0_bool) +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_4d_indices_axis_0_uint8) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2, 3, 4}; + Shape out_shape{2, 2, 3, 4, 2}; + auto P = make_shared(element::u8, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, + 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, + 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2}); + test_case.add_expected_output( + out_shape, {10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, + 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31, + 10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, + 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 
30, 31, + 10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_4d_indices_axis_0_2d_input) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2, 3, 4}; + Shape out_shape{2, 2, 3, 4, 2}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + + // clang-format off + test_case.add_input({1.0f, 1.1f, + 2.0f, 2.1f, + 3.0f, 3.1f}); + + test_case.add_input({0, 1, 1, 2, + 0, 1, 1, 2, + 0, 1, 1, 2, + + 0, 1, 1, 2, + 0, 1, 1, 2, + 0, 1, 1, 2, + + + 0, 1, 1, 2, + 0, 1, 1, 2, + 0, 1, 1, 2, + + 0, 1, 1, 2, + 0, 1, 1, 2, + 0, 1, 1, 2}); + test_case.add_expected_output( + out_shape, + { 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_3d_indices_axis_0_2d_input) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 3, 4}; + Shape out_shape{2, 3, 4, 2}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + // clang-format off + test_case.add_input({1.0f, 1.1f, + 2.0f, 2.1f, + 3.0f, 3.1f}); + test_case.add_input( + {0, 1, 1, 2, + 0, 1, 1, 2, + 0, 1, 1, 2, + + 0, 1, 1, 2, + 0, 1, 1, 2, + 0, 1, 1, 2}); + test_case.add_expected_output( + out_shape, {1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f, + + 1.0f, 1.1f, + 2.0f, 2.1f, + 2.0f, 2.1f, + 3.0f, 3.1f}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_1d_int32) +{ + Shape data_shape{3}; + Shape indices_shape{2}; + Shape out_shape{2}; + auto P = make_shared(element::i32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + // clang-format off + test_case.add_input({1, 2, 3}); + test_case.add_input({2, 0}); + test_case.add_expected_output(out_shape, {3, 1}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_2d_indices_axis_0_2d_input) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + 
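(Illustrative aside on the opset7 Gather tests in this area: for axis 0 the operation is plain integer "fancy" indexing — each index picks a whole row of the data and the indices' shape is prepended to the remaining data dimensions — and v7 also accepts negative indices counted from the end of the axis, as the negative-indices test that follows exercises. A numpy sketch using the same sample values as these tests:)

import numpy as np

data = np.array([[1.0, 1.1],
                 [2.0, 2.1],
                 [3.0, 3.1]], dtype=np.float32)   # data_shape {3, 2}

indices = np.array([[0, 1],
                    [1, 2]])                      # indices_shape {2, 2}

out = data[indices]                               # axis-0 gather, out_shape {2, 2, 2}
print(out.shape)                                  # (2, 2, 2)

# negative indices count from the end of axis 0, so -2 picks the same
# row as 1 when the axis has length 3
neg = data[np.array([[0, -2],
                     [1, 2]])]
print(np.array_equal(out, neg))                   # True

# for an arbitrary axis the same result comes from np.take(data, indices, axis=axis)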
auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + // clang-format off + test_case.add_input({1.0f, 1.1f, + 2.0f, 2.1f, + 3.0f, 3.1f}); + // clang-format on + test_case.add_input({0, 1, 1, 2}); + // clang-format off + test_case.add_expected_output(out_shape, + {1.0f, 1.1f, + 2.0f, 2.1f, + + 2.0f, 2.1f, + 3.0f, 3.1f}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_2d_negative_and_positive_indices_axis_0_2d_input) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + + // clang-format off + test_case.add_input({1.0f, 1.1f, + 2.0f, 2.1f, + 3.0f, 3.1f}); + // clang-format on + + test_case.add_input({0, -2, 1, 2}); + + // clang-format off + test_case.add_expected_output(out_shape, + {1.0f, 1.1f, + 2.0f, 2.1f, + + 2.0f, 2.1f, + 3.0f, 3.1f}); + // clang-format on + + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_1d_indices_axis_0_1d_input) +{ + Shape data_shape{3}; + Shape indices_shape{2}; + Shape out_shape{2}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({1.0f, 2.0f, 3.0f}); + test_case.add_input({1, 0}); + test_case.add_expected_output(out_shape, {2.0f, 1.0f}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_scalar_indices_axis_0_2d_input) +{ + Shape data_shape{3, 2}; + Shape indices_shape{}; + Shape out_shape{2}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({1.0f, 1.1f, 2.0f, 2.1f, 3.0f, 3.1f}); + test_case.add_input({1}); + test_case.add_expected_output(out_shape, {2.0f, 2.1f}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_2d_indices_axis_1_2d_input) +{ + Shape data_shape{3, 3}; + Shape indices_shape{1, 2}; + Shape out_shape{3, 1, 2}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {1}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + + // clang-format off + test_case.add_input({1.0f, 1.1f, 1.2f, + 2.0f, 2.1f, 2.2f, + 3.0f, 3.1f, 3.2f}); + // clang-format on + test_case.add_input({0, 2}); + + // clang-format off + test_case.add_expected_output(out_shape, {1.0f, 1.2f, + 2.0f, 2.2f, + 3.0f, 3.2f}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_1d_indices_axis_2_4d_input) +{ + Shape data_shape{2, 2, 3, 
3}; + Shape indices_shape{2}; + Shape out_shape{2, 2, 2, 3}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {2}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + // clang-format off + test_case.add_input({ 1.0f, 1.1f, 1.2f, + 2.0f, 2.1f, 2.2f, + 3.0f, 3.1f, 3.2f, + + 11.0f, 11.1f, 11.2f, + 12.0f, 12.1f, 12.2f, + 13.0f, 13.1f, 13.2f, + + + 101.0f, 101.1f, 101.2f, + 102.0f, 102.1f, 102.2f, + 103.0f, 103.1f, 103.2f, + + 111.0f, 111.1f, 111.2f, + 112.0f, 112.1f, 112.2f, + 113.0f, 113.1f, 113.2f}); + // clang-format on + test_case.add_input({0, 2}); + // clang-format off + test_case.add_expected_output( + out_shape, { 1.0f, 1.1f, 1.2f, + 3.0f, 3.1f, 3.2f, + + 11.0f, 11.1f, 11.2f, + 13.0f, 13.1f, 13.2f, + + + 101.0f, 101.1f, 101.2f, + 103.0f, 103.1f, 103.2f, + + 111.0f, 111.1f, 111.2f, + 113.0f, 113.1f, 113.2f}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_scalar_indices_axis_1_2d_input) +{ + Shape data_shape{3, 3}; + Shape indices_shape{}; + Shape out_shape{3}; + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {1}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({1.0f, 1.1f, 1.2f, 2.0f, 2.1f, 2.2f, 3.0f, 3.1f, 3.2f}); + test_case.add_input({0}); + test_case.add_expected_output(out_shape, {1.0f, 2.0f, 3.0f}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_int8) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::i8, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_int16) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::i16, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_int32) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::i32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + // clang-format off + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, + 20, 21, + 30, 31}); + test_case.add_input({0, 1, + 1, 2}); + 
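// Note (added for clarity, not in the original patch): index rows {0, 1} and {1, 2} each select two rows of the 3x2 data; the expected output below stacks those rows. +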
test_case.add_expected_output(out_shape, {10, 11, + 20, 21, + + 20, 21, + 30, 31}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_int64) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::i64, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_uint8) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::u8, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_uint16) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::u16, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_uint32) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::u32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_uint64) +{ + Shape data_shape{3, 2}; + Shape indices_shape{2, 2}; + Shape out_shape{2, 2, 2}; + auto P = make_shared(element::u64, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(P, I, A); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + test_case.add_input({10, 11, 20, 21, 30, 31}); + test_case.add_input({0, 1, 1, 2}); + test_case.add_expected_output(out_shape, {10, 11, 20, 21, 20, 21, 30, 31}); + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_axis_0_bool) { Shape data_shape{3, 2}; Shape indices_shape{2, 2}; @@ -556,7 +1117,7 @@ NGRAPH_TEST(${BACKEND_NAME}, gather_axis_0_bool) auto P = 
make_shared(element::boolean, data_shape); auto I = make_shared(element::i64, indices_shape); auto A = op::Constant::create(element::i64, Shape{}, {0}); - auto G = make_shared(P, I, A); + auto G = make_shared(P, I, A); auto f = make_shared(G, ParameterVector{P, I}); auto test_case = test::TestCase(f); @@ -565,3 +1126,165 @@ NGRAPH_TEST(${BACKEND_NAME}, gather_axis_0_bool) test_case.add_expected_output(out_shape, {1, 1, 1, 0, 1, 0, 0, 1}); test_case.run(MIN_FLOAT_TOLERANCE_BITS); } + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_data_int32_3d_indices_axis_1_batch_dims_1) +{ + Shape data_shape{2, 3}; + Shape indices_shape{2, 2, 2}; + Shape out_shape{2, 2, 2}; + int64_t batch_dims = 1; + int64_t axis = 1; + + auto P = make_shared(element::i32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {axis}); + auto G = make_shared(P, I, A, batch_dims); + auto f = make_shared(G, ParameterVector{P, I}); + + // clang-format off + auto test_case = test::TestCase(f); + test_case.add_input({1, 2, 3, // batch 0 + 4, 5, 6}); // batch 1 + + test_case.add_input({0, 1, // batch 0 + 1, 2, + + 2, 0, // batch 1 + 1, 2}); + test_case.add_expected_output(out_shape, {1, 2, // batch 1 + 2, 3, + + 6, 4, // batch 1 + 5, 6}); + test_case.run(); + // clang-format on +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_data_int32_2d_indices_axis_1_batch_dims_1) +{ + Shape data_shape{2, 5}; + Shape indices_shape{2, 3}; + Shape out_shape{2, 3}; + int64_t batch_dims = 1; + int64_t axis = 1; + + auto P = make_shared(element::i32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {axis}); + auto G = make_shared(P, I, A, batch_dims); + auto f = make_shared(G, ParameterVector{P, I}); + + // clang-format off + auto test_case = test::TestCase(f); + test_case.add_input({1, 2, 3, 4, 5, // batch 0 + 6, 7, 8, 9, 10}); // batch 1 + + test_case.add_input({0, 0, 4, // batch 0 + 4, 0, 0}); // batch 1 + test_case.add_expected_output(out_shape, {1, 1, 5, // batch 0 + 10, 6, 6}); // batch 1 + test_case.run(); + // clang-format on +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_4d_data_axis_2_batch_dims_1_int32) +{ + Shape data_shape{2, 1, 5, 4}; + Shape indices_shape{2, 3}; + Shape out_shape{2, 1, 3, 4}; + int64_t batch_dims = 1; + int64_t axis = 2; + + auto P = make_shared(element::i32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {axis}); + auto G = make_shared(P, I, A, batch_dims); + auto f = make_shared(G, ParameterVector{P, I}); + + // clang-format off + auto test_case = test::TestCase(f); + test_case.add_input({ + 1, 2, 3, 4, // first batch + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16, + 17, 18, 19, 20, + + 21, 22, 23, 24, // second batch + 25, 26, 27, 28, + 29, 30, 31, 32, + 33, 34, 35, 36, + 37, 38, 39, 40 + }); + + test_case.add_input({ + 1, 2, 4, // first batch + 4, 3, 2 // second batch + }); + test_case.add_expected_output(out_shape, { + 5, 6, 7, 8, + 9, 10, 11, 12, + 17, 18, 19, 20, + + 37, 38, 39, 40, + 33, 34, 35, 36, + 29, 30, 31, 32 + }); + test_case.run(); + // clang-format on +} + +NGRAPH_TEST(${BACKEND_NAME}, gather_v7_3d_indices_axis_1_batch_dims_1) +{ + Shape data_shape{2, 5, 2}; + Shape indices_shape{2, 2, 3}; + Shape out_shape{2, 2, 3, 2}; + int64_t batch_dims = 1; + int64_t axis = 1; + + auto P = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = 
op::Constant::create(element::i64, Shape{}, {axis}); + + auto G = make_shared(P, I, A, batch_dims); + auto f = make_shared(G, ParameterVector{P, I}); + + auto test_case = test::TestCase(f); + + // clang-format off + test_case.add_input({1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f, + 9.0f, 10.0f, + + 11.0f, 12.0f, + 13.0f, 14.0f, + 15.0f, 16.0f, + 17.0f, 18.0f, + 19.0f, 20.0f}); + + test_case.add_input({0, 0, 4, + 4, 0, 0, + + 1, 2, 4, + 4, 3, 2}); + test_case.add_expected_output({1.0f, 2.0f, + 1.0f, 2.0f, + 9.0f, 10.0f, + + 9.0f, 10.0f, + 1.0f, 2.0f, + 1.0f, 2.0f, + + + 13.0f, 14.0f, + 15.0f, 16.0f, + 19.0f, 20.0f, + + 19.0f, 20.0f, + 17.0f, 18.0f, + 15.0f, 16.0f}); + // clang-format on + test_case.run(MIN_FLOAT_TOLERANCE_BITS); +} diff --git a/ngraph/test/backend/mod.in.cpp b/ngraph/test/backend/mod.in.cpp new file mode 100644 index 00000000000000..d3d800e61844da --- /dev/null +++ b/ngraph/test/backend/mod.in.cpp @@ -0,0 +1,149 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include + +// clang-format off +#ifdef ${BACKEND_NAME}_FLOAT_TOLERANCE_BITS +#define DEFAULT_FLOAT_TOLERANCE_BITS ${BACKEND_NAME}_FLOAT_TOLERANCE_BITS +#endif + +#ifdef ${BACKEND_NAME}_DOUBLE_TOLERANCE_BITS +#define DEFAULT_DOUBLE_TOLERANCE_BITS ${BACKEND_NAME}_DOUBLE_TOLERANCE_BITS +#endif +// clang-format on + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, mod_no_broadcast) +{ + Shape shape{1, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), ParameterVector{A, B}); + + vector a{256, 56}; + vector b{256, 56}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape, {0, 0}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, mod_no_broadcast_remainder) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), ParameterVector{A, B}); + + vector a{256, 56, 21, 14}; + vector b{112, 56, 6, 8}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape, {32, 0, 3, 6}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, mod_broadcast) +{ + Shape shape_a{1, 2}; + Shape shape_b{3, 2, 2}; + auto A = make_shared(element::f32, shape_a); + auto B = make_shared(element::f32, shape_b); + auto f = make_shared(make_shared(A, B), ParameterVector{A, B}); + + vector a{1, 2}; + vector b{5, 6, 7, 8, 2, 3, 1, 5, 6, 7, 1, 3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape_b, {1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 0, 2}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, mod_scalars) +{ + Shape shape{}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), ParameterVector{A, B}); + + vector a{57}; + vector b{13}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape, {5}); + test_case.run(); +} + 
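+// Note (added for clarity, not in the original patch): the negative-value cases below assume
+// truncated-division semantics, i.e. the remainder keeps the sign of the dividend, as std::fmod does:
+// -57 mod 13 == -5, -14 mod -7 == 0, -12 mod 5 == -2, -6 mod -5 == -1.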
+NGRAPH_TEST(${BACKEND_NAME}, mod_negative_numbers) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), ParameterVector{A, B}); + + vector a{-57, -14, -12, -6}; + vector b{13, -7, 5, -5}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape, {-5, 0, -2, -1}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, mod_vector_and_scalar) +{ + Shape shape_a{2, 2}; + Shape shape_b{}; + auto A = make_shared(element::f32, shape_a); + auto B = make_shared(element::f32, shape_b); + auto f = make_shared(make_shared(A, B), ParameterVector{A, B}); + + vector a{2, 4, 7, 8}; + vector b{8}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape_a, {2, 4, 7, 0}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, mod_in_place) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto T = make_shared(A, B); + auto T2 = make_shared(T, T); + + auto f = make_shared(T2, ParameterVector{A, B}); + + vector a{1, 2, 3, 4}; + vector b{5, 6, 7, 8}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({a, b}); + test_case.add_expected_output(shape, {0, 0 ,0 ,0}); + test_case.run(); +} \ No newline at end of file diff --git a/ngraph/test/backend/roll.in.cpp b/ngraph/test/backend/roll.in.cpp new file mode 100644 index 00000000000000..ed49ea09fca606 --- /dev/null +++ b/ngraph/test/backend/roll.in.cpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright 2017-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "ngraph/opsets/opset7.hpp" +#include "ngraph/runtime/tensor.hpp" +#include "ngraph/shape.hpp" +#include "runtime/backend.hpp" +#include "util/all_close.hpp" +#include "util/all_close_f.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; + +NGRAPH_TEST(${BACKEND_NAME}, roll_2d_input) +{ + Shape shape{4, 3}; + auto x = make_shared(element::f32, shape); + auto shift = make_shared(element::i64, Shape{1}, vector{1}); + auto axes = make_shared(element::i64, Shape{1}, vector{0}); + auto f = make_shared(make_shared(x, shift, axes), ParameterVector{x}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto x_tensor = backend->create_tensor(element::f32, shape); + copy_data(x_tensor, + vector{50.2907, + 70.8054, + -68.3403, + 62.6444, + 4.9748, + -18.5551, + 40.5383, + -15.3859, + -4.5881, + -43.3479, + 94.1676, + -95.7097}); + auto result = backend->create_tensor(element::f32, shape); + + auto handle = backend->compile(f); + handle->call_with_validate({result}, {x_tensor}); + EXPECT_TRUE(test::all_close_f((vector{-43.3479, + 94.1676, + -95.7097, + 50.2907, + 70.8054, + -68.3403, + 62.6444, + 4.9748, + -18.5551, + 40.5383, + -15.3859, + -4.5881}), + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, roll_2d_input_negative_shift) +{ + Shape shape{4, 3}; + auto x = make_shared(element::f32, shape); + auto shift = make_shared(element::i32, Shape{2}, vector{-1, 2}); + auto axes = make_shared(element::i32, Shape{2}, vector{0, 1}); + auto f = make_shared(make_shared(x, shift, axes), ParameterVector{x}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto x_tensor = backend->create_tensor(element::f32, shape); + copy_data(x_tensor, + vector{50.2907, + 70.8054, + -68.3403, + 62.6444, + 4.9748, + -18.5551, + 40.5383, + -15.3859, + -4.5881, + -43.3479, + 94.1676, + -95.7097}); + auto result = backend->create_tensor(element::f32, shape); + + auto handle = backend->compile(f); + handle->call_with_validate({result}, {x_tensor}); + EXPECT_TRUE(test::all_close_f((vector{4.9748, + -18.5551, + 62.6444, + -15.3859, + -4.5881, + 40.5383, + 94.1676, + -95.7097, + -43.3479, + 70.8054, + -68.3403, + 50.2907}), + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, roll_repeated_axes) +{ + Shape shape{4, 3}; + auto x = make_shared(element::i64, shape); + auto shift = make_shared(element::i64, Shape{3}, vector{1, 2, 1}); + auto axes = make_shared(element::i64, Shape{3}, vector{0, 1, 0}); + auto f = make_shared(make_shared(x, shift, axes), ParameterVector{x}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto x_tensor = backend->create_tensor(element::i64, shape); + copy_data(x_tensor, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + auto result = backend->create_tensor(element::i64, shape); + + auto handle = backend->compile(f); + handle->call_with_validate({result}, {x_tensor}); + EXPECT_TRUE(test::all_close((vector{8, 9, 7, 11, 12, 10, 2, 3, 1, 5, 6, 4}), + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, roll_3d_input) +{ + Shape shape{4, 2, 3}; + auto x = make_shared(element::f32, shape); + auto shift = make_shared(element::i64, Shape{3}, vector{2, 1, 3}); + auto axes = make_shared(element::i64, Shape{3}, vector{0, 1, 2}); + auto f = make_shared(make_shared(x, shift, axes), ParameterVector{x}); + + auto 
backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x_tensor = backend->create_tensor(element::f32, shape); + copy_data(x_tensor, vector{94.0773, 33.0599, 58.1724, -20.3640, 54.5372, -54.3023, + 10.4662, 11.7532, -11.7692, 56.4223, -95.3774, 8.8978, + 1.9305, 13.8025, 12.0827, 81.4669, 19.5321, -8.9553, + -75.3226, 20.8033, 20.7660, 62.7361, 14.9372, -33.0825}); + auto result = backend->create_tensor(element::f32, shape); + auto handle = backend->compile(f); + handle->call_with_validate({result}, {x_tensor}); + EXPECT_TRUE(test::all_close_f( + (vector{81.4669, 19.5321, -8.9553, 1.9305, 13.8025, 12.0827, + 62.7361, 14.9372, -33.0825, -75.3226, 20.8033, 20.7660, + -20.3640, 54.5372, -54.3023, 94.0773, 33.0599, 58.1724, + 56.4223, -95.3774, 8.8978, 10.4662, 11.7532, -11.7692}), + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, roll_3d_input_negative_shift) +{ + Shape shape{4, 2, 3}; + auto x = make_shared(element::f32, shape); + auto shift = make_shared(element::i32, Shape{3}, vector{-5, 1, 3}); + auto axes = make_shared(element::i64, Shape{3}, vector{0, 1, 1}); + auto f = make_shared(make_shared(x, shift, axes), ParameterVector{x}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x_tensor = backend->create_tensor(element::f32, shape); + copy_data(x_tensor, vector{94.0773, 33.0599, 58.1724, -20.3640, 54.5372, -54.3023, + 10.4662, 11.7532, -11.7692, 56.4223, -95.3774, 8.8978, + 1.9305, 13.8025, 12.0827, 81.4669, 19.5321, -8.9553, + -75.3226, 20.8033, 20.7660, 62.7361, 14.9372, -33.0825}); + auto result = backend->create_tensor(element::f32, shape); + auto handle = backend->compile(f); + handle->call_with_validate({result}, {x_tensor}); + EXPECT_TRUE(test::all_close_f( + (vector{10.4662, 11.7532, -11.7692, 56.4223, -95.3774, 8.8978, + 1.9305, 13.8025, 12.0827, 81.4669, 19.5321, -8.9553, + -75.3226, 20.8033, 20.7660, 62.7361, 14.9372, -33.0825, + 94.0773, 33.0599, 58.1724, -20.3640, 54.5372, -54.3023}), + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, roll_negative_axes) +{ + Shape shape{4, 2, 3}; + auto x = make_shared(element::i32, shape); + auto shift = make_shared(element::i64, Shape{3}, vector{2, -1, -7}); + auto axes = make_shared(element::i32, Shape{3}, vector{-1, -1, -2}); + auto f = make_shared(make_shared(x, shift, axes), ParameterVector{x}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x_tensor = backend->create_tensor(element::i32, shape); + copy_data(x_tensor, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}); + auto result = backend->create_tensor(element::i32, shape); + auto handle = backend->compile(f); + handle->call_with_validate({result}, {x_tensor}); + EXPECT_TRUE(test::all_close((vector{6, 4, 5, 3, 1, 2, 12, 10, 11, 9, 7, 8, + 18, 16, 17, 15, 13, 14, 24, 22, 23, 21, 19, 20}), + read_vector(result))); +} diff --git a/ngraph/test/backend/scatter_nd_update.in.cpp b/ngraph/test/backend/scatter_nd_update.in.cpp new file mode 100644 index 00000000000000..e939d31c8c78e7 --- /dev/null +++ b/ngraph/test/backend/scatter_nd_update.in.cpp @@ -0,0 +1,473 @@ +//***************************************************************************** +// Copyright 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "gtest/gtest.h" + +#include +#include + +#include "ngraph/opsets/opset7.hpp" +#include "ngraph/type/element_type.hpp" + +#include "runtime/backend.hpp" + +#include "util/all_close_f.hpp" +#include "util/ndarray.hpp" +#include "util/test_control.hpp" +#include "util/test_tools.hpp" + +static std::string s_manifest = "${MANIFEST}"; + +namespace +{ + template + struct Array + { + using StorageType = ngraph::test::NDArrayBase; + static ngraph::element::Type element_type() { return ngraph::element::from(); } + StorageType data; + }; + struct Params + { + Array input; + Array indices; + Array updates; + Array expected_output; + }; + + void execute_test(const Params& p) + { + using namespace ngraph; + using namespace opset7; + + auto inputs = std::make_shared(p.input.element_type(), p.input.data.get_shape()); + auto indices = Constant::create( + p.indices.element_type(), p.indices.data.get_shape(), p.indices.data.get_vector()); + auto updates = Constant::create( + p.updates.element_type(), p.updates.data.get_shape(), p.updates.data.get_vector()); + + auto scatter = std::make_shared(inputs, indices, updates); + + auto function = + std::make_shared(scatter, ParameterVector{inputs}, "ScatterNDUpdate"); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto inputs_tensor = + backend->create_tensor(p.input.element_type(), p.input.data.get_shape()); + copy_data(inputs_tensor, p.input.data.get_vector()); + + auto result = + backend->create_tensor(p.input.element_type(), p.expected_output.data.get_shape()); + + auto handle = backend->compile(function); + handle->call_with_validate({result}, {inputs_tensor}); + + EXPECT_TRUE(test::all_close_f(p.expected_output.data.get_vector(), + read_vector(result), + MIN_FLOAT_TOLERANCE_BITS)); + } +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_1x1) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{1}, + NDArray{{0}}, + NDArray{20}, + NDArray{20}}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_2x2_by_1) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + {1, 2}, + {3, 4}, + }, + NDArray{{1}, {0}}, + NDArray{{10, 20}, {30, 40}}, + NDArray{ + {30, 40}, + {10, 20}, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_2x2_by_2) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + {1, 2}, + {3, 4}, + }, + NDArray{ + {0, 0}, + {1, 1}, + }, + NDArray{10, 40}, + NDArray{ + {10, 2}, + {3, 40}, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_3x3_by_1) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + { + {11, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {21, 22, 23}, + {24, 25, 26}, + {27, 28, 29}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {37, 38, 39}, + }, + }, + NDArray{{0}, {2}}, + NDArray{ + { + {91, 92, 93}, + {94, 95, 96}, + {97, 98, 99}, + }, + { + {81, 82, 83}, + {84, 85, 86}, + {87, 88, 89}, + }, + }, + NDArray{ + { + {91, 92, 93}, + {94, 95, 96}, + {97, 98, 99}, + }, + { + {21, 22, 23}, + {24, 25, 
26}, + {27, 28, 29}, + }, + { + {81, 82, 83}, + {84, 85, 86}, + {87, 88, 89}, + }, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_3x3_by_2v2) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + { + {11, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {21, 22, 23}, + {24, 25, 26}, + {27, 28, 29}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {37, 38, 39}, + }, + }, + NDArray{ + { + {0, 0, 0}, + {2, 2, 2}, + }, + { + {1, 0, 0}, + {1, 2, 2}, + }, + }, + NDArray{ + {91, 92}, + {81, 82}, + }, + NDArray{ + { + {91, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {81, 22, 23}, + {24, 25, 26}, + {27, 28, 82}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {37, 38, 92}, + }, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_3x3_by_2) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + { + {11, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {21, 22, 23}, + {24, 25, 26}, + {27, 28, 29}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {37, 38, 39}, + }, + }, + NDArray{{0, 0}, {2, 2}}, + NDArray{ + {91, 92, 93}, + {87, 88, 89}, + }, + NDArray{ + { + {91, 92, 93}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {21, 22, 23}, + {24, 25, 26}, + {27, 28, 29}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {87, 88, 89}, + }, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_3x3_by_3) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + { + {11, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {21, 22, 23}, + {24, 25, 26}, + {27, 28, 29}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {37, 38, 39}, + }, + }, + NDArray{{0, 0, 0}, {2, 2, 2}}, + NDArray{91, 99}, + NDArray{ + { + {91, 12, 13}, + {14, 15, 16}, + {17, 18, 19}, + }, + { + {21, 22, 23}, + {24, 25, 26}, + {27, 28, 29}, + }, + { + {31, 32, 33}, + {34, 35, 36}, + {37, 38, 99}, + }, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_1d_from_examples) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{1, 2, 3, 4, 5, 6, 7, 8}, + NDArray{{4}, {3}, {1}, {7}}, + NDArray{9, 10, 11, 12}, + NDArray{1, 11, 3, 10, 9, 6, 7, 12}}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_4x4_shape_from_examples) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{{ + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + }, + { + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {1, 2, 3, 4}, + {5, 6, 7, 8}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {1, 2, 3, 4}, + {5, 6, 7, 8}, + }}, + NDArray{{0}, {2}}, + NDArray{ + { + {5, 5, 5, 5}, + {6, 6, 6, 6}, + {7, 7, 7, 7}, + {8, 8, 8, 8}, + }, + { + {1, 1, 1, 1}, + {2, 2, 2, 2}, + {3, 3, 3, 3}, + {4, 4, 4, 4}, + }, + }, + NDArray{ + { + {5, 5, 5, 5}, + {6, 6, 6, 6}, + {7, 7, 7, 7}, + {8, 8, 8, 8}, + }, + { + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + }, + { + {1, 1, 1, 1}, + {2, 2, 2, 2}, + {3, 3, 3, 3}, + {4, 4, 4, 4}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {1, 2, 3, 4}, + {5, 6, 7, 8}, + }, + }}); +} + +NGRAPH_TEST(${BACKEND_NAME}, scatter_nd_update_4x4_v2) +{ + using namespace ngraph::test; + execute_test(Params{NDArray{ + { + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + }, + { + {1, 2, 3, 4}, + {5, 6, 7, 8}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {1, 2, 3, 4}, + {5, 6, 7, 8}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {1, 2, 3, 4}, + {5, 6, 7, 8}, + }, + }, + NDArray{ + { + {0, 0}, + {2, 2}, + }, + { + {1, 1}, + {3, 3}, + }, + }, + NDArray{ + { + {15, 16, 17, 18}, + {25, 
26, 27, 28}, + }, + { + {35, 36, 37, 38}, + {45, 46, 47, 58}, + }, + }, + NDArray{ + { + {15, 16, 17, 18}, + {5, 6, 7, 8}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + + }, + { + {1, 2, 3, 4}, + {35, 36, 37, 38}, + {8, 7, 6, 5}, + {4, 3, 2, 1}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {25, 26, 27, 28}, + {5, 6, 7, 8}, + }, + { + {8, 7, 6, 5}, + {4, 3, 2, 1}, + {1, 2, 3, 4}, + {45, 46, 47, 58}, + }, + }}); +} diff --git a/ngraph/test/constant_folding.cpp b/ngraph/test/constant_folding.cpp index b416938ef9e64e..c34dcb12c6e0b7 100644 --- a/ngraph/test/constant_folding.cpp +++ b/ngraph/test/constant_folding.cpp @@ -1917,6 +1917,276 @@ TEST(constant_folding, const_gather_v1_subgraph_skip_if_not_single_input) ASSERT_EQ(count_ops_of_type(f), 1); } +TEST(constant_folding, const_gather_v7) +{ + auto constant_data = op::Constant::create( + element::f32, + Shape{2, 5}, + vector{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}); + auto constant_indices = + op::Constant::create(element::i64, Shape{4}, vector{0, 3, 2, 2}); + auto constant_axis = op::Constant::create(element::i64, Shape{1}, vector{1}); + auto gather = make_shared(constant_data, constant_indices, constant_axis); + gather->set_friendly_name("test"); + auto f = make_shared(gather, ParameterVector{}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); + + auto new_const = + as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); + ASSERT_TRUE(new_const); + ASSERT_EQ(new_const->get_friendly_name(), "test"); + auto values_out = new_const->get_vector(); + + vector values_expected{1.0f, 4.0f, 3.0f, 3.0f, 6.0f, 9.0f, 8.0f, 8.0f}; + + ASSERT_TRUE(test::all_close_f(values_out, values_expected, MIN_FLOAT_TOLERANCE_BITS)); +} + +TEST(constant_folding, const_gather_v7_scalar) +{ + auto constant_data = op::Constant::create( + element::f32, + Shape{2, 5}, + vector{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}); + auto constant_indices = + op::Constant::create(element::i64, Shape{4}, vector{0, 3, 2, 2}); + auto constant_axis = op::Constant::create(element::i64, Shape{}, vector{1}); + auto gather = make_shared(constant_data, constant_indices, constant_axis); + gather->set_friendly_name("test"); + auto f = make_shared(gather, ParameterVector{}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); + + auto new_const = + as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); + ASSERT_TRUE(new_const); + ASSERT_EQ(new_const->get_friendly_name(), "test"); + auto values_out = new_const->get_vector(); + + vector values_expected{1.0f, 4.0f, 3.0f, 3.0f, 6.0f, 9.0f, 8.0f, 8.0f}; + + ASSERT_TRUE(test::all_close_f(values_out, values_expected, MIN_FLOAT_TOLERANCE_BITS)); +} + +TEST(constant_folding, const_gather_v7_subgraph) +{ + const auto A = make_shared(element::f32, Shape{1}); + const float b_value = 3.21f; + const auto B_const = op::Constant::create(element::f32, {1}, {b_value}); + const auto C = make_shared(element::f32, Shape{1}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B_const, C}, axis); + + const vector indices{1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, 
indices_const, axis_const); + gather->set_friendly_name("test"); + auto f = make_shared(gather, ParameterVector{A, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); + + const auto new_const = + as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); + ASSERT_TRUE(new_const); + ASSERT_EQ(new_const->get_friendly_name(), "test"); + + const auto values_out = new_const->get_vector(); + ASSERT_TRUE(test::all_close_f(values_out, {b_value}, MIN_FLOAT_TOLERANCE_BITS)); +} + +TEST(constant_folding, const_gather_v7_subgraph_neg_axis) +{ + const auto A = make_shared(element::f32, Shape{1}); + const float b_value = 1.23f; + const auto B = make_shared(element::f32, Shape{1}); + const auto C_const = op::Constant::create(element::f32, {1}, {b_value}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C_const}, axis); + + const vector indices{-1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + gather->set_friendly_name("test"); + auto f = make_shared(gather, ParameterVector{A, B}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); + + const auto new_const = + as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); + ASSERT_TRUE(new_const); + ASSERT_EQ(new_const->get_friendly_name(), "test"); + + const auto values_out = new_const->get_vector(); + ASSERT_TRUE(test::all_close_f(values_out, {b_value}, MIN_FLOAT_TOLERANCE_BITS)); +} + +TEST(constant_folding, const_gather_v7_subgraph_no_constant_input) +{ + const auto A = make_shared(element::f32, Shape{1}); + const auto B = make_shared(element::f32, Shape{1}); + const auto C = make_shared(element::f32, Shape{1}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C}, axis); + + const vector indices{1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + gather->set_friendly_name("test"); + auto f = make_shared(gather, ParameterVector{A, B, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 0); +} + +TEST(constant_folding, const_gather_v7_subgraph_no_constant_input_scalar) +{ + const auto A = make_shared(element::f32, Shape{1}); + const auto B = make_shared(element::f32, Shape{1}); + const auto C = make_shared(element::f32, Shape{1}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C}, axis); + + const vector indices{1}; + const auto indices_const = op::Constant::create(element::i64, {}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + auto f = make_shared(gather, ParameterVector{A, B, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 0); + 
ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); +} + +TEST(constant_folding, const_gather_v7_subgraph_skip_if_non_zero_axis) +{ + const auto A = make_shared(element::f32, Shape{2, 2}); + const auto B = make_shared(element::f32, Shape{2, 2}); + const auto C = make_shared(element::f32, Shape{2, 2}); + const int64_t axis = 1; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C}, axis); + + const vector indices{1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + auto f = make_shared(gather, ParameterVector{A, B, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 1); + ASSERT_EQ(count_ops_of_type(f), 1); +} + +TEST(constant_folding, const_gather_v7_subgraph_skip_if_non_single_indices) +{ + const auto A = make_shared(element::f32, Shape{1}); + const auto B = make_shared(element::f32, Shape{1}); + const auto C = make_shared(element::f32, Shape{1}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C}, axis); + + const vector indices{0, 1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + auto f = make_shared(gather, ParameterVector{A, B, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 1); + ASSERT_EQ(count_ops_of_type(f), 1); +} + +TEST(constant_folding, const_gather_v7_subgraph_skip_if_concat_output_shape_dynamic) +{ + const auto A = make_shared(element::f32, PartialShape::dynamic()); + const auto B = make_shared(element::f32, Shape{1}); + const auto C = make_shared(element::f32, Shape{1}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C}, axis); + + const vector indices{1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + auto f = make_shared(gather, ParameterVector{A, B, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 1); + ASSERT_EQ(count_ops_of_type(f), 1); +} + +TEST(constant_folding, const_gather_v7_subgraph_skip_if_not_single_input) +{ + const auto A = make_shared(element::f32, Shape{2}); + const auto B = make_shared(element::f32, Shape{1}); + const auto C = make_shared(element::f32, Shape{1}); + const int64_t axis = 0; + const auto axis_const = op::Constant::create(element::i64, {}, {axis}); + + const auto concat = make_shared(NodeVector{A, B, C}, axis); + + const vector indices{1}; + const auto indices_const = op::Constant::create(element::i64, {indices.size()}, indices); + const auto gather = make_shared(concat, indices_const, axis_const); + auto f = make_shared(gather, ParameterVector{A, B, C}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + ASSERT_EQ(count_ops_of_type(f), 1); + ASSERT_EQ(count_ops_of_type(f), 1); +} + TEST(constant_folding, const_strided_slice) { Shape shape_in{16}; diff --git a/ngraph/test/eval.cpp 
b/ngraph/test/eval.cpp index 0a06de330b8a44..e37a2cad95b0d6 100644 --- a/ngraph/test/eval.cpp +++ b/ngraph/test/eval.cpp @@ -1197,7 +1197,7 @@ TEST(eval, evaluate_logical_not) ASSERT_EQ(result_val, expec); } -TEST(eval, evaluate_dynamic_gather) +TEST(eval, evaluate_dynamic_gather_v1) { auto arg1 = make_shared(element::f32, PartialShape::dynamic()); auto arg2 = make_shared(element::i32, PartialShape::dynamic()); @@ -1216,7 +1216,7 @@ TEST(eval, evaluate_dynamic_gather) ASSERT_EQ(cval, out); } -TEST(eval, evaluate_dynamic_axis_gather) +TEST(eval, evaluate_dynamic_gather_v1_scalar_axis) { auto arg1 = make_shared(element::f32, PartialShape::dynamic()); auto arg2 = make_shared(element::i32, PartialShape::dynamic()); @@ -1236,6 +1236,49 @@ TEST(eval, evaluate_dynamic_axis_gather) ASSERT_EQ(cval, out); } +TEST(eval, evaluate_dynamic_gather_v7) +{ + auto arg1 = make_shared(element::f32, PartialShape::dynamic()); + auto arg2 = make_shared(element::i32, PartialShape::dynamic()); + auto arg3 = make_shared(element::i32, PartialShape::dynamic()); + int64_t batch_dims = 1; + int32_t axis = 1; + auto gather = make_shared(arg1, arg2, arg3, batch_dims); + auto fun = make_shared(OutputVector{gather}, ParameterVector{arg1, arg2, arg3}); + auto result_tensor = make_shared(); + ASSERT_TRUE(fun->evaluate({result_tensor}, + {make_host_tensor({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}), + make_host_tensor({2, 2}, {1, 0, 1, 0}), + make_host_tensor({1}, {axis})})); + EXPECT_EQ(result_tensor->get_element_type(), element::f32); + EXPECT_EQ(result_tensor->get_partial_shape(), (PartialShape{2, 2})); + auto cval = read_vector(result_tensor); + vector out{2.0f, 1.0f, 5.0f, 4.0f}; + ASSERT_EQ(cval, out); +} + +TEST(eval, evaluate_dynamic_gather_v7_axis_scalar) +{ + auto arg1 = make_shared(element::f32, PartialShape::dynamic()); + auto arg2 = make_shared(element::i32, PartialShape::dynamic()); + auto arg3 = make_shared(element::i64, PartialShape::dynamic()); + int64_t batch_dims = 0; + int64_t axis = 1; + auto gather = make_shared(arg1, arg2, arg3, batch_dims); + auto fun = make_shared(OutputVector{gather}, ParameterVector{arg1, arg2, arg3}); + auto result_tensor = make_shared(); + ASSERT_TRUE(fun->evaluate({result_tensor}, + {make_host_tensor( + {3, 3}, {1.0f, 1.1f, 1.2f, 2.0f, 2.1f, 2.2f, 3.0f, 3.1f, 3.2f}), + make_host_tensor({1, 2}, {0, 2}), + make_host_tensor({}, {axis})})); + EXPECT_EQ(result_tensor->get_element_type(), element::f32); + EXPECT_EQ(result_tensor->get_partial_shape(), (PartialShape{3, 1, 2})); + auto cval = read_vector(result_tensor); + vector out{1.0f, 1.2f, 2.0f, 2.2f, 3.0f, 3.2f}; + ASSERT_EQ(cval, out); +} + TEST(eval, evaluate_dynamic_concat) { auto arg1 = make_shared(element::f32, PartialShape::dynamic()); diff --git a/ngraph/test/models/ir/add_abc.bin b/ngraph/test/models/ir/add_abc.bin new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/ngraph/test/models/ir/add_abc.xml b/ngraph/test/models/ir/add_abc.xml new file mode 100644 index 00000000000000..bb9c0b07c6fe6c --- /dev/null +++ b/ngraph/test/models/ir/add_abc.xml @@ -0,0 +1,75 @@ + + + + + + + + 1 + + + + + + + + 1 + + + + + + + + 1 + + + 1 + + + + + 1 + + + + + + + + 1 + + + + + + + + 1 + + + 1 + + + + + 1 + + + + + + + 1 + + + + + + + + + + + + diff --git a/ngraph/test/models/ir/weights/add_abc.bin b/ngraph/test/models/ir/weights/add_abc.bin new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/ngraph/test/models/onnx/instance_norm.prototxt b/ngraph/test/models/onnx/instance_norm.prototxt index 
3775d957c4442a..6e446c95e000a0 100644 --- a/ngraph/test/models/onnx/instance_norm.prototxt +++ b/ngraph/test/models/onnx/instance_norm.prototxt @@ -59,7 +59,6 @@ graph { dim { dim_value: 2 } - } } } diff --git a/ngraph/test/models/onnx/mod_incorrect_fmod.prototxt b/ngraph/test/models/onnx/mod_incorrect_fmod.prototxt new file mode 100644 index 00000000000000..86305d5bd90b5b --- /dev/null +++ b/ngraph/test/models/onnx/mod_incorrect_fmod.prototxt @@ -0,0 +1,58 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + attribute { + name: "fmod" + i: 2 + type: INT + } + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/mod_sign.prototxt b/ngraph/test/models/onnx/mod_sign.prototxt index 3922213e957faf..50be1dfb479d4b 100644 --- a/ngraph/test/models/onnx/mod_sign.prototxt +++ b/ngraph/test/models/onnx/mod_sign.prototxt @@ -6,18 +6,13 @@ graph { input: "B" output: "Y" op_type: "Mod" - attribute { - name: "fmod" - i: 1 - type: INT - } } name: "test_mod" input { name: "A" type { tensor_type { - elem_type: 7 + elem_type: 6 shape { dim { dim_value: 6 @@ -26,11 +21,11 @@ graph { } } } - input { + input { name: "B" type { tensor_type { - elem_type: 7 + elem_type: 6 shape { dim { dim_value: 6 @@ -43,7 +38,7 @@ graph { name: "Y" type { tensor_type { - elem_type: 7 + elem_type: 6 shape { dim { dim_value: 6 diff --git a/ngraph/test/models/onnx/mod_sign_broadcast.prototxt b/ngraph/test/models/onnx/mod_sign_broadcast.prototxt new file mode 100644 index 00000000000000..55c5f1f7414b05 --- /dev/null +++ b/ngraph/test/models/onnx/mod_sign_broadcast.prototxt @@ -0,0 +1,53 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/mod_sign_f32.prototxt b/ngraph/test/models/onnx/mod_sign_f32.prototxt new file mode 100644 index 00000000000000..2a86ca082bd885 --- /dev/null +++ b/ngraph/test/models/onnx/mod_sign_f32.prototxt @@ -0,0 +1,53 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 6 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/mod_sign_fmod.prototxt b/ngraph/test/models/onnx/mod_sign_fmod.prototxt new file mode 100644 index 00000000000000..84dd4aac3cf51f --- /dev/null +++ 
b/ngraph/test/models/onnx/mod_sign_fmod.prototxt @@ -0,0 +1,58 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + attribute { + name: "fmod" + i: 1 + type: INT + } + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/mod_sign_fmod_broadcast.prototxt b/ngraph/test/models/onnx/mod_sign_fmod_broadcast.prototxt new file mode 100644 index 00000000000000..826583b48b4a42 --- /dev/null +++ b/ngraph/test/models/onnx/mod_sign_fmod_broadcast.prototxt @@ -0,0 +1,58 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + attribute { + name: "fmod" + i: 1 + type: INT + } + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 1 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 6 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/mod_sign_fmod_f32.prototxt b/ngraph/test/models/onnx/mod_sign_fmod_f32.prototxt new file mode 100644 index 00000000000000..f8bfc3621b0877 --- /dev/null +++ b/ngraph/test/models/onnx/mod_sign_fmod_f32.prototxt @@ -0,0 +1,58 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + attribute { + name: "fmod" + i: 1 + type: INT + } + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 6 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/mod_sign_i64.prototxt b/ngraph/test/models/onnx/mod_sign_i64.prototxt new file mode 100644 index 00000000000000..8043c896d1514e --- /dev/null +++ b/ngraph/test/models/onnx/mod_sign_i64.prototxt @@ -0,0 +1,53 @@ +ir_version: 5 +producer_name: "nGraph ONNX Importer" +graph { + node { + input: "A" + input: "B" + output: "Y" + op_type: "Mod" + } + name: "test_mod" + input { + name: "A" + type { + tensor_type { + elem_type: 7 + shape { + dim { + dim_value: 6 + } + } + } + } + } + input { + name: "B" + type { + tensor_type { + elem_type: 7 + shape { + dim { + dim_value: 6 + } + } + } + } + } + output { + name: "Y" + type { + tensor_type { + elem_type: 7 + shape { + dim { + dim_value: 6 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/ngraph/test/models/onnx/org.openvinotoolkit/deformable_conv_2d.prototxt b/ngraph/test/models/onnx/org.openvinotoolkit/deformable_conv_2d.prototxt new file mode 100644 index 00000000000000..6151ddd7435349 --- /dev/null +++ b/ngraph/test/models/onnx/org.openvinotoolkit/deformable_conv_2d.prototxt @@ -0,0 +1,115 @@ +ir_version: 7 +producer_name: "nGraph ONNX Importer" +graph { + node { + 
input: "data" + input: "deformation" + input: "filters" + output: "out" + op_type: "DeformableConv2D" + } + name: "test_graph" + input { + name: "data" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + dim { + dim_value: 4 + } + } + } + } + } + input { + name: "deformation" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + input { + name: "filters" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } + initializer { + name: "filters" + dims: 1 + dims: 1 + dims: 2 + dims: 2 + data_type: 1 + float_data: 0.1 + float_data: 0.2 + float_data: 0.3 + float_data: 0.4 + } + output { + name: "out" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 1 + } + dim { + dim_value: 3 + } + dim { + dim_value: 3 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/ngraph/test/onnx/onnx_editor.cpp b/ngraph/test/onnx/onnx_editor.cpp index 898d5e449b1509..defcf6745baa10 100644 --- a/ngraph/test/onnx/onnx_editor.cpp +++ b/ngraph/test/onnx/onnx_editor.cpp @@ -59,9 +59,7 @@ NGRAPH_TEST(onnx_editor, types__single_input_type_substitution) editor.set_input_types({{"A", element::i64}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); const auto float_inputs_count = std::count_if( @@ -84,8 +82,7 @@ NGRAPH_TEST(onnx_editor, types__all_inputs_type_substitution) editor.set_input_types({{"A", element::i8}, {"B", element::i8}, {"C", element::i8}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); @@ -142,8 +139,7 @@ NGRAPH_TEST(onnx_editor, types__elem_type_missing_in_input) // the "elem_type" is missing in the model but it should be possible to set the type anyway EXPECT_NO_THROW(editor.set_input_types({{"A", element::i64}})); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); @@ -165,9 +161,7 @@ NGRAPH_TEST(onnx_editor, shapes__modify_single_input) editor.set_input_shapes({{"B", new_shape}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); EXPECT_TRUE(find_input(graph_inputs, "B")->get_partial_shape().same_scheme(new_shape)); @@ -182,9 +176,7 @@ NGRAPH_TEST(onnx_editor, shapes__modify_all_inputs) editor.set_input_shapes({{"A", new_shape}, {"B", new_shape}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); for (const auto& input : graph_inputs) @@ -203,9 +195,7 @@ NGRAPH_TEST(onnx_editor, shapes__dynamic_rank_in_model) const auto expected_shape_of_A = 
PartialShape{1, 2}; EXPECT_NO_THROW(editor.set_input_shapes({{"A", expected_shape_of_A}})); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); EXPECT_TRUE( @@ -221,9 +211,7 @@ NGRAPH_TEST(onnx_editor, shapes__set_dynamic_dimension) editor.set_input_shapes({{"A", new_shape}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); EXPECT_TRUE(find_input(graph_inputs, "A")->get_partial_shape().same_scheme(new_shape)); @@ -239,9 +227,7 @@ NGRAPH_TEST(onnx_editor, shapes__set_mixed_dimensions) editor.set_input_shapes({{"A", new_shape_A}, {"B", new_shape_B}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); const auto input_A = find_input(graph_inputs, "A"); @@ -260,9 +246,7 @@ NGRAPH_TEST(onnx_editor, shapes__set_scalar_inputs) editor.set_input_shapes({{"A", new_shape}, {"B", new_shape}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); const auto input_A = find_input(graph_inputs, "A"); @@ -281,9 +265,7 @@ NGRAPH_TEST(onnx_editor, shapes__static_to_dynamic_rank_substitution) editor.set_input_shapes({{"A", new_shape}, {"B", new_shape}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); - + const auto function = editor.get_function(); const auto graph_inputs = function->get_parameters(); for (const auto& input : graph_inputs) @@ -687,8 +669,7 @@ NGRAPH_TEST(onnx_editor, values__append_one_initializer) in_vals.emplace("A", op::Constant::create(element::i64, Shape{2}, {1, 2})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_input(Shape{2}, {5, 6}); test_case.add_expected_output(Shape{2}, {6, 8}); @@ -705,8 +686,7 @@ NGRAPH_TEST(onnx_editor, values__append_two_initializers_to_invalid) in_vals.emplace("B", op::Constant::create(element::i64, Shape{2}, {1, 3})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_expected_output(Shape{2}, {5, 5}); test_case.run(); @@ -721,8 +701,7 @@ NGRAPH_TEST(onnx_editor, values__modify_one_initializer) in_vals.emplace("B", op::Constant::create(element::i64, Shape{2}, {3, 4})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_expected_output(Shape{2}, {4, 6}); test_case.run(); @@ -738,8 +717,7 @@ NGRAPH_TEST(onnx_editor, 
values__modify_two_initializers) in_vals.emplace("B", op::Constant::create(element::i64, Shape{2}, {2, 1})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_expected_output(Shape{2}, {5, 7}); test_case.run(); @@ -755,8 +733,7 @@ NGRAPH_TEST(onnx_editor, values__no_inputs_modify_two_initializers) in_vals.emplace("B", op::Constant::create(element::i64, Shape{2}, {11, 22})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_expected_output(Shape{2}, {12, 24}); test_case.run(); @@ -772,8 +749,7 @@ NGRAPH_TEST(onnx_editor, values__append_two_initializers_change_shape_type) in_vals.emplace("B", op::Constant::create(element::i8, Shape{2, 1}, {-2, 2})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_expected_output(Shape{2, 1}, {-3, 3}); test_case.run(); @@ -790,8 +766,7 @@ NGRAPH_TEST(onnx_editor, values__append_two_initializers_mixed_types) in_vals.emplace("indices", op::Constant::create(element::i32, Shape{2, 2, 1}, {0, 1, 0, 1})); editor.set_input_values(in_vals); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_expected_output(Shape{2, 2, 1}, {1, 4, 5, 8}); test_case.run(); diff --git a/ngraph/test/onnx/onnx_import.in.cpp b/ngraph/test/onnx/onnx_import.in.cpp index e7cf0c0b752e11..3c49b1243f2334 100644 --- a/ngraph/test/onnx/onnx_import.in.cpp +++ b/ngraph/test/onnx/onnx_import.in.cpp @@ -3012,19 +3012,122 @@ NGRAPH_TEST(${BACKEND_NAME}, onnx_matmul_float_type) test_case.run(); } -NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod) +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign) { const auto function = onnx_import::import_onnx_model( file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign.prototxt")); auto test_case = test::TestCase(function); - test_case.add_input({-8, 3, 4, 9, -17, 1}); - test_case.add_input({22, -13, 8, -3, 7, 2}); - test_case.add_expected_output(Shape{6}, {-8, 3, 4, 0, -3, 1}); + test_case.add_input({-4, 7, 5, 4, -7, 8}); + test_case.add_input({2, -3, 8, -2, 3, 5}); + test_case.add_expected_output(Shape{6}, {0, -2, 5, 0, 2, 3}); test_case.run(); } +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign_i64) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign_i64.prototxt")); + auto test_case = test::TestCase(function); + + test_case.add_input({-4, 7, 5, 4, -7, 8}); + test_case.add_input({2, -3, 8, -2, 3, 5}); + test_case.add_expected_output(Shape{6}, {0, -2, 5, 0, 2, 3}); + + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign_broadcast) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign_broadcast.prototxt")); + auto test_case = test::TestCase(function); + + test_case.add_input({-8, 3, 4, 9, -17, 1}); + 
test_case.add_input({3}); + test_case.add_expected_output(Shape{6}, {1, 0, 1, 0, 1, 1}); + + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign_f32) +{ + try + { + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign_f32.prototxt")); + FAIL() << "Expected exception was not thrown"; + } + catch (const ngraph::ngraph_error& e) + { + EXPECT_HAS_SUBSTRING(e.what(), + std::string("If the input type is floating point, then `fmod` attribute must be set to 1.")); + } + catch (...) + { + FAIL() << "Expected ngraph_error exception was not thrown"; + } +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign_fmod) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign_fmod.prototxt")); + auto test_case = test::TestCase(function); + + test_case.add_input({-8, 3, 4, 9, -17, 1}); + test_case.add_input({22, -13, 8, -3, 7, 2}); + test_case.add_expected_output(Shape{6}, {-8, 3, 4, 0, -3, 1}); + + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign_fmod_broadcast) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign_fmod_broadcast.prototxt")); + auto test_case = test::TestCase(function); + + test_case.add_input({-8, 3, 4, 9, -17, 1}); + test_case.add_input({3}); + test_case.add_expected_output(Shape{6}, {-2, 0, 1, 0, -2, 1}); + + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_sign_fmod_f32) +{ + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_sign_fmod_f32.prototxt")); + auto test_case = test::TestCase(function); + + test_case.add_input({-4.3, 7.2, 5.0, 4.3, -7.2, 8.0}); + test_case.add_input({2.1, -3.4, 8.0, -2.1, 3.4, 5.0}); + test_case.add_expected_output(Shape{6}, {-0.10000038, 0.39999962, 5. , 0.10000038, -0.39999962, 3.}); + + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_mod_incorrect_fmod) +{ + try + { + const auto function = onnx_import::import_onnx_model( + file_util::path_join(SERIALIZED_ZOO, "onnx/mod_incorrect_fmod.prototxt")); + FAIL() << "Expected exception was not thrown"; + } + catch (const ngraph::ngraph_error& e) + { + EXPECT_HAS_SUBSTRING(e.what(), + std::string("Unsupported value of 'fmod' attribute (should be: 0 or 1)")); + } + catch (...) 
+ { + FAIL() << "Expected ngraph_error exception was not thrown"; + } +} + NGRAPH_TEST(${BACKEND_NAME}, onnx_model_scatterND_param_i64_indices) { const auto function = onnx_import::import_onnx_model( diff --git a/ngraph/test/onnx/onnx_import_org_openvino.in.cpp b/ngraph/test/onnx/onnx_import_org_openvino.in.cpp index b1bb641a14d01e..3407ad8e956bda 100644 --- a/ngraph/test/onnx/onnx_import_org_openvino.in.cpp +++ b/ngraph/test/onnx/onnx_import_org_openvino.in.cpp @@ -590,3 +590,45 @@ NGRAPH_TEST(${BACKEND_NAME}, onnx_model_experimental_detectron_topk_rios) test_case.add_expected_output(Shape{1, 4}, {1, 1, 3, 4}); test_case.run(); } + +NGRAPH_TEST(${BACKEND_NAME}, onnx_model_deformable_conv_2d) +{ + auto function = onnx_import::import_onnx_model(file_util::path_join( + SERIALIZED_ZOO, "onnx/org.openvinotoolkit/deformable_conv_2d.prototxt")); + + auto test_case = test::TestCase(function); + + // data + test_case.add_input({1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, + 7.0f, + 8.0f, + 9.0f, + 10.0f, + 11.0f, + 12.0f, + 13.0f, + 14.0f, + 15.0f, + 16.0f}); + + // deformations + test_case.add_input({0.5f, -0.5f, 0.0f, 1.0f}); + + test_case.add_expected_output(Shape{1, 1, 3, 3}, + {4.5999999f, + 5.2000003f, + 6.4000001f, + 8.4000006f, + 9.8000002f, + 9.6999998f, + 11.5f, + 13.4000006f, + 14.3999996f}); + + test_case.run(); +} diff --git a/ngraph/test/onnx/onnx_test_utils.in.cpp b/ngraph/test/onnx/onnx_test_utils.in.cpp index 00f24a6e4fb8c1..01019823b2be62 100644 --- a/ngraph/test/onnx/onnx_test_utils.in.cpp +++ b/ngraph/test/onnx/onnx_test_utils.in.cpp @@ -23,6 +23,7 @@ using namespace ngraph; static std::string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); template class ElemTypesTests : public ::testing::Test @@ -32,7 +33,6 @@ TYPED_TEST_CASE_P(ElemTypesTests); TYPED_TEST_P(ElemTypesTests, onnx_test_add_abc_set_precission) { - using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); using DataType = TypeParam; const element::Type ng_type = element::from(); @@ -41,8 +41,7 @@ TYPED_TEST_P(ElemTypesTests, onnx_test_add_abc_set_precission) editor.set_input_types({{"A", ng_type}, {"B", ng_type}, {"C", ng_type}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_input(std::vector{1, 2, 3}); test_case.add_input(std::vector{4, 5, 6}); @@ -53,7 +52,6 @@ TYPED_TEST_P(ElemTypesTests, onnx_test_add_abc_set_precission) TYPED_TEST_P(ElemTypesTests, onnx_test_split_multioutput_set_precission) { - using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); using DataType = TypeParam; const element::Type ng_type = element::from(); @@ -62,8 +60,7 @@ TYPED_TEST_P(ElemTypesTests, onnx_test_split_multioutput_set_precission) editor.set_input_types({{"input", ng_type}}); - std::istringstream model_stream(editor.model_string()); - const auto function = onnx_import::import_onnx_model(model_stream); + const auto function = editor.get_function(); auto test_case = test::TestCase(function); test_case.add_input(std::vector{1, 2, 3, 4, 5, 6}); test_case.add_expected_output(Shape{2}, std::vector{1, 2}); @@ -77,3 +74,31 @@ REGISTER_TYPED_TEST_CASE_P(ElemTypesTests, onnx_test_split_multioutput_set_precission); typedef ::testing::Types ElemTypes; INSTANTIATE_TYPED_TEST_CASE_P(${BACKEND_NAME}, ElemTypesTests, ElemTypes); + + +NGRAPH_TEST(${BACKEND_NAME}, add_abc_from_ir) { + const auto ir_xml = 
file_util::path_join(SERIALIZED_ZOO, "ir/add_abc.xml"); + const auto function = test::function_from_ir(ir_xml); + + auto test_case = test::TestCase<TestEngine>(function); + test_case.add_input<float>({1}); + test_case.add_input<float>({2}); + test_case.add_input<float>({3}); + test_case.add_expected_output<float>(Shape{1}, {6}); + + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, add_abc_from_ir_with_bin_path) { + const auto ir_xml = file_util::path_join(SERIALIZED_ZOO, "ir/add_abc.xml"); + const auto ir_bin = file_util::path_join(SERIALIZED_ZOO, "ir/weights/add_abc.bin"); + const auto function = test::function_from_ir(ir_xml, ir_bin); + + auto test_case = test::TestCase<TestEngine>(function); + test_case.add_input<float>({1}); + test_case.add_input<float>({2}); + test_case.add_input<float>({3}); + test_case.add_expected_output<float>(Shape{1}, {6}); + + test_case.run(); +} diff --git a/ngraph/test/op.cpp b/ngraph/test/op.cpp index d8cfceb016717f..32cfcf0982196c 100644 --- a/ngraph/test/op.cpp +++ b/ngraph/test/op.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -13,6 +14,7 @@ #include "ngraph/graph_util.hpp" #include "ngraph/ngraph.hpp" #include "ngraph/variant.hpp" +#include "ngraph/opsets/opset.hpp" NGRAPH_SUPPRESS_DEPRECATED_START @@ -50,6 +52,35 @@ TEST(op, provenance_tag) ASSERT_TRUE(tags.find(tag2) != tags.end()); } +TEST(op, opset_multi_thread) { + auto doTest = [&](std::function<const ngraph::OpSet&()> fun) { + std::atomic<const ngraph::OpSet*> opset {nullptr}; + std::atomic_bool failed {false}; + auto threadFun = [&] () { + const ngraph::OpSet* op = &fun(); + const ngraph::OpSet* current = opset; + do { + if (current != nullptr && current != op) { + failed = true; + break; + } + } while (opset.compare_exchange_strong(op, current)); + }; + std::thread t1 {threadFun}; + std::thread t2 {threadFun}; + t1.join(); + t2.join(); + ASSERT_FALSE(failed); + }; + doTest(ngraph::get_opset1); + doTest(ngraph::get_opset2); + doTest(ngraph::get_opset3); + doTest(ngraph::get_opset4); + doTest(ngraph::get_opset5); + doTest(ngraph::get_opset6); + doTest(ngraph::get_opset7); +} + struct Ship { std::string name; diff --git a/ngraph/test/op_eval/clamp.cpp b/ngraph/test/op_eval/clamp.cpp new file mode 100644 index 00000000000000..a48ab39f08131f --- /dev/null +++ b/ngraph/test/op_eval/clamp.cpp @@ -0,0 +1,402 @@ +//***************************************************************************** +// Copyright 2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+//***************************************************************************** + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/interpreter_engine.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; + +namespace +{ + template <typename T> + void clamp_test(const element::Type& type, + const PartialShape& dynamic_shape, + const Shape& static_shape, + const std::vector<T>& input, + double min, + double max, + const std::vector<T>& output) + { + auto data = make_shared<op::Parameter>(type, dynamic_shape); + auto clamp = make_shared<op::Clamp>(data, min, max); + auto function = make_shared<Function>(clamp, ParameterVector{data}); + + auto test_case = test::TestCase<test::INTERPRETER_Engine>(function); + test_case.template add_input<T>(static_shape, input); + test_case.template add_expected_output<T>(static_shape, output); + return test_case.run(); + } +} + +TEST(op_eval, clamp_float_dynamic) +{ + auto type = element::f32; + typedef float ctype; + + auto sshape = Shape{5, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits<ctype>::min(); + auto max = numeric_limits<ctype>::max(); + auto pinf = numeric_limits<ctype>::infinity(); + auto ninf = -numeric_limits<ctype>::infinity(); + + vector<ctype> input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; + + clamp_test<ctype>( + type, + dshape, + sshape, + {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, + 0.2, + 0.6, + {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); + + clamp_test<ctype>( + type, + dshape, + sshape, + input, + 10.0, + 20.0, + {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); + + clamp_test<ctype>( + type, + dshape, + sshape, + input, + 10.0, + pinf, + {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); + + clamp_test<ctype>( + type, + dshape, + sshape, + input, + ninf, + 20.0, + {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); +} + +TEST(op_eval, clamp_int8_dynamic) +{ + auto type = element::i8; + typedef int8_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits<ctype>::min(); + auto max = numeric_limits<ctype>::max(); + auto pinf = numeric_limits<double>::infinity(); + auto ninf = -numeric_limits<double>::infinity(); + + vector<ctype> input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test<ctype>( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test<ctype>( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test<ctype>( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_int16_dynamic) +{ + auto type = element::i16; + typedef int16_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits<ctype>::min(); + auto max = numeric_limits<ctype>::max(); + auto pinf = numeric_limits<double>::infinity(); + auto ninf = -numeric_limits<double>::infinity(); + + vector<ctype> input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test<ctype>( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test<ctype>( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test<ctype>( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_int32_dynamic) +{ + auto type = element::i32; + typedef int32_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits<ctype>::min(); + auto max = numeric_limits<ctype>::max(); + auto pinf
= numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_int64_dynamic) +{ + auto type = element::i64; + typedef int64_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_uint8_dynamic) +{ + auto type = element::u8; + typedef uint8_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_uint16_dynamic) +{ + auto type = element::u16; + typedef uint16_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + // dynamic shape + clamp_test( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_uint32_dynamic) +{ + auto type = element::u32; + typedef uint32_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (numeric_limits::digits - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + 
clamp_test( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_uint64_dynamic) +{ + auto type = element::u64; + typedef uint64_t ctype; + + auto sshape = Shape{4, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + // TODO: Fix CPU DEX / MLIR correctness bug: using signed comparison for unsigned ints + // auto max = numeric_limits::max(); + // auto pinf = numeric_limits::infinity(); + ctype max = (static_cast(1) << (32 - 1)) - 1; + auto pinf = static_cast(max); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, 9, 10, 11, 19, 20, 21}; + + clamp_test( + type, dshape, sshape, input, 10.0, 20.0, {10, 20, 10, 10, 11, 19, 20, 20}); + clamp_test( + type, dshape, sshape, input, 10.0, pinf, {10, max, 10, 10, 11, 19, 20, 21}); + clamp_test( + type, dshape, sshape, input, ninf, 20.0, {min, 20, 9, 10, 11, 19, 20, 20}); +} + +TEST(op_eval, clamp_float16_dynamic) +{ + auto type = element::f16; + typedef float16 ctype; + + auto sshape = Shape{5, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; + + clamp_test( + type, + dshape, + sshape, + {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, + 0.2, + 0.6, + {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); + + clamp_test( + type, + dshape, + sshape, + input, + 10.0, + 20.0, + {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); + + clamp_test( + type, + dshape, + sshape, + input, + 10.0, + pinf, + {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); + + clamp_test( + type, + dshape, + sshape, + input, + ninf, + 20.0, + {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); +} + +TEST(op_eval, clamp_bfloat16_dynamic) +{ + auto type = element::bf16; + typedef bfloat16 ctype; + + auto sshape = Shape{5, 2}; + auto dshape = PartialShape::dynamic(); + + auto min = numeric_limits::min(); + auto max = numeric_limits::max(); + auto pinf = numeric_limits::infinity(); + auto ninf = -numeric_limits::infinity(); + + vector input{min, max, ninf, pinf, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.000001}; + + clamp_test( + type, + dshape, + sshape, + {-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}, + 0.2, + 0.6, + {0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.6, 0.6}); + + clamp_test( + type, + dshape, + sshape, + input, + 10.0, + 20.0, + {10.0, 20.0, 10.0, 20.0, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.0}); + + clamp_test( + type, + dshape, + sshape, + input, + 10.0, + pinf, + {10.0, max, 10.0, pinf, 10.0, 10.0, 10.000001, 19.999999, 20.0, 20.000001}); + + clamp_test( + type, + dshape, + sshape, + input, + ninf, + 20.0, + {min, 20.0, ninf, 20.0, 9.99999, 10.0, 10.000001, 19.999999, 20.0, 20.0}); +} diff --git a/ngraph/test/op_eval/floor_mod.cpp b/ngraph/test/op_eval/floor_mod.cpp index 373c5e4ef05603..cc1ffdc102d629 100644 --- a/ngraph/test/op_eval/floor_mod.cpp +++ b/ngraph/test/op_eval/floor_mod.cpp @@ -37,3 +37,25 @@ TEST(op_eval, floor_mod) for (size_t i = 0; i < expected_result.size(); i++) EXPECT_NEAR(result_data[i], expected_result[i], 0.000001); } + +TEST(op_eval, floor_mod_i32) +{ + auto a = make_shared(element::i32, Shape{6}); + auto b = make_shared(element::i32, Shape{6}); + auto floor_mod = make_shared(a, b); + auto fun = 
make_shared(OutputVector{floor_mod}, ParameterVector{a, b}); + + std::vector a_value{-4, 7, 5, 4, -7, 8}; + std::vector b_value{2, -3, 8, -2, 3, 5}; + std::vector expected_result{0, -2, 5, 0, 2, 3}; + + auto result = make_shared(); + ASSERT_TRUE(fun->evaluate({result}, + {make_host_tensor(Shape{6}, a_value), + make_host_tensor(Shape{6}, b_value)})); + EXPECT_EQ(result->get_element_type(), element::i32); + EXPECT_EQ(result->get_shape(), Shape{6}); + auto result_data = read_vector(result); + for (size_t i = 0; i < expected_result.size(); i++) + EXPECT_NEAR(result_data[i], expected_result[i], 0.000001); +} diff --git a/ngraph/test/runtime/ie/unit_test.manifest b/ngraph/test/runtime/ie/unit_test.manifest index 98223013a3552a..32391961a06605 100644 --- a/ngraph/test/runtime/ie/unit_test.manifest +++ b/ngraph/test/runtime/ie/unit_test.manifest @@ -88,7 +88,7 @@ onnx_model_conv_transpose_w_groups # [NOT_IMPLEMENTED] Input image format I64 is not supported yet... onnx_model_global_lp_pool_p0 -onnx_model_mod +onnx_model_mod_sign_i64 onnx_model_constant_of_shape_float_zeros onnx_model_constant_of_shape_int_ones ab_plus_c_inference @@ -106,7 +106,6 @@ ceiling_int64 matmul_2x2x3_2x3x1_int64 matmul_2x2x3_2x1x3_transpose_int64 onnx_dyn_shapes_slice_10_3d_input_12_axes -IE_CPU.fused_clamp_int64 IE_CPU.onnx_dyn_shapes_slice_10_default_axes onnx_dyn_shapes_ab_plus_c_inference onnx_dyn_shapes_dynamic_rank_input_inference @@ -135,9 +134,6 @@ onnx_dyn_shapes_slice_10_3d_input_12_axes onnx_top_k_opset_10 onnx_model_scatterND_param_i64_indices -# [NOT_IMPLEMENTED] Input image format U64 is not supported yet... -IE_CPU.fused_clamp_uint64 - # TopK Incorrect input data/index values precision onnx_model_argmax_int32 onnx_model_argmin_int32 @@ -172,12 +168,6 @@ onnx_model_one_hot_without_axis onnx_model_one_hot_with_axis # Dynamic function 'get_shape was called on a descriptor::Tensor with dynamic shape' -fused_clamp_uint16 -fused_clamp_uint8 -fused_clamp_int32 -fused_clamp_int8 -fused_clamp_float -fused_clamp_int16 onnx_dyn_shapes_model_acosh_1_3 onnx_dyn_shapes_model_acosh_3_2 onnx_dyn_shapes_model_asinh_1_3 @@ -246,6 +236,7 @@ onnx_model_gru_defaults_fwd_const_dynamic onnx_model_rnn_defaults_fwd_const_dynamic onnx_model_depth_to_space_dynamic_input onnx_model_space_to_depth_dynamic_input +squeeze_dynamic # Constant network @@ -573,7 +564,6 @@ select_double quantize_clamp_int32 max_3d_to_scalar_double argmin_trivial_in_double -IE_CPU.fused_clamp_double # Incorrect precision bf16! 
convert_float32_bf16 @@ -599,11 +589,6 @@ broadcast_vector_rowwise_int64 broadcast_scalar_to_matrix_int64 abc_int64 -# Unsupported primitive of type: Round -IE_CPU.onnx_model_round -IE_CPU.onnx_model_round_half_nearest_even -round_away_from_zero - # Unsupported primitive of type: SigmoidBackprop sigmoid_bprop_n1c1h4 @@ -898,8 +883,9 @@ dyn_group_convolution_backprop_data dynamic_transpose transpose -# Failing from new reason after unblocking more Blob types +# todo: check negative indices implementation gather_2d_negative_and_positive_indices_axis_0_2d_input +# Failing from new reason after unblocking more Blob types gather_axis_0_int8 gather_axis_0_uint8 gather_axis_0_uint32 @@ -1075,16 +1061,36 @@ rnn_cell_zero_bias_default_attrs # Activation function hardsigmoid is not supported gru_cell_hardsigmoid_activation_function +# Roll is not implemented yet for CPU, GPU +roll_2d_input +roll_2d_input_negative_shift +roll_repeated_axes +roll_3d_input +roll_3d_input_negative_shift +roll_negative_axes + #------------------------------------------------------------------------------- # # Inference Engine CPU plugin excludes # #------------------------------------------------------------------------------- -# Cannot cast ngraph node ReplaceSlice to CNNLayer! -# Incorrect precision u32! -# Parameter has zero dimension that is not allowable -IE_CPU.fused_clamp_uint32 +# Clamp op: +# Issue 51676: Output mismatch due to wrong conversion of bounds +IE_CPU.clamp_integral +IE_CPU.clamp_integral_negative +# Issue 51679: CI failure on Ubuntu 20. Overflow handling -inf lower bound +IE_CPU.clamp_int32 +# Precision mismatch +IE_CPU.clamp_uint32 +IE_CPU.clamp_uint16 +IE_CPU.clamp_int16 +IE_CPU.clamp_int64 +IE_CPU.clamp_float16 +# [NOT_IMPLEMENTED] Input image format U64 is not supported yet... +IE_CPU.clamp_uint64 +# [NOT_IMPLEMENTED] Input image format BF16 is not supported yet... +IE_CPU.clamp_bfloat16 # Cannot cast ngraph node Reshape to CNNLayer! # Parameter has zero dimension that is not allowable @@ -1110,12 +1116,6 @@ IE_CPU.convert_like_float32_bfloat16 IE_CPU.convert_like_bfloat16_float32 IE_CPU.convert_like_dyn_float16_to_int64 -# Can't convert type f16 to IE Precision! -IE_CPU.fused_clamp_float16 - -# [NOT_IMPLEMENTED] Input image format BF16 is not supported yet... 
-IE_CPU.fused_clamp_bfloat16 - # Operations were removed from opset IE_CPU.atanh IE_CPU.asinh @@ -1381,7 +1381,7 @@ IE_GPU.normalize_across_chw_4d IE_GPU.normalize_across_h_4d IE_GPU.normalize_across_c_2x2_shape IE_GPU.normalize_across_c_2x4_shape -IE_GPU.fused_clamp +IE_GPU.clamp IE_GPU.grn_4d IE_GPU.squeeze IE_GPU.squared_difference @@ -1583,6 +1583,33 @@ evaluate_mvn_6_across_chanells evaluate_mvn_6_across_batch IE_CPU.onnx_mvn_v6 +# not yet implemented on CPU/GPU Gather 7 +gather_v7_1d_int32 +gather_v7_data_int32_3d_indices_axis_1_batch_dims_1 +gather_v7_data_int32_2d_indices_axis_1_batch_dims_1 +gather_v7_3d_indices_axis_1_batch_dims_1 +gather_v7_4d_indices_axis_0_uint8 +gather_v7_4d_indices_axis_0_2d_input +gather_v7_3d_indices_axis_0_2d_input +gather_v7_2d_indices_axis_0_2d_input +gather_v7_2d_negative_and_positive_indices_axis_0_2d_input +gather_v7_1d_indices_axis_0_1d_input +gather_v7_scalar_indices_axis_0_2d_input +gather_v7_2d_indices_axis_1_2d_input +gather_v7_1d_indices_axis_2_4d_input +gather_v7_scalar_indices_axis_1_2d_input +gather_v7_axis_0_int8 +gather_v7_axis_0_int16 +gather_v7_axis_0_int32 +gather_v7_axis_0_int64 +gather_v7_axis_0_uint8 +gather_v7_axis_0_uint16 +gather_v7_axis_0_uint32 +gather_v7_axis_0_uint64 +gather_v7_axis_0_bool +gather_v7_3d_indices_axis_1_batch_dims_1_int32 +gather_v7_4d_data_axis_2_batch_dims_1_int32 + # Issue 49621: Incorrect blob sizes for node BinaryConvolution_X bin_convolution_2D_1batch_1channel bin_convolution_2D_1batch_1channel_padding_pad_val_0 @@ -1596,3 +1623,6 @@ bin_convolution_2D_2batch_1channel # RuntimeError: Unsupported dynamic ops: v4::Interpolate - Ticket: 50691 onnx_upsample6_dynamic + +# random values returned from the plugin: ticket 51762 +onnx_model_deformable_conv_2d diff --git a/ngraph/test/runtime/interpreter/evaluates_map.cpp b/ngraph/test/runtime/interpreter/evaluates_map.cpp index d6ab1816057d7b..e242a4021e1678 100644 --- a/ngraph/test/runtime/interpreter/evaluates_map.cpp +++ b/ngraph/test/runtime/interpreter/evaluates_map.cpp @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -1105,7 +1106,7 @@ namespace outputs[0]->get_data_ptr(), inputs[0]->get_shape(), inputs[1]->get_shape(), - op->get_auto_broadcast()); + op->get_autob()); return true; } @@ -2126,6 +2127,48 @@ namespace return true; } + template + bool evaluate(const shared_ptr& op, + const HostTensorVector& outputs, + const HostTensorVector& inputs) + { + const auto& shiftType = inputs[1]->get_element_type(); + std::vector shift_int64; + if (shiftType == element::Type_t::i32) + { + auto shift = inputs[1]->get_data_ptr(); + shift_int64.resize(shape_size(inputs[1]->get_shape())); + std::transform(shift, + shift + shape_size(inputs[1]->get_shape()), + shift_int64.begin(), + [](const int32_t& elem) { return static_cast(elem); }); + } + const auto& axesType = inputs[2]->get_element_type(); + std::vector axes_int64; + if (axesType == element::Type_t::i32) + { + auto axes = inputs[2]->get_data_ptr(); + axes_int64.resize(shape_size(inputs[2]->get_shape())); + std::transform(axes, + axes + shape_size(inputs[2]->get_shape()), + axes_int64.begin(), + [](const int32_t& elem) { return static_cast(elem); }); + } + runtime::reference::roll(inputs[0]->get_data_ptr(), + inputs[1]->get_element_type() != element::Type_t::i64 + ? shift_int64.data() + : inputs[1]->get_data_ptr(), + inputs[2]->get_element_type() != element::Type_t::i64 + ? 
axes_int64.data() + : inputs[2]->get_data_ptr(), + outputs[0]->get_data_ptr(), + inputs[0]->get_shape(), + inputs[1]->get_shape(), + inputs[2]->get_shape(), + inputs[0]->get_element_type().size()); + return true; + } + template bool evaluate_node(std::shared_ptr node, const HostTensorVector& outputs, diff --git a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp index 4e52bd26201da6..92351719e4e9c7 100644 --- a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp +++ b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp @@ -85,3 +85,5 @@ NGRAPH_OP(Round, op::v5) NGRAPH_OP(CTCGreedyDecoderSeqLen, op::v6) NGRAPH_OP(GatherElements, op::v6) NGRAPH_OP(MVN, ngraph::op::v6) + +NGRAPH_OP(Roll, ngraph::op::v7) diff --git a/ngraph/test/runtime/interpreter/unit_test.manifest b/ngraph/test/runtime/interpreter/unit_test.manifest index b33d89103a96b9..c04ba98df6b4ce 100644 --- a/ngraph/test/runtime/interpreter/unit_test.manifest +++ b/ngraph/test/runtime/interpreter/unit_test.manifest @@ -65,12 +65,11 @@ INTERPRETER.gather_axis_0_int8 INTERPRETER.gather_axis_0_int16 INTERPRETER.gather_axis_0_uint8 INTERPRETER.gather_axis_0_uint16 -INTERPRETER.fused_clamp_double -INTERPRETER.fused_clamp_int8 -INTERPRETER.fused_clamp_int16 -INTERPRETER.fused_clamp_uint8 -INTERPRETER.fused_clamp_uint16 -INTERPRETER.fused_clamp_bfloat16 +INTERPRETER.gather_v7_4d_indices_axis_0_uint8 +INTERPRETER.gather_v7_axis_0_int8 +INTERPRETER.gather_v7_axis_0_int16 +INTERPRETER.gather_v7_axis_0_uint8 +INTERPRETER.gather_v7_axis_0_uint16 INTERPRETER.auto_bcast_binary_elementwise INTERPRETER.auto_bcast_binary_elementwise_pdpd @@ -118,7 +117,7 @@ INTERPRETER.onnx_model_conv_integer_pads onnx_model_lstm_fwd_with_clip_peepholes onnx_model_lstm_bdir_short_input_seq_peepholes # Activation function hardsigmoid unsupported -onnx_model_gru_fwd_activations_relu_hardsigmoid +onnx_model_gru_fwd_activations_relu_hardsigmoid onnx_model_lstm_fwd_hardsigmoid_activation gru_cell_hardsigmoid_activation_function @@ -148,6 +147,7 @@ onnx_controlflow_loop_infinite onnx_controlflow_loop_2d_trip_count_dynamic onnx_controlflow_loop_no_variadic_inputs_and_outputs onnx_controlflow_loop_power +squeeze_dynamic # The test fails in CI on Ubuntu i386 # There's an overflow of some kind: 2147483647 is not close to -2147483648 at index 2 @@ -171,3 +171,6 @@ INTERPRETER.onnx_model_experimental_detectron_prior_grid_generator # Interpreter backend doesn't implement evaluate method for OP ExperimentalDetectronROIFeatureExtractor INTERPRETER.onnx_model_experimental_detectron_roi_feature_extractor + +# No evaluator for DeformableConv2D +onnx_model_deformable_conv_2d diff --git a/ngraph/test/span.cpp b/ngraph/test/span.cpp new file mode 100644 index 00000000000000..4cff4714d54e42 --- /dev/null +++ b/ngraph/test/span.cpp @@ -0,0 +1,208 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" + +#include +#include +#include + +#include "ngraph/runtime/reference/utils/span.hpp" + +using namespace ngraph::runtime::reference; + +TEST(span_util, create_from_vector) +{ + std::vector data{1, 2, 3, 4}; + const auto s = span(data); + + ASSERT_EQ(s.size(), data.size()); + EXPECT_TRUE(std::equal(begin(data), end(data), begin(s))); + + const auto si = span(begin(data), end(data)); + + ASSERT_EQ(si.size(), data.size()); + EXPECT_TRUE(std::equal(begin(data), end(data), begin(si))); +} + +TEST(span_util, create_from_const_vector) +{ + const std::vector data{1, 2, 3, 4}; + const auto s = 
span(data); + + ASSERT_EQ(s.size(), data.size()); + EXPECT_TRUE(std::equal(begin(data), end(data), begin(s))); + + const auto si = span(begin(data), end(data)); + + ASSERT_EQ(si.size(), data.size()); + EXPECT_TRUE(std::equal(begin(data), end(data), begin(si))); +} + +TEST(span_util, create_from_memory) +{ + std::array data{1, 2, 3, 4}; + const auto s = span(data); + + ASSERT_EQ(s.size(), data.size()); + EXPECT_TRUE(std::equal(begin(data), end(data), begin(s))); +} + +TEST(span_util, create_from_const_memory) +{ + const std::array data{1, 2, 3, 4}; + const auto s = span(data); + + ASSERT_EQ(s.size(), data.size()); + EXPECT_TRUE(std::equal(begin(data), end(data), begin(s))); +} + +TEST(span_util, empty_span_stay_empty_for_drop_front) +{ + { + constexpr std::array data{1}; + auto s = span(data); + EXPECT_EQ(1, s.size()); + EXPECT_FALSE(s.empty()); + EXPECT_EQ(data.front(), s.front()); + + s.drop_front(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + + s.drop_front(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + } + { + constexpr std::array data{1, 2}; + auto s = span(data); + EXPECT_EQ(2, s.size()); + EXPECT_FALSE(s.empty()); + EXPECT_EQ(data.front(), s.front()); + + s.drop_front(1); + EXPECT_FALSE(s.empty()); + EXPECT_EQ(data.back(), s.front()); + + s.drop_front(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + + s.drop_front(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + } +} +TEST(span_util, empty_span_stay_empty_for_drop_back) +{ + { + constexpr std::array data{1}; + auto s = span(data); + EXPECT_EQ(1, s.size()); + EXPECT_FALSE(s.empty()); + EXPECT_EQ(data.front(), s.front()); + + s.drop_back(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + + s.drop_back(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + } + { + constexpr std::array data{1, 2}; + auto s = span(data); + EXPECT_EQ(2, s.size()); + EXPECT_FALSE(s.empty()); + EXPECT_EQ(data.back(), s.back()); + + s.drop_back(1); + EXPECT_FALSE(s.empty()); + EXPECT_EQ(data.front(), s.back()); + + s.drop_back(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + + s.drop_back(1); + EXPECT_EQ(0, s.size()); + EXPECT_TRUE(s.empty()); + } +} + +TEST(span_util, create_substring) +{ + const std::array data{1, 2, 3, 4}; + const auto s = span(data.data(), data.size()); + + { + const auto sub = s.subspan(1, 1000); + EXPECT_EQ(sub.size(), data.size() - 1); + EXPECT_FALSE(sub.empty()); + } + { + const auto sub = s.subspan(data.size() - 1); + EXPECT_EQ(sub.size(), 1); + EXPECT_FALSE(sub.empty()); + } + { + const auto sub = s.subspan(10000, 1000); + EXPECT_EQ(sub.size(), 0); + EXPECT_TRUE(sub.empty()); + } +} + +TEST(span_util, compare_substr_with_drop_front) +{ + const std::array data{1, 2, 3, 4}; + const auto s = span(data.data(), data.size()); + + auto sf = s; + auto ss = s; + for (size_t i = 0; i != data.size() + 1; ++i) + { + sf.drop_front(1); + ss = ss.subspan(1); + EXPECT_EQ(sf.size(), ss.size()); + EXPECT_EQ(sf.empty(), ss.empty()); + if (!sf.empty()) + { + EXPECT_EQ(sf.front(), ss.front()); + } + } +} + +TEST(span_util, drop_elements) +{ + const std::array data{1, 2, 3, 4}; + const auto s = span(data.data(), data.size()); + + auto length = s.size(); + for (auto sub = s; !sub.empty(); sub.drop_back(1)) + { + EXPECT_EQ(sub.front(), data.front()); + EXPECT_EQ(sub.size(), length); + length--; + } + + length = s.size(); + for (auto sub = s; !sub.empty(); sub.drop_front(1)) + { + EXPECT_EQ(sub.back(), data.back()); + EXPECT_EQ(sub.size(), length); + length--; + } +} + +TEST(span_util, 
throw_on_out_of_range) +{ + std::array data{}; + EXPECT_THROW(Span{}.at(0), std::out_of_range); + EXPECT_NO_THROW(span(data).at(0)); + EXPECT_NO_THROW(span(data).at(1)); + EXPECT_THROW(span(data).at(2), std::out_of_range); + EXPECT_THROW(span(data).at(3), std::out_of_range); +} \ No newline at end of file diff --git a/ngraph/test/type_prop/binary_convolution.cpp b/ngraph/test/type_prop/binary_convolution.cpp index 6f0cc7806388c8..7895ca266f6fe7 100644 --- a/ngraph/test/type_prop/binary_convolution.cpp +++ b/ngraph/test/type_prop/binary_convolution.cpp @@ -176,7 +176,7 @@ TEST(type_prop, bin_convolution_invalid_inputs_et) const auto auto_pad = op::PadType::EXPLICIT; try { - const auto data_batch = make_shared(element::i32, PartialShape{1, 1, 5, 5}); + const auto data_batch = make_shared(element::boolean, PartialShape{1, 1, 5, 5}); const auto filters = make_shared(element::u1, PartialShape{1, 1, 3, 3}); const auto bin_conv = make_shared(data_batch, filters, @@ -192,7 +192,7 @@ TEST(type_prop, bin_convolution_invalid_inputs_et) } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), "Data batch element type must be float point"); + EXPECT_HAS_SUBSTRING(error.what(), "Data batch element type must be numeric"); } catch (...) { @@ -261,8 +261,7 @@ TEST(type_prop, bin_convolution_invalid_input_ranks) } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "Shapes for data batch and filters must have same rank."); + EXPECT_HAS_SUBSTRING(error.what(), "Data batch and filters inputs must have same rank"); } catch (...) { @@ -290,8 +289,7 @@ TEST(type_prop, bin_convolution_invalid_input_ranks) } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "Shapes for data batch and filters must have same rank."); + EXPECT_HAS_SUBSTRING(error.what(), "Data batch and filters inputs must have same rank"); } catch (...) { diff --git a/ngraph/test/type_prop/broadcast.cpp b/ngraph/test/type_prop/broadcast.cpp index b72f537271ab07..aa24cec60bc6f6 100644 --- a/ngraph/test/type_prop/broadcast.cpp +++ b/ngraph/test/type_prop/broadcast.cpp @@ -151,7 +151,7 @@ TYPED_TEST_P(BroadcastTests, broadcast_fail_axes_map) TYPED_TEST_P(BroadcastTests, broadcast_fail_axes_map_shape) { - auto param = make_shared(element::f32, Shape{3, 1}); + auto param = make_shared(element::f32, Shape{3, 2}); auto target_shape = op::Constant::create(element::i64, Shape{3}, {2, 3, 3}); auto axes_mapping = op::Constant::create(element::i64, Shape{2}, {1, 2}); @@ -162,7 +162,7 @@ TYPED_TEST_P(BroadcastTests, broadcast_fail_axes_map_shape) } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), "Broadcast target[axes_mapping[1]] Expected 1. Got 3"); + EXPECT_HAS_SUBSTRING(error.what(), "Broadcast target[axes_mapping[1]] Expected 2. Got 3"); } catch (...) 
{ diff --git a/ngraph/test/type_prop/clamp.cpp b/ngraph/test/type_prop/clamp.cpp index 89e54e83a6f475..d16d7f0ccd5bf1 100644 --- a/ngraph/test/type_prop/clamp.cpp +++ b/ngraph/test/type_prop/clamp.cpp @@ -9,23 +9,99 @@ using namespace std; using namespace ngraph; -TEST(type_prop, fused_clamp) +TEST(type_prop, clamp_basic_f32) { - const auto data = make_shared<op::Parameter>(element::f64, Shape{2, 2}); + auto data = make_shared<op::Parameter>(element::f32, Shape{1, 32, 32}); + auto clamp = make_shared<op::Clamp>(data, 0.0, 2.1); + + ASSERT_EQ(clamp->get_element_type(), element::f32); + ASSERT_EQ(clamp->get_min(), 0.0); + ASSERT_EQ(clamp->get_max(), 2.1); + ASSERT_EQ(clamp->get_output_shape(0), (Shape{1, 32, 32})); +} + +TEST(type_prop, clamp_basic_i32) +{ + auto data = make_shared<op::Parameter>(element::i32, Shape{1, 32, 32}); + auto clamp = make_shared<op::Clamp>(data, 0.0, 2.1); + + ASSERT_EQ(clamp->get_element_type(), element::i32); + ASSERT_EQ(clamp->get_min(), 0.0); + ASSERT_EQ(clamp->get_max(), 2.1); + ASSERT_EQ(clamp->get_output_shape(0), (Shape{1, 32, 32})); +} + +TEST(type_prop, clamp_shape_static_rank) +{ + auto data = make_shared<op::Parameter>( + element::f16, PartialShape{Dimension::dynamic(), Dimension::dynamic(), 32}); + auto clamp = make_shared<op::Clamp>(data, -2.1, 2.1); + + ASSERT_EQ(clamp->get_element_type(), element::f16); + ASSERT_EQ(clamp->get_min(), -2.1); + ASSERT_EQ(clamp->get_max(), 2.1); + ASSERT_EQ(clamp->get_output_partial_shape(0), + (PartialShape{Dimension::dynamic(), Dimension::dynamic(), 32})); +} + +TEST(type_prop, clamp_shape_dynamic) +{ + auto data = make_shared<op::Parameter>(element::u16, PartialShape::dynamic()); + auto clamp = make_shared<op::Clamp>(data, 1.5, 15.0); + + ASSERT_EQ(clamp->get_element_type(), element::u16); + ASSERT_EQ(clamp->get_min(), 1.5); + ASSERT_EQ(clamp->get_max(), 15.0); + ASSERT_EQ(clamp->get_output_partial_shape(0), (PartialShape::dynamic())); +} + +TEST(type_prop, clamp_invalid_element_type) +{ + auto data = make_shared<op::Parameter>(element::boolean, Shape{2, 2}); try { - const auto clamp = make_shared<op::Clamp>(data, 2.0, 1.0); - EXPECT_FALSE(clamp.get()) - << "Clamp validation did not work. Op node was created with incorrect params."; + auto clamp = make_shared<op::Clamp>(data, 0.5, 5.5); + // Input element type is boolean + FAIL() << "Invalid boolean element type for input not detected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING( - error.what(), std::string("The 'min' parameter needs to be less than 'max' for Clamp")); + EXPECT_HAS_SUBSTRING(error.what(), "Input element type must be numeric"); } + catch (...) + { + FAIL() << "Numeric element type node validation check failed for unexpected reason"; + } +} + +TEST(type_prop, clamp_equal_attributes) +{ + auto data = make_shared<op::Parameter>(element::f64, Shape{2, 2}); + + auto clamp = make_shared<op::Clamp>(data, 1.0, 1.0); + ASSERT_EQ(clamp->get_element_type(), element::f64); + ASSERT_EQ(clamp->get_min(), 1.0); + ASSERT_EQ(clamp->get_max(), 1.0); + ASSERT_EQ(clamp->get_output_shape(0), (Shape{2, 2})); +} + +TEST(type_prop, clamp_invalid_attributes) +{ + auto data = make_shared<op::Parameter>(element::f64, Shape{2, 2}); - const auto clamp = make_shared<op::Clamp>(data, 1.0, 2.0); - EXPECT_EQ(clamp->get_element_type(), element::f64); - EXPECT_EQ(clamp->get_shape(), (Shape{2, 2})); + try + { + auto clamp = make_shared<op::Clamp>(data, 2.0, 1.0); + // Attribute 'max' not greater than 'min' + FAIL() << "Attribute 'min' bigger than 'max' not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Attribute 'min' must be less or equal than 'max'"); + } + catch (...)
+ { + FAIL() << "'min' and 'max' attributes node validation check failed for unexpected reason"; + } } diff --git a/ngraph/test/type_prop/convolution.cpp b/ngraph/test/type_prop/convolution.cpp index e0f2328e3cd674..afd26ff71e7c3d 100644 --- a/ngraph/test/type_prop/convolution.cpp +++ b/ngraph/test/type_prop/convolution.cpp @@ -2647,7 +2647,8 @@ TEST(type_prop, conv_v1_partial_data_shape_dynamic) auto conv = make_shared( data_batch, filters, strides, pads_begin, pads_end, dilations, auto_pad); - ASSERT_TRUE(conv->get_output_partial_shape(0).same_scheme({PartialShape::dynamic()})); + ASSERT_TRUE(conv->get_output_partial_shape(0).same_scheme( + {Dimension::dynamic(), 1, Dimension::dynamic(), Dimension::dynamic()})); ASSERT_EQ(conv->get_pads_begin(), (CoordinateDiff{})); ASSERT_EQ(conv->get_pads_end(), (CoordinateDiff{})); } diff --git a/ngraph/test/type_prop/deformable_psroi_pooling.cpp b/ngraph/test/type_prop/deformable_psroi_pooling.cpp index 3c1509f2518e3d..61474d800a5e6a 100644 --- a/ngraph/test/type_prop/deformable_psroi_pooling.cpp +++ b/ngraph/test/type_prop/deformable_psroi_pooling.cpp @@ -9,55 +9,154 @@ using namespace std; using namespace ngraph; -TEST(type_prop, deformable_psroi_pooling_output_shape) +TEST(type_prop, deformable_psroi_pooling_no_offsets_group_size_3) { - auto input = make_shared(element::f32, Shape{1, 1024, 63, 38}); - auto coords = make_shared(element::f32, Shape{300, 5}); - auto offsets = make_shared(element::f32, Shape{1, 2, 3, 4}); + const float spatial_scale = 0.0625; const int64_t output_dim = 882; + const int64_t group_size = 3; + + const auto rois_dim = 300; + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); + + auto def_psroi_pool = make_shared( + input_data, input_coords, output_dim, spatial_scale, group_size); + + const PartialShape expected_output{rois_dim, output_dim, group_size, group_size}; + ASSERT_EQ(def_psroi_pool->get_output_partial_shape(0), expected_output); +} + +TEST(type_prop, deformable_psroi_pooling_group_size_3) +{ const float spatial_scale = 0.0625; + const int64_t output_dim = 882; const int64_t group_size = 3; + const int64_t part_size = 3; + const double spatial_bins = 4; + + const auto rois_dim = 300; + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); + auto input_offsets = make_shared(element::f32, PartialShape{rois_dim, 2, part_size, part_size}); auto def_psroi_pool = make_shared( - input, coords, offsets, output_dim, spatial_scale, group_size); + input_data, input_coords, input_offsets, output_dim, spatial_scale, group_size, "bilinear_deformable", spatial_bins, spatial_bins, 0.1, part_size); - ASSERT_EQ(def_psroi_pool->get_output_shape(0), (Shape{300, 882, 3, 3})); + const PartialShape expected_output{rois_dim, output_dim, group_size, group_size}; + ASSERT_EQ(def_psroi_pool->get_output_partial_shape(0), expected_output); } -TEST(type_prop, deformable_psroi_pooling_output_shape_2) +TEST(type_prop, deformable_psroi_pooling_group_size_7) { - auto input = make_shared(element::f32, Shape{1, 7938, 38, 38}); - auto coords = make_shared(element::f32, Shape{300, 5}); - auto offsets = make_shared(element::f32, Shape{1, 2, 3, 4}); - const int64_t output_dim = 162; const float spatial_scale = 0.0625; + const int64_t output_dim = 162; const int64_t group_size = 7; + const int64_t part_size = 7; + const double spatial_bins = 4; + + const 
auto rois_dim = 300; + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); + auto input_offsets = make_shared(element::f32, PartialShape{rois_dim, 2, part_size, part_size}); + + auto def_psroi_pool = make_shared( + input_data, input_coords, input_offsets, output_dim, spatial_scale, group_size, "bilinear_deformable", spatial_bins, spatial_bins, 0.1, part_size); + + const PartialShape expected_output{rois_dim, output_dim, group_size, group_size}; + ASSERT_EQ(def_psroi_pool->get_output_partial_shape(0), expected_output); +} + +TEST(type_prop, deformable_psroi_pooling_dynamic_rois) +{ + const float spatial_scale = 0.0625; + const int64_t output_dim = 882; + const int64_t group_size = 3; + + const auto rois_dim = Dimension(100, 200); + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); auto def_psroi_pool = make_shared( - input, coords, offsets, output_dim, spatial_scale, group_size); + input_data, input_coords, output_dim, spatial_scale, group_size); - ASSERT_EQ(def_psroi_pool->get_output_shape(0), (Shape{300, 162, 7, 7})); + const PartialShape expected_output{rois_dim, output_dim, group_size, group_size}; + ASSERT_EQ(def_psroi_pool->get_output_partial_shape(0), expected_output); } -TEST(type_prop, deformable_psroi_pooling_invalid_input_rank) +TEST(type_prop, deformable_psroi_pooling_fully_dynamic) { - auto input = make_shared(element::f32, Shape{1, 2, 3}); - auto coords = make_shared(element::f32, Shape{1, 2}); - auto offsets = make_shared(element::f32, Shape{1, 2, 3, 4}); - const int64_t output_dim = 4; - const float spatial_scale = 0.9; - const int64_t group_size = 7; + const float spatial_scale = 0.0625; + const int64_t output_dim = 882; + const int64_t group_size = 3; + + const auto rois_dim = Dimension::dynamic(); + + auto input_data = make_shared(element::f32, PartialShape::dynamic()); + auto input_coords = make_shared(element::f32, PartialShape::dynamic()); + + auto def_psroi_pool = make_shared( + input_data, input_coords, output_dim, spatial_scale, group_size); + + const PartialShape expected_output{rois_dim, output_dim, group_size, group_size}; + ASSERT_EQ(def_psroi_pool->get_output_partial_shape(0), expected_output); +} + +TEST(type_prop, deformable_psroi_pooling_invalid_group_size) +{ + const float spatial_scale = 0.0625; + const int64_t output_dim = 882; + const auto rois_dim = 300; try { + const int64_t group_size = 0; + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); auto def_psroi_pool = make_shared( - input, coords, offsets, output_dim, spatial_scale, group_size); + input_data, input_coords, output_dim, spatial_scale, group_size); + + FAIL() << "Invalid group_size not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), std::string("Value of `group_size` attribute has to be greater than 0")); + } + catch (...) 
+ { + FAIL() << "Unknown exception was thrown"; + } +} + +TEST(type_prop, deformable_psroi_pooling_invalid_data_input_rank) +{ + const float spatial_scale = 0.0625; + const int64_t output_dim = 162; + const int64_t group_size = 7; + const int64_t part_size = 7; + const double spatial_bins = 4; + + const auto rois_dim = 300; + + auto input_data = make_shared(element::f32, PartialShape{7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); + auto input_offsets = make_shared(element::f32, PartialShape{rois_dim, 2, part_size, part_size}); + + try + { + auto def_psroi_pool = make_shared( + input_data, input_coords, input_offsets, output_dim, spatial_scale, group_size, "bilinear_deformable", spatial_bins, spatial_bins, 0.1, part_size); + // Should have thrown, so fail if it didn't - FAIL() << "Ivalid feature map input rank not detected"; + FAIL() << "Invalid first input rank not detected"; } catch (const NodeValidationFailure& error) { EXPECT_HAS_SUBSTRING(error.what(), - std::string("Feature map input rank must equal to 4 (input rank: 3)")); + std::string("First input rank must be compatible with 4 (input rank: 3)")); } catch (...) { @@ -67,24 +166,26 @@ TEST(type_prop, deformable_psroi_pooling_invalid_input_rank) TEST(type_prop, deformable_psroi_pooling_invalid_box_coordinates_rank) { - auto input = make_shared(element::f32, Shape{1, 2, 3, 4}); - auto coords = make_shared(element::f32, Shape{1, 2, 3}); - auto offsets = make_shared(element::f32, Shape{1, 2, 3, 4}); const int64_t output_dim = 4; const float spatial_scale = 0.9; const int64_t group_size = 7; + + const auto rois_dim = 300; + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{2, rois_dim, 5}); try { auto def_psroi_pool = make_shared( - input, coords, offsets, output_dim, spatial_scale, group_size); + input_data, input_coords, output_dim, spatial_scale, group_size); // Should have thrown, so fail if it didn't - FAIL() << "Ivalid box coordinates input rank not detected"; + FAIL() << "Invalid second input rank not detected"; } catch (const NodeValidationFailure& error) { EXPECT_HAS_SUBSTRING( error.what(), - std::string("Box coordinates input rank must equal to 2 (input rank: 3)")); + std::string("Second input rank must be compatible with 2 (input rank: 3)")); } catch (...) 
{ @@ -94,23 +195,29 @@ TEST(type_prop, deformable_psroi_pooling_invalid_box_coordinates_rank) TEST(type_prop, deformable_psroi_pooling_invalid_offstes_rank) { - auto input = make_shared(element::f32, Shape{1, 2, 3, 4}); - auto coords = make_shared(element::f32, Shape{1, 2}); - auto offsets = make_shared(element::f32, Shape{1, 2, 3, 4, 5}); - const int64_t output_dim = 4; - const float spatial_scale = 0.9; + const float spatial_scale = 0.0625; + const int64_t output_dim = 162; const int64_t group_size = 7; + const int64_t part_size = 7; + const double spatial_bins = 4; + + const auto rois_dim = 300; + + auto input_data = make_shared(element::f32, PartialShape{2, 7938, 63, 38}); + auto input_coords = make_shared(element::f32, PartialShape{rois_dim, 5}); + auto input_offsets = make_shared(element::f32, PartialShape{2, rois_dim, 2, part_size, part_size}); try { - auto def_psroi_pool = make_shared( - input, coords, offsets, output_dim, spatial_scale, group_size); - // Should have thrown, so fail if it didn't - FAIL() << "Offsets input rank not detected"; + auto def_psroi_pool = make_shared( + input_data, input_coords, input_offsets, output_dim, spatial_scale, group_size, "bilinear_deformable", spatial_bins, spatial_bins, 0.1, part_size); + + // Should have thrown, so fail if it didn't + FAIL() << "Invalid third input rank not detected"; } catch (const NodeValidationFailure& error) { EXPECT_HAS_SUBSTRING(error.what(), - std::string("Offsets input rank must equal to 4 (input rank: 5)")); + std::string("Third input rank must be compatible with 4 (input rank: 5)")); } catch (...) { diff --git a/ngraph/test/type_prop/gather.cpp b/ngraph/test/type_prop/gather.cpp index 151ef437e9c519..e74b809f416920 100644 --- a/ngraph/test/type_prop/gather.cpp +++ b/ngraph/test/type_prop/gather.cpp @@ -11,6 +11,8 @@ NGRAPH_SUPPRESS_DEPRECATED_START using namespace std; using namespace ngraph; +// ------------------------------ V1 ------------------------------ + TEST(type_prop, gather_axis_0) { Shape params_shape{3, 2}; @@ -92,3 +94,329 @@ TEST(type_prop, gather_v1_negative_axis) auto gather_v1 = make_shared(params, indices, axis_node); ASSERT_EQ(gather_v1->get_axis(), 1); } + +// ------------------------------ V7 ------------------------------ + +TEST(type_prop, gather_7_axis_0) +{ + PartialShape data_shape{3, 2}; + PartialShape indices_shape{2, 2}; + PartialShape out_shape{2, 2, 2}; + int64_t batch_dims = 0; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {0}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); + ASSERT_EQ(G->get_axis(), 0); +} + +TEST(type_prop, gather_7_axis_1) +{ + PartialShape data_shape{3, 3}; + PartialShape indices_shape{1, 2}; + PartialShape out_shape{3, 1, 2}; + int64_t axis = 1; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i32, indices_shape); + auto A = op::Constant::create(element::i64, Shape{}, {axis}); + auto G = make_shared(D, I, A); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); + ASSERT_EQ(G->get_axis(), 1); +} + +TEST(type_prop, gather_7_negative_axis) +{ + PartialShape data_shape{5, 6, 7}; + PartialShape indices_shape{4}; + PartialShape out_shape{5, 4, 7}; + int64_t axis = -2; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, 
indices_shape); + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + auto G = make_shared(D, I, A); + + ASSERT_EQ(G->get_axis(), 1); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_batch_dims_1_axis_3) +{ + PartialShape data_shape{Dimension(1, 7), Dimension(1, 3), 200, 400}; + PartialShape indices_shape{Dimension(7, 10), Dimension(2, 10), 3, 8}; + PartialShape out_shape{7, Dimension(1, 3), 200, Dimension(2, 10), 3, 8}; + int64_t axis = 3; + int64_t batch_dims = 1; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_dynamic_batch_dim) +{ + PartialShape data_shape{Dimension(1, 7), 20, 20}; + PartialShape indices_shape{Dimension(7, 10), 3, 8}; + PartialShape out_shape{7, 3, 8, 20}; + int64_t axis = 1; + int64_t batch_dims = 1; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_dynamic_2d_batch_dim) +{ + PartialShape data_shape{Dimension(1, 7), Dimension(1, 3), 200, 400}; + PartialShape indices_shape{Dimension(7, 10), Dimension(2, 10), 3, 8}; + PartialShape out_shape{7, Dimension(2, 3), 3, 8, 400}; + int64_t axis = 2; + int64_t batch_dims = 2; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_dynamic_2d_batch_dim_axis_3) +{ + PartialShape data_shape{Dimension(1, 7), Dimension(1, 3), 200, 400}; + PartialShape indices_shape{Dimension(7, 10), Dimension(2, 10), 3, 8}; + PartialShape out_shape{7, Dimension(2, 3), 200, 3, 8}; + int64_t axis = 3; + int64_t batch_dims = 2; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_dynamic_data_indices_rank) +{ + PartialShape data_shape{Dimension(1, 7), Dimension(1, 3), 200, 400}; + PartialShape indices_shape = PartialShape::dynamic(); + PartialShape out_shape = PartialShape::dynamic(); + int64_t axis = 3; + int64_t batch_dims = 2; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_axis_not_set) +{ + PartialShape data_shape{1, 1, 200, 400}; + PartialShape indices_shape{2, 2}; + // default batch_dims = 0 + PartialShape out_shape = PartialShape::dynamic(5); // out_rank = data_rank + indices_rank - 1 - batch_dims + + auto D = 
make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::f32, Shape{1}); + auto G = make_shared(D, I, A); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_axis_not_set_positive_batch_dims) +{ + PartialShape data_shape{2, 1, 200, 400}; + PartialShape indices_shape{2, 2}; + int64_t batch_dims = 1; + PartialShape out_shape = PartialShape({2, + Dimension::dynamic(), + Dimension::dynamic(), + Dimension::dynamic()}); + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::f32, Shape{1}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +TEST(type_prop, gather_7_axis_not_set_negative_batch) +{ + PartialShape data_shape{1, 1, 200, 400}; + PartialShape indices_shape{2, 2}; + int64_t batch_dims = -1; + // negative batch_dims together with unknown axis could mean any value + // within the intervals [0, data_rank] && [0, indices_rank] so out_rank will be dynamic with the range + // out_rank = data_rank + indices_rank - 1 - interval(0, max(data_rank, indices_rank)) + PartialShape out_shape = PartialShape::dynamic(Dimension(2, 5)); + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + auto A = make_shared(element::f32, Shape{1}); + auto G = make_shared(D, I, A, batch_dims); + + ASSERT_EQ(G->get_element_type(), element::f32); + ASSERT_EQ(G->get_output_partial_shape(0), out_shape); +} + +// --------------------- Negative tests ------------------------------ + +TEST(type_prop, gather_7_incorrect_axis_shape) +{ + auto D = make_shared(element::f32, Shape{5, 6}); + auto I = make_shared(element::i64, Shape{4}); + auto A = make_shared(element::i64, Shape{2}); + + try + { + auto G = make_shared(D, I, A); + // Should have thrown, so fail if it didn't + FAIL() << "Incorrect A input shape"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + std::string("Axes input must be scalar or have 1 element")); + } + catch (...) + { + FAIL() << "Deduced type check failed for unexpected reason"; + } +} + +TEST(type_prop, gather_7_axis_out_of_input_rank) +{ + auto D = make_shared(element::f32, Shape{5, 6}); + auto I = make_shared(element::i64, Shape{4}); + auto A = make_shared(element::i64, Shape{1}, vector{2}); + int64_t batch_dims = 0; + try + { + auto G = make_shared(D, I, A, batch_dims); + // Should have thrown, so fail if it didn't + FAIL() << "axis check failed"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING( + error.what(), std::string("The axis must be => 0 and < data_rank. But instead got")); + } + catch (...) 
+ { + FAIL() << "Deduced type check failed for unexpected reason"; + } +} + +TEST(type_prop, gather_7_dynamic_batch_dims_inconsistent) +{ + PartialShape data_shape{Dimension(1, 7), 20, 20}; + PartialShape indices_shape{Dimension(8, 10), 3, 8}; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + int64_t axis = 1; + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + int64_t batch_dims = 1; + + try + { + auto G = make_shared(D, I, A, batch_dims); + // Should have thrown, so fail if it didn't + FAIL() << "Shape inconsistency check for dynamic PartialShape failed"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING( + error.what(), + std::string("data and indices must have equal or intersecting sizes until batch_dims")); + } + catch (...) + { + FAIL() << "Deduced type check failed for unexpected reason"; + } +} + +TEST(type_prop, gather_7_batch_dims_less_check) +{ + PartialShape data_shape{1, 20, 20}; + PartialShape indices_shape{1, 3, 8}; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + int64_t axis = 1; + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + int64_t batch_dims = 2; + + try + { + auto G = make_shared(D, I, A, batch_dims); + // Should have thrown, so fail if it didn't + FAIL() << "batch_dims check failed"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING( + error.what(), + std::string("batch_dims <= axis. But instead got: batch_dims =")); + } + catch (...) + { + FAIL() << "Deduced type check failed for unexpected reason"; + } +} + +TEST(type_prop, gather_7_batch_dims_less_indices_rank_check) +{ + PartialShape data_shape{1, 20, 20, 22, 22}; + PartialShape indices_shape{1, 3}; + + auto D = make_shared(element::f32, data_shape); + auto I = make_shared(element::i64, indices_shape); + int64_t axis = 4; + auto A = make_shared(element::i64, Shape{1}, vector{axis}); + int64_t batch_dims = 3; + + try + { + auto G = make_shared(D, I, A, batch_dims); + // Should have thrown, so fail if it didn't + FAIL() << "batch_dims check failed"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING( + error.what(), + std::string("batch_dims must be <= indices_rank")); + } + catch (...) 
+ { + FAIL() << "Deduced type check failed for unexpected reason"; + } +} diff --git a/ngraph/test/type_prop/group_convolution.cpp b/ngraph/test/type_prop/group_convolution.cpp index 50d1041ccb5008..71dacdafe380da 100644 --- a/ngraph/test/type_prop/group_convolution.cpp +++ b/ngraph/test/type_prop/group_convolution.cpp @@ -9,39 +9,41 @@ using namespace std; using namespace ngraph; -TEST(type_prop, group_conv_v1_partial_auto_padding_same_lower) +TEST(type_prop, group_convolution_auto_padding_same_lower) { - const PartialShape data_batch_shape{1, 4, 5, 5}; - const PartialShape filters_shape{2, 1, 2, 3, 3}; + const PartialShape data_batch_pshape{1, 4, 5, 5}; + const PartialShape filters_pshape{2, 1, 2, 3, 3}; + element::Type_t et = element::f32; Strides strides{1, 1}; CoordinateDiff pads_begin{0, 0}; CoordinateDiff pads_end{0, 0}; Strides dilations{1, 1}; const auto auto_pad = op::PadType::SAME_LOWER; - auto data_batch = make_shared(element::f32, data_batch_shape); - auto filters = make_shared(element::f32, filters_shape); + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); - auto conv = make_shared( + auto groupConv = make_shared( data_batch, filters, strides, pads_begin, pads_end, dilations, auto_pad); - ASSERT_TRUE(conv->get_output_partial_shape(0).same_scheme(PartialShape{1, 2, 5, 5})); - ASSERT_EQ(conv->get_pads_begin(), (CoordinateDiff{1, 1})); - ASSERT_EQ(conv->get_pads_end(), (CoordinateDiff{1, 1})); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme(PartialShape{1, 2, 5, 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{1, 1})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{1, 1})); } -TEST(type_prop, group_conv_v1_partial_auto_padding_same_upper) +TEST(type_prop, group_convolution_auto_padding_same_upper) { - const PartialShape data_batch_shape{1, 4, 5, 5}; - const PartialShape filters_shape{2, 1, 2, 2, 2}; + const PartialShape data_batch_pshape{1, 4, 5, 5}; + const PartialShape filters_pshape{2, 1, 2, 2, 2}; + element::Type_t et = element::f32; Strides strides{1, 1}; CoordinateDiff pads_begin{0, 0}; CoordinateDiff pads_end{0, 0}; Strides dilations{1, 1}; const auto auto_pad = op::PadType::SAME_UPPER; - auto data_batch = make_shared(element::f32, data_batch_shape); - auto filters = make_shared(element::f32, filters_shape); + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); auto conv = make_shared( data_batch, filters, strides, pads_begin, pads_end, dilations, auto_pad); @@ -51,65 +53,476 @@ TEST(type_prop, group_conv_v1_partial_auto_padding_same_upper) ASSERT_EQ(conv->get_pads_end(), (CoordinateDiff{1, 1})); } -TEST(type_prop, group_conv_v1_partial_auto_padding_same_lower_nc_dims_dynamic) +TEST(type_prop, group_convolution_auto_padding_same_lower_spatial_dims_static) { - const PartialShape data_batch_shape{Dimension::dynamic(), Dimension::dynamic(), 5, 5}; - const PartialShape filters_shape{2, 1, 2, 3, 3}; - Strides strides{1, 1}; - CoordinateDiff pads_begin{0, 0}; - CoordinateDiff pads_end{0, 0}; - Strides dilations{1, 1}; + const PartialShape data_batch_pshape{Dimension::dynamic(), Dimension::dynamic(), 5, 5}; + const PartialShape filters_pshape{ + Dimension::dynamic(), Dimension::dynamic(), Dimension::dynamic(), 3, 3}; + const element::Type_t et = element::f32; const auto auto_pad = op::PadType::SAME_LOWER; - auto data_batch = make_shared(element::f32, data_batch_shape); - auto filters = make_shared(element::f32, filters_shape); - - auto 
conv = make_shared( - data_batch, filters, strides, pads_begin, pads_end, dilations, auto_pad); + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); - ASSERT_TRUE(conv->get_output_partial_shape(0).same_scheme({Dimension::dynamic(), 2, 5, 5})); - ASSERT_EQ(conv->get_pads_begin(), (CoordinateDiff{1, 1})); - ASSERT_EQ(conv->get_pads_end(), (CoordinateDiff{1, 1})); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + {Dimension::dynamic(), Dimension::dynamic(), 5, 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{1, 1})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{1, 1})); } -TEST(type_prop, group_conv_v1_partial_auto_padding_same_upper_nc_dims_dynamic) +TEST(type_prop, group_convolution_auto_padding_same_upper_spatial_dims_static) { - const PartialShape data_batch_shape{Dimension::dynamic(), Dimension::dynamic(), 5, 5}; - const PartialShape filters_shape{2, 1, 2, 2, 2}; - Strides strides{1, 1}; - CoordinateDiff pads_begin{0, 0}; - CoordinateDiff pads_end{0, 0}; - Strides dilations{1, 1}; + const PartialShape data_batch_pshape{1, Dimension::dynamic(), 5, 5}; + const PartialShape filters_pshape{ + Dimension::dynamic(), Dimension::dynamic(), Dimension::dynamic(), 2, 2}; + const element::Type_t et = element::f32; const auto auto_pad = op::PadType::SAME_UPPER; - auto data_batch = make_shared(element::f32, data_batch_shape); - auto filters = make_shared(element::f32, filters_shape); + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); - auto conv = make_shared( - data_batch, filters, strides, pads_begin, pads_end, dilations, auto_pad); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + PartialShape{1, Dimension::dynamic(), 5, 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{0, 0})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{1, 1})); +} - ASSERT_TRUE(conv->get_output_partial_shape(0).same_scheme({Dimension::dynamic(), 2, 5, 5})); - ASSERT_EQ(conv->get_pads_begin(), (CoordinateDiff{0, 0})); - ASSERT_EQ(conv->get_pads_end(), (CoordinateDiff{1, 1})); +TEST(type_prop, group_convolution_static_ranks_filters_groups_dyn) +{ + const PartialShape data_batch_pshape{Dimension::dynamic(), 4, 5, 5}; + const PartialShape filters_pshape{Dimension::dynamic(), 1, 2, 3, 3}; + const element::Type_t et = element::f32; + const auto auto_pad = op::PadType::SAME_LOWER; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); + + ASSERT_TRUE( + groupConv->get_output_partial_shape(0).same_scheme({Dimension::dynamic(), 2, 5, 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{1, 1})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{1, 1})); } -TEST(type_prop, group_conv_v1_partial_auto_padding_same_spatial_dims_dynamic) +TEST(type_prop, group_convolution_static_ranks_filters_groups_cout_dyn) { - const PartialShape data_batch_shape{1, 4, Dimension::dynamic(), 5}; - const PartialShape filters_shape{2, 1, 2, 3, 3}; - Strides strides{1, 1}; - CoordinateDiff pads_begin{0, 0}; - CoordinateDiff pads_end{0, 0}; - 
Strides dilations{1, 1}; + const PartialShape data_batch_pshape{Dimension::dynamic(), 4, 5, 5}; + const PartialShape filters_pshape{Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}; + const element::Type_t et = element::f32; const auto auto_pad = op::PadType::SAME_LOWER; - auto data_batch = make_shared(element::f32, data_batch_shape); - auto filters = make_shared(element::f32, filters_shape); + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); - auto conv = make_shared( - data_batch, filters, strides, pads_begin, pads_end, dilations, auto_pad); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + {Dimension::dynamic(), Dimension::dynamic(), 5, 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{1, 1})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{1, 1})); +} + +TEST(type_prop, group_convolution_static_ranks_data_cin_filters_group_dyn) +{ + const PartialShape data_batch_pshape{Dimension::dynamic(), Dimension::dynamic(), 5, 5}; + const PartialShape filters_pshape{Dimension::dynamic(), 1, 2, 3, 3}; + const element::Type_t et = element::f32; + const auto auto_pad = op::PadType::SAME_LOWER; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); + + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + {Dimension::dynamic(), Dimension::dynamic(), 5, 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{1, 1})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{1, 1})); +} + +TEST(type_prop, group_convolution_auto_padding_same_spatial_dims_dynamic) +{ + const PartialShape data_batch_pshape{1, 4, Dimension::dynamic(), 5}; + const PartialShape filters_pshape{2, 1, 2, 3, 3}; + const element::Type_t et = element::f32; + const auto auto_pad = op::PadType::SAME_LOWER; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); + + ASSERT_TRUE( + groupConv->get_output_partial_shape(0).same_scheme({1, 2, Dimension::dynamic(), 5})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{0, 1})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{0, 1})); +} + +TEST(type_prop, group_convolution_data_batch_dynamic) +{ + const PartialShape data_batch_pshape{PartialShape::dynamic()}; + const PartialShape filters_pshape{2, 1, 2, 3, 3}; + const element::Type_t et = element::f32; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + + ASSERT_EQ(groupConv->get_auto_pad(), op::PadType::EXPLICIT); + ASSERT_EQ(groupConv->get_strides(), (Strides{1, 1})); + ASSERT_EQ(groupConv->get_dilations(), (Strides{1, 1})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{0, 0})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{0, 0})); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + PartialShape{Dimension::dynamic(), 2, Dimension::dynamic(), Dimension::dynamic()})); +} + +TEST(type_prop, group_convolution_filters_dynamic_auto_pad_explicit) 
+{ + const PartialShape data_batch_pshape{1, 4, 5, 5}; + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f16; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + + ASSERT_EQ(groupConv->get_auto_pad(), op::PadType::EXPLICIT); + ASSERT_EQ(groupConv->get_strides(), (Strides{1, 1})); + ASSERT_EQ(groupConv->get_dilations(), (Strides{1, 1})); + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{0, 0})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{0, 0})); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + PartialShape{1, Dimension::dynamic(), Dimension::dynamic(), Dimension::dynamic()})); +} + +TEST(type_prop, group_convolution_filters_dynamic_auto_pad_same) +{ + const PartialShape data_batch_pshape{1, 4, 5, 5}; + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f16; + const auto auto_pad = op::PadType::SAME_LOWER; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}, auto_pad); + + ASSERT_EQ(groupConv->get_auto_pad(), op::PadType::SAME_LOWER); + // pads should be as default since filters shape is dynamic + ASSERT_EQ(groupConv->get_pads_begin(), (CoordinateDiff{0, 0})); + ASSERT_EQ(groupConv->get_pads_end(), (CoordinateDiff{0, 0})); + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme( + PartialShape{1, Dimension::dynamic(), Dimension::dynamic(), Dimension::dynamic()})); +} + +TEST(type_prop, group_convolution_data_batch_and_filters_dynamic) +{ + const PartialShape dyn_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f32; + + auto data_batch = make_shared(et, dyn_pshape); + auto filters = make_shared(et, dyn_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + + ASSERT_TRUE(groupConv->get_output_partial_shape(0).same_scheme(PartialShape::dynamic())); +} + +TEST(type_prop, group_convolution_invalid_et_inputs) +{ + const PartialShape data_batch_pshape{1, 4, 5, 5}; + const PartialShape filters_pshape{2, 1, 2, 3, 3}; + + try + { + auto data_batch = make_shared(element::f16, data_batch_pshape); + auto filters = make_shared(element::f32, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data batch and filters must be of same element type + FAIL() << "Invalid element type of inputs not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Element types for data batch and filters do not match"); + } + catch (...) 
+ { + FAIL() << "Element types of data batch and filters validation check failed for unexpected " + "reason."; + } + + try + { + const element::Type boolean_et = element::boolean; + auto data_batch = make_shared(boolean_et, data_batch_pshape); + auto filters = make_shared(boolean_et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data batch and filters must be of numeric element type + FAIL() << "Boolean element type of inputs not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Element type of inputs must be numeric"); + } + catch (...) + { + FAIL() << "Numeric element types of data batch and filters validation check failed for " + "unexpected reason."; + } +} + +TEST(type_prop, group_convolution_invalid_input_ranks) +{ + const element::Type_t et = element::f32; + + // data partial shape provided is rank 4 (Conv2D) + // filter partial shape provided is rank 6 (Conv3D) + try + { + auto filters = + make_shared(et, PartialShape{2, 8, 2, 3, 3, Dimension::dynamic()}); + auto data = make_shared(et, PartialShape{1, 16, 6, 6}); + auto groupConv = make_shared( + data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data and weight have incompatible ranks + FAIL() << "Incompatible input ranks not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + std::string("Shapes for data batch and filters do not match.")); + } + catch (...) + { + FAIL() << "Rank validation check of inputs failed for unexpected reason"; + } + + // data partial shape provided is rank 5 (Conv3D) + // filter partial shape provided is rank 5 (Conv2D) + try + { + const auto filters = make_shared(et, PartialShape{2, 8, 2, 3, 3}); + const auto data = + make_shared(et, PartialShape{1, Dimension::dynamic(), 16, 6, 6}); + const auto groupConv = make_shared( + data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data and weight have incompatible ranks + FAIL() << "Incompatible input ranks not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + std::string("Shapes for data batch and filters do not match.")); + } + catch (...) + { + FAIL() << "Rank validation check of inputs failed for unexpected reason"; + } +} + +TEST(type_prop, group_convolution_invalid_input_channel_dims) +{ + try + { + const PartialShape data_batch_pshape{1, 6, 5, 5}; + const PartialShape filters_pshape{2, 1, 2, 3, 3}; + element::Type_t et = element::f32; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data batch shape does not have correct dimension C_IN * GROUPS + FAIL() << "Invalid input channels dimension of data batch not detected."; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Input channels dimension of data batch has incompatible value " + "with filter shape."); + } + catch (...) 
+ { + FAIL() << "Input channels dimension of data batch validation check failed for unexpected " + "reason."; + } + + try + { + const PartialShape data_batch_pshape{1, 3, 5, 5}; + const PartialShape filters_pshape{2, 1, Dimension::dynamic(), 3, 3}; + element::Type_t et = element::f32; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data batch shape does not have correct dimension C_IN * GROUPS + FAIL() << "Invalid input channels dimension of data batch not detected."; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Input channels dimension of data batch not a multiple of group size"); + } + catch (...) + { + FAIL() << "Input channels dimension of data batch validation check failed for unexpected " + "reason."; + } +} + +TEST(type_prop, group_convolution_invalid_conv_param_spatial_dims) +{ + const PartialShape data_batch_pshape{1, 4, 5, 5}; + const PartialShape filters_pshape{2, 1, 2, 2, 2}; + const element::Type_t et = element::f32; + + // invalid strides spatial dimensions + try + { + Strides strides{1, 1, 1}; + Strides dilations{1, 1}; + CoordinateDiff pads_begin{0, 0}; + CoordinateDiff pads_end{0, 0}; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto groupConv = make_shared( + data_batch, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid strides spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Strides should be defined for all and only spatial features."); + } + catch (...) + { + FAIL() << "Strides spatial dimensions validation check failed for unexpected reason"; + } + try + { + Strides strides{1}; + Strides dilations{1, 1}; + CoordinateDiff pads_begin{0, 0}; + CoordinateDiff pads_end{0, 0}; + + auto data_batch = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid strides spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Strides should be defined for all and only spatial features."); + } + catch (...) + { + FAIL() << "Strides spatial dimensions validation check failed for unexpected reason"; + } + + // invalid dilations spatial dimensions + try + { + Strides strides{1, 1}; + Strides dilations{1}; + CoordinateDiff pads_begin{0, 0}; + CoordinateDiff pads_end{0, 0}; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto groupConv = make_shared( + data_batch, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid dilations spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Dilations should be defined for all and only spatial features."); + } + catch (...) 
+ { + FAIL() << "Dilations spatial dimensions validation check failed for unexpected reason"; + } + try + { + Strides strides{1, 1}; + Strides dilations{1, 1, 1}; + CoordinateDiff pads_begin{0, 0}; + CoordinateDiff pads_end{0, 0}; + + auto data_batch = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid dilations spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Dilations should be defined for all and only spatial features."); + } + catch (...) + { + FAIL() << "Dilations spatial dimensions validation check failed for unexpected reason"; + } + + // invalid padding spatial dimensions + try + { + Strides strides{1, 1}; + Strides dilations{1, 1}; + CoordinateDiff pads_begin{0, 0, 0}; + CoordinateDiff pads_end{0, 0}; + + auto data_batch = make_shared(et, data_batch_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto groupConv = make_shared( + data_batch, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid padding spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Pads should be defined for all and only spatial features."); + } + catch (...) + { + FAIL() << "Padding spatial dimensions validation check failed for unexpected reason"; + } + try + { + Strides strides{1, 1}; + Strides dilations{1, 1}; + CoordinateDiff pads_begin{0, 0}; + CoordinateDiff pads_end{0}; - ASSERT_TRUE(conv->get_output_partial_shape(0).same_scheme({1, 2, Dimension::dynamic(), 5})); - ASSERT_EQ(conv->get_pads_begin(), (CoordinateDiff{0, 1})); - ASSERT_EQ(conv->get_pads_end(), (CoordinateDiff{0, 1})); + auto data_batch = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto groupConv = make_shared( + data_batch, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid padding spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Pads should be defined for all and only spatial features."); + } + catch (...) 
+ { + FAIL() << "Padding spatial dimensions validation check failed for unexpected reason"; + } } diff --git a/ngraph/test/type_prop/group_convolution_backprop_data.cpp b/ngraph/test/type_prop/group_convolution_backprop_data.cpp index 0e0897d91f823f..b7e77236d8f710 100644 --- a/ngraph/test/type_prop/group_convolution_backprop_data.cpp +++ b/ngraph/test/type_prop/group_convolution_backprop_data.cpp @@ -9,14 +9,17 @@ using namespace std; using namespace ngraph; -TEST(type_prop, group_conv_backprop_data) +TEST(type_prop, group_convolution_backprop_data_shape_infer) { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{2, 8, 2, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 6, 6}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 6, 6}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{2, 8, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + EXPECT_EQ(gcbd->get_element_type(), element::f32); EXPECT_EQ(gcbd->get_output_shape(0), (Shape{1, 4, 8, 8})); EXPECT_EQ(gcbd->get_strides(), (Strides{1, 1})); @@ -27,50 +30,60 @@ TEST(type_prop, group_conv_backprop_data) EXPECT_EQ(gcbd->get_auto_pad(), op::PadType::EXPLICIT); } -TEST(type_prop, group_conv_backprop_data_output_shape_as_const) +TEST(type_prop, group_convolution_backprop_data_shape_infer_with_output_shape_as_const) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); - EXPECT_EQ(gcbd->get_element_type(), element::f32); - EXPECT_EQ(gcbd->get_output_shape(0), (Shape{1, 2, 3, 3})); - EXPECT_EQ(gcbd->get_strides(), (Strides{1, 1})); - EXPECT_EQ(gcbd->get_dilations(), (Strides{1, 1})); - EXPECT_EQ(gcbd->get_pads_begin(), (CoordinateDiff{2, 2})); - EXPECT_EQ(gcbd->get_pads_end(), (CoordinateDiff{2, 2})); - EXPECT_EQ(gcbd->get_output_padding(), (CoordinateDiff{0, 0})); - EXPECT_EQ(gcbd->get_auto_pad(), op::PadType::SAME_UPPER); + + ASSERT_EQ(gcbd->get_element_type(), element::f32); + ASSERT_EQ(gcbd->get_output_shape(0), (Shape{1, 2, 3, 3})); + ASSERT_EQ(gcbd->get_strides(), (Strides{1, 1})); + ASSERT_EQ(gcbd->get_dilations(), (Strides{1, 1})); + ASSERT_EQ(gcbd->get_pads_begin(), (CoordinateDiff{2, 2})); + ASSERT_EQ(gcbd->get_pads_end(), (CoordinateDiff{2, 2})); + ASSERT_EQ(gcbd->get_output_padding(), (CoordinateDiff{0, 0})); + ASSERT_EQ(gcbd->get_auto_pad(), op::PadType::SAME_UPPER); } -TEST(type_prop, group_conv_backprop_data_output_shape_as_param) +TEST(type_prop, 
group_convolution_backprop_data_shape_infer_with_output_shape_as_param) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - const auto output_shape = make_shared(element::i64, Shape{2}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = make_shared(element::i64, Shape{2}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); - EXPECT_EQ(gcbd->get_element_type(), element::f32); - EXPECT_EQ(gcbd->get_auto_pad(), op::PadType::SAME_UPPER); + + ASSERT_EQ(gcbd->get_element_type(), element::f32); + ASSERT_EQ(gcbd->get_auto_pad(), op::PadType::SAME_UPPER); ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme( PartialShape{1, 2, Dimension::dynamic(), Dimension::dynamic()})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shape_inference_1) +TEST(type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_static_ranks_data_nc_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared( - element::f32, PartialShape{Dimension::dynamic(), Dimension::dynamic(), 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{ + Dimension::dynamic(), Dimension::dynamic(), 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -78,17 +91,20 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shap gcbd->get_output_partial_shape(0).same_scheme(PartialShape{Dimension::dynamic(), 2, 3, 3})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shape_inference_2) +TEST(type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_static_ranks_filters_group_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 16, 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{Dimension::dynamic(), 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), 16, 2, 3, 3}; // 
[GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -96,17 +112,21 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shap gcbd->get_output_partial_shape(0).same_scheme(PartialShape{Dimension::dynamic(), 2, 3, 3})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shape_inference_3) +TEST( + type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_static_ranks_filters_group_cin_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 16, 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared( - element::f32, PartialShape{Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{Dimension::dynamic(), 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -114,17 +134,21 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shap PartialShape{Dimension::dynamic(), Dimension::dynamic(), 3, 3})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shape_inference_4) +TEST( + type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_static_ranks_data_cin_filters_group_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = - make_shared(element::f32, PartialShape{1, Dimension::dynamic(), 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, Dimension::dynamic(), 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); 
ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -132,17 +156,21 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shap gcbd->get_output_partial_shape(0).same_scheme(PartialShape{1, Dimension::dynamic(), 3, 3})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shape_inference_5) +TEST( + type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_static_ranks_filters_group_cout_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 16, 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared( - element::f32, PartialShape{Dimension::dynamic(), 16, Dimension::dynamic(), 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{Dimension::dynamic(), 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), 16, Dimension::dynamic(), 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -150,18 +178,20 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_static_ranks_shap PartialShape{Dimension::dynamic(), Dimension::dynamic(), 3, 3})); } -TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_1) +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_data_nc_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - auto data = make_shared( - element::f32, PartialShape{Dimension::dynamic(), Dimension::dynamic(), 224, 224}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - auto filters = make_shared(element::f32, PartialShape{4, 5, 2, 3, 3}); - auto strides = Strides{2, 2}; - auto dilations = Strides{1, 1}; - auto padding_begin = CoordinateDiff{1, 1}; - auto padding_end = CoordinateDiff{1, 1}; - + const PartialShape data_pshape{ + Dimension::dynamic(), Dimension::dynamic(), 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{4, 5, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, strides, padding_begin, padding_end, dilations); @@ -172,18 +202,20 @@ TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_1) PartialShape{Dimension::dynamic(), 8, 447, 447})); } -TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_2) +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_filters_group_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - auto data = make_shared(element::f32, PartialShape{1, 20, 224, 224}); - // filters shape: 
[GROUPS, C_IN, C_OUT, kH, kW] - auto filters = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 5, 2, 3, 3}); - auto strides = Strides{2, 2}; - auto dilations = Strides{1, 1}; - auto padding_begin = CoordinateDiff{1, 1}; - auto padding_end = CoordinateDiff{1, 1}; - + const PartialShape data_pshape{1, 20, 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), 5, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, strides, padding_begin, padding_end, dilations); @@ -193,19 +225,20 @@ TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_2) ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme(PartialShape{1, 8, 447, 447})); } -TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_3) +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_filters_group_cin_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - auto data = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 20, 224, 224}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - auto filters = make_shared( - element::f32, PartialShape{Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}); - auto strides = Strides{2, 2}; - auto dilations = Strides{1, 1}; - auto padding_begin = CoordinateDiff{1, 1}; - auto padding_end = CoordinateDiff{1, 1}; - + const PartialShape data_pshape{Dimension::dynamic(), 20, 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, strides, padding_begin, padding_end, dilations); @@ -216,19 +249,44 @@ TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_3) PartialShape{Dimension::dynamic(), Dimension::dynamic(), 447, 447})); } -TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_4) +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_data_cin_filters_group_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - auto data = - make_shared(element::f32, PartialShape{1, Dimension::dynamic(), 224, 224}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - auto filters = - make_shared(element::f32, PartialShape{Dimension::dynamic(), 5, 2, 3, 3}); - auto strides = Strides{2, 2}; - auto dilations = Strides{1, 1}; - auto padding_begin = CoordinateDiff{1, 1}; - auto padding_end = CoordinateDiff{1, 1}; + const PartialShape data_pshape{1, Dimension::dynamic(), 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), 5, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, 
filters_pshape); + auto gcbd = make_shared( + data, filters, strides, padding_begin, padding_end, dilations); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme( + PartialShape{1, Dimension::dynamic(), 447, 447})); +} + +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_filters_group_cout_dyn) +{ + const PartialShape data_pshape{1, 20, 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, strides, padding_begin, padding_end, dilations); @@ -239,18 +297,44 @@ TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_4) PartialShape{1, Dimension::dynamic(), 447, 447})); } -TEST(type_prop, group_conv_backprop_data_dyn_static_ranks_shape_inference_5) +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_data_spatial_dim_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - auto data = make_shared(element::f32, PartialShape{1, 20, 224, 224}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - auto filters = make_shared( - element::f32, PartialShape{Dimension::dynamic(), Dimension::dynamic(), 2, 3, 3}); - auto strides = Strides{2, 2}; - auto dilations = Strides{1, 1}; - auto padding_begin = CoordinateDiff{1, 1}; - auto padding_end = CoordinateDiff{1, 1}; + const PartialShape data_pshape{1, 20, 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{ + 4, 5, 2, Dimension::dynamic(), 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, padding_begin, padding_end, dilations); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme( + PartialShape{1, 8, Dimension::dynamic(), 447})); +} + +TEST(type_prop, group_convolution_backprop_data_shape_infer_static_ranks_filters_spatial_dim_dyn) +{ + const PartialShape data_pshape{ + Dimension::dynamic(), 20, 224, Dimension::dynamic()}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{4, 5, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const CoordinateDiff padding_begin{1, 1}; + const CoordinateDiff padding_end{1, 1}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, strides, padding_begin, padding_end, dilations); @@ -258,17 +342,21 @@ TEST(type_prop, 
group_conv_backprop_data_dyn_static_ranks_shape_inference_5) ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme( - PartialShape{1, Dimension::dynamic(), 447, 447})); + PartialShape{Dimension::dynamic(), 8, 447, Dimension::dynamic()})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_data_batch) +TEST(type_prop, group_convolution_backprop_data_shape_infer_with_output_shape_data_dyn) { - const auto data = make_shared(element::f32, PartialShape::dynamic()); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{PartialShape::dynamic()}; + const PartialShape filters_pshape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -276,11 +364,14 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_data_batch) gcbd->get_output_partial_shape(0).same_scheme(PartialShape{Dimension::dynamic(), 2, 3, 3})); } -TEST(type_prop, group_conv_backprop_data_shape_dyn_data_batch) +TEST(type_prop, group_convolution_backprop_data_shape_infer_data_dyn) { - auto data = make_shared(element::f32, PartialShape::dynamic()); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - auto filters = make_shared(element::f32, PartialShape{4, 5, 2, 3, 3}); + const PartialShape data_pshape{PartialShape::dynamic()}; + const PartialShape filters_pshape{4, 5, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); @@ -291,15 +382,19 @@ TEST(type_prop, group_conv_backprop_data_shape_dyn_data_batch) PartialShape{Dimension::dynamic(), 8, Dimension::dynamic(), Dimension::dynamic()})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_filters) +TEST(type_prop, group_convolution_backprop_data_shape_infer_with_output_shape_filters_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared( - element::f32, PartialShape{1, 16, Dimension::dynamic(), Dimension::dynamic()}); - const auto filters = make_shared(element::f32, PartialShape::dynamic()); - const auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{ + 1, 16, Dimension::dynamic(), Dimension::dynamic()}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, 
filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{4})); ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); @@ -307,11 +402,14 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_filters) gcbd->get_output_partial_shape(0).same_scheme(PartialShape{1, Dimension::dynamic(), 3, 3})); } -TEST(type_prop, group_conv_backprop_data_shape_dyn_filters) +TEST(type_prop, group_convolution_backprop_data_shape_infer_filters_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - auto data = make_shared(element::f32, PartialShape{1, 8, 224, 224}); - auto filters = make_shared(element::f32, PartialShape::dynamic()); + const PartialShape data_pshape{1, 8, 224, 224}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); @@ -322,14 +420,19 @@ TEST(type_prop, group_conv_backprop_data_shape_dyn_filters) PartialShape{1, Dimension::dynamic(), Dimension::dynamic(), Dimension::dynamic()})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_data_and_filters_1) +TEST(type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_as_const_data_and_filters_dyn) { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, PartialShape::dynamic()); - const auto filters = make_shared(element::f32, PartialShape::dynamic()); - const auto output_shape = op::Constant::create(element::i64, Shape{3}, {3, 3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{PartialShape::dynamic()}; + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{3}, {3, 3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_static()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().same_scheme(Rank{5})); @@ -337,22 +440,32 @@ TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_data_and_filters_ PartialShape{Dimension::dynamic(), Dimension::dynamic(), 3, 3, 3})); } -TEST(type_prop, group_conv_backprop_data_with_output_shape_dyn_data_and_filters_2) +TEST(type_prop, + group_convolution_backprop_data_shape_infer_with_output_shape_as_param_data_and_filters_dyn) { - const auto data = make_shared(element::f32, PartialShape::dynamic()); - const auto filters = make_shared(element::f32, PartialShape::dynamic()); - const auto output_shape = make_shared(element::i64, Shape{3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{PartialShape::dynamic()}; + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); + auto output_shape = make_shared(element::i64, Shape{3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + 
ASSERT_TRUE(gcbd->get_output_partial_shape(0).is_dynamic()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).rank().is_dynamic()); ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme(PartialShape::dynamic())); } -TEST(type_prop, group_conv_backprop_data_dyn_data_and_filters) +TEST(type_prop, group_convolution_backprop_data_shape_infer_data_and_filters_dyn) { - auto data = make_shared(element::f32, PartialShape::dynamic()); - auto filters = make_shared(element::f32, PartialShape::dynamic()); + const PartialShape data_pshape{PartialShape::dynamic()}; + const PartialShape filters_pshape{PartialShape::dynamic()}; + const element::Type_t et = element::f32; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, filters_pshape); auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); @@ -361,15 +474,19 @@ TEST(type_prop, group_conv_backprop_data_dyn_data_and_filters) ASSERT_TRUE(gcbd->get_output_partial_shape(0).same_scheme(PartialShape::dynamic())); } -TEST(type_prop, group_conv_backprop_data_invalid_element_types) +TEST(type_prop, group_convolution_backprop_data_invalid_et_inputs) { try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{2, 8, 2, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f16, Shape{1, 16, 6, 6}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 6, 6}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{2, 8, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + const element::Type_t data_et = element::f16; + const element::Type_t filters_et = element::f32; + + auto data = make_shared(data_et, data_pshape); + auto filters = make_shared(filters_et, filters_pshape); + auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); // data and filters should be of same element type FAIL() << "Incompatible element types not detected"; @@ -386,12 +503,40 @@ TEST(type_prop, group_conv_backprop_data_invalid_element_types) try { - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::f16, Shape{2}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 6, 6}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{2, 8, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + const element::Type boolean_et = element::boolean; + + auto data = make_shared(boolean_et, data_pshape); + auto filters = make_shared(boolean_et, filters_pshape); + auto gcbd = make_shared( + data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); + // data and filters must be of numeric element type + FAIL() << "Boolean element type of inputs not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + std::string("Element type of inputs must be numeric")); + } + catch (...) 
+ { + FAIL() << "Numeric element types of data batch and filters validation check failed for " + "unexpected reason."; + } + + try + { + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + const element::Type_t inputs_et = element::f32; + + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_pshape); + auto output_shape = op::Constant::create(inputs_et, Shape{2}, {3, 3}); + auto gcbd = make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); // output shape input element type must be of integer type FAIL() << "Incompatible element types not detected"; @@ -407,18 +552,22 @@ TEST(type_prop, group_conv_backprop_data_invalid_element_types) } } -TEST(type_prop, group_conv_backprop_data_invalid_input_ranks) +TEST(type_prop, group_convolution_backprop_data_invalid_input_ranks) { // data partial shape provided is rank 4 (Conv2D) // filter partial shape provided is rank 6 (Conv3D) try { - const auto filters = make_shared( - element::f32, PartialShape{2, 8, 2, 3, 3, Dimension::dynamic()}); - const auto data = make_shared(element::f32, PartialShape{1, 16, 6, 6}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 6, 6}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{2, 8, 2, 3, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW, kD] + + const element::Type_t inputs_et = element::f32; + + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_pshape); + auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); - // data and weight have incompatible ranks + // data and filters have incompatible ranks FAIL() << "Incompatible input ranks not detected"; } catch (const NodeValidationFailure& error) @@ -435,10 +584,14 @@ TEST(type_prop, group_conv_backprop_data_invalid_input_ranks) // filter partial shape provided is rank 5 (Conv2D) try { - const auto filters = make_shared(element::f32, PartialShape{2, 8, 2, 3, 3}); - const auto data = make_shared( - element::f32, PartialShape{1, Dimension::dynamic(), 16, 6, 6}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 6, 6, 6}; // [N, C_IN * GROUPS, H, W, D] + const PartialShape filters_pshape{2, 8, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + const element::Type_t inputs_et = element::f32; + + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_pshape); + auto gcbd = make_shared( data, filters, Strides{}, CoordinateDiff{}, CoordinateDiff{}, Strides{}); // data and weight have incompatible ranks FAIL() << "Incompatible input ranks not detected"; @@ -455,10 +608,15 @@ TEST(type_prop, group_conv_backprop_data_invalid_input_ranks) try { - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - const auto output_shape = op::Constant::create(element::i64, Shape{2, 1}, {3, 3}); - const auto gcbd = make_shared( + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + const element::Type_t inputs_et = element::f32; + + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_pshape); + auto output_shape = op::Constant::create(element::i64, Shape{2, 1}, {3, 3}); + auto gcbd = 
make_shared( data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); // Output shape optional input must be of rank 1 FAIL() << "Incompatible output shape input rank not detected."; @@ -474,20 +632,22 @@ TEST(type_prop, group_conv_backprop_data_invalid_input_ranks) } } -TEST(type_prop, group_conv_backprop_data_invalid_params) +TEST(type_prop, group_convolution_backprop_data_invalid_input_channel_dims) { + const Strides strides{1, 1}; + const Strides dilations{1, 1}; + const CoordinateDiff padding{2, 2}; + const element::Type_t inputs_et = element::f32; + try { - // filter shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{21, 16, 20, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto gcbd = make_shared(data, - filters, - Strides{1, 1}, - CoordinateDiff{2, 2}, - CoordinateDiff{2, 2}, - Strides{1, 1}); + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{21, 16, 20, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, padding, padding, dilations); // data batch shape does not have correct dimension C_IN * GROUPS FAIL() << "Incompatibile input shapes not detected."; } @@ -503,16 +663,13 @@ TEST(type_prop, group_conv_backprop_data_invalid_params) try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{4, 16, 20, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto gcbd = make_shared(data, - filters, - Strides{1, 1}, - CoordinateDiff{2, 2}, - CoordinateDiff{2, 2}, - Strides{1, 1}); + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{4, 16, 20, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, padding, padding, dilations); // filter shape specifies GROUPS = 4 and C_IN = 16, while data batch shape specifies // dimension C_IN * GROUPS = 16 FAIL() << "Incompatibile input shapes not detected."; @@ -527,123 +684,227 @@ TEST(type_prop, group_conv_backprop_data_invalid_params) { FAIL() << "Input shapes validation check failed for unexpected reason."; } +} + +TEST(type_prop, group_convolution_backprop_data_invalid_output_shape_spatial_dims) +{ + const PartialShape data_pshape{1, 16, 5, 5}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_shape{1, 16, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t inputs_et = element::f32; try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{2, 8, 2, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 6, 6}); - const auto gcbd = make_shared( - data, filters, Strides{}, CoordinateDiff{1}, CoordinateDiff{1, 1}, Strides{}); - // pads_begin and pads_end do not match spatial dimensions - FAIL() << "Incompatible pads number of spatial dimensions not detected."; + auto data = make_shared(inputs_et, data_pshape); + auto filters = make_shared(inputs_et, filters_shape); + auto output_shape = op::Constant::create(element::i64, Shape{3}, {3, 3, 
3}); + auto gcbd = make_shared( + data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); + // output_shape has invalid spatial dimensions (should be 2) + FAIL() << "Incompatible output shape optional input not detected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - "Pads should be defined for all and only spatial features."); + EXPECT_HAS_SUBSTRING( + error.what(), + std::string("Output shape should be specified only and for all spatial dimensions.")); } catch (...) { - FAIL() << "Pads validation check failed for unexpected reason."; + FAIL() << "Output shape validation check failed for unexpected reason."; } +} +TEST(type_prop, group_convolution_backprop_data_invalid_conv_param_spatial_dims) +{ + const PartialShape data_pshape{1, 16, 6, 6}; // [N, C_IN * GROUPS, H, W] + const PartialShape filters_pshape{2, 8, 2, 3, 3}; // [GROUPS, C_IN, C_OUT, kH, kW] + const element::Type_t et = element::f32; + + // invalid strides spatial dimensions try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{4, 4, 20, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto gcbd = make_shared( - data, filters, Strides{1}, CoordinateDiff{2, 2}, CoordinateDiff{2, 2}, Strides{1, 1}); - // Strides have incompatible number of spatial dimensions - FAIL() << "Incompatible stride number of spatial dimensions not detected."; + const Strides strides{1, 1, 1}; + const Strides dilations{1, 1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0, 0}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid strides spatial dimensions not detected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING( - error.what(), - std::string("Strides should be defined for all and only spatial features.")); + EXPECT_HAS_SUBSTRING(error.what(), + "Strides should be defined for all and only spatial features."); } catch (...) { - FAIL() << "Strides validation check failed for unexpected reason."; + FAIL() << "Strides spatial dimensions validation check failed for unexpected reason"; } + try + { + const Strides strides{1}; + const Strides dilations{1, 1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0, 0}; + auto data = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid strides spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Strides should be defined for all and only spatial features."); + } + catch (...)
+ { + FAIL() << "Strides spatial dimensions validation check failed for unexpected reason"; + } + + // invalid dilations spatial dimensions try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{4, 4, 20, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto gcbd = make_shared(data, - filters, - Strides{1, 1}, - CoordinateDiff{2, 2}, - CoordinateDiff{2, 2}, - Strides{1, 1, 1}); - // Dilations have incompatible number of spatial dimensions - FAIL() << "Incompatible dilations number of spatial dimensions not detected."; + const Strides strides{1, 1}; + const Strides dilations{1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0, 0}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid dilations spatial dimensions not detected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING( - error.what(), - std::string("Dilations should be defined for all and only spatial features.")); + EXPECT_HAS_SUBSTRING(error.what(), + "Dilations should be defined for all and only spatial features."); + } + catch (...) + { + FAIL() << "Dilations spatial dimensions validation check failed for unexpected reason"; + } + try + { + const Strides strides{1, 1}; + const Strides dilations{1, 1, 1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0, 0}; + + auto data = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid dilations spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Dilations should be defined for all and only spatial features."); } catch (...) { - FAIL() << "Dilations validation check failed for unexpected reason."; + FAIL() << "Dilations spatial dimensions validation check failed for unexpected reason"; } + // invalid padding spatial dimensions try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{4, 4, 20, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto gcbd = make_shared(data, - filters, - Strides{1, 1}, - CoordinateDiff{2, 2}, - CoordinateDiff{2, 2}, - Strides{1, 1}, - op::PadType::EXPLICIT, - CoordinateDiff{0}); - // Output padding have incompatible number of spatial dimensions - FAIL() << "Incompatible output padding number of spatial dimensions not detected."; + const Strides strides{1, 1}; + const Strides dilations{1, 1}; + const CoordinateDiff pads_begin{0, 0, 0}; + const CoordinateDiff pads_end{0, 0}; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid padding spatial dimensions not detected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING( - error.what(), - std::string("Output padding should be defined for all and only spatial features.")); + EXPECT_HAS_SUBSTRING(error.what(), + "Pads should be defined for all and only spatial features."); + } + catch (...) 
+ { + FAIL() << "Padding spatial dimensions validation check failed for unexpected reason"; + } + try + { + const Strides strides{1, 1}; + const Strides dilations{1, 1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0}; + + auto data = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations); + FAIL() << "Invalid padding spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Pads should be defined for all and only spatial features."); } catch (...) { - FAIL() << "Output padding validation check failed for unexpected reason."; + FAIL() << "Padding spatial dimensions validation check failed for unexpected reason"; } + // invalid output padding spatial dimensions try { - // filters shape: [GROUPS, C_IN, C_OUT, kH, kW] - const auto filters = make_shared(element::f32, Shape{1, 16, 2, 3, 3}); - // data batch shape: [N, C_IN * GROUPS, H, W] - const auto data = make_shared(element::f32, Shape{1, 16, 5, 5}); - const auto output_shape = op::Constant::create(element::i64, Shape{3}, {3, 3, 3}); - const auto gcbd = make_shared( - data, filters, output_shape, Strides{}, Strides{}, op::PadType::SAME_UPPER); - FAIL() << "Incompatible output shape optional input not detected"; + const Strides strides{1, 1}; + const Strides dilations{1, 1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0, 0}; + const CoordinateDiff output_padding{0, 0, 0}; + const op::PadType auto_pad = op::PadType::EXPLICIT; + + auto data = make_shared(et, data_pshape); + auto filters = make_shared(et, PartialShape::dynamic()); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations, auto_pad, output_padding); + FAIL() << "Invalid output padding spatial dimensions not detected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING( - error.what(), - std::string("Output shape should be specified only and for all spatial dimensions.")); + EXPECT_HAS_SUBSTRING(error.what(), + "Output padding should be defined for all and only spatial features."); } catch (...) { - FAIL() << "Output shape validation check failed for unexpected reason."; + FAIL() << "Output padding spatial dimensions validation check failed for unexpected reason"; + } + try + { + const Strides strides{1, 1}; + const Strides dilations{1, 1}; + const CoordinateDiff pads_begin{0, 0}; + const CoordinateDiff pads_end{0, 0}; + const CoordinateDiff output_padding{0}; + const op::PadType auto_pad = op::PadType::EXPLICIT; + + auto data = make_shared(et, PartialShape::dynamic()); + auto filters = make_shared(et, filters_pshape); + auto gcbd = make_shared( + data, filters, strides, pads_begin, pads_end, dilations, auto_pad, output_padding); + FAIL() << "Invalid output padding spatial dimensions not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Output padding should be defined for all and only spatial features."); + } + catch (...) 
+ { + FAIL() << "Output padding spatial dimensions validation check failed for unexpected reason"; } } diff --git a/ngraph/test/type_prop/mod.cpp b/ngraph/test/type_prop/mod.cpp new file mode 100644 index 00000000000000..bfde883012bc6f --- /dev/null +++ b/ngraph/test/type_prop/mod.cpp @@ -0,0 +1,9 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "arithmetic_ops.hpp" + +using Type = ::testing::Types; + +INSTANTIATE_TYPED_TEST_CASE_P(type_prop_mod, ArithmeticOperator, Type); diff --git a/ngraph/test/util/CMakeLists.txt b/ngraph/test/util/CMakeLists.txt index 1382f2d18334f4..91f790a1185ff1 100644 --- a/ngraph/test/util/CMakeLists.txt +++ b/ngraph/test/util/CMakeLists.txt @@ -9,6 +9,7 @@ set (SRC engine/shared_utils.cpp float_util.cpp test_tools.cpp + test_case.cpp test_control.cpp visitor.hpp provenance_enabler.hpp diff --git a/ngraph/test/util/test_case.cpp b/ngraph/test/util/test_case.cpp new file mode 100644 index 00000000000000..764cb4d6b99e98 --- /dev/null +++ b/ngraph/test/util/test_case.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "test_case.hpp" + +namespace ngraph +{ + namespace test + { + std::shared_ptr function_from_ir(const std::string& xml_path, + const std::string& bin_path) + { + InferenceEngine::Core c; + auto network = c.ReadNetwork(xml_path, bin_path); + return network.getFunction(); + } + } +} diff --git a/ngraph/test/util/test_case.hpp b/ngraph/test/util/test_case.hpp index bd30536918fd2c..2bb5e524a3e2a8 100644 --- a/ngraph/test/util/test_case.hpp +++ b/ngraph/test/util/test_case.hpp @@ -17,6 +17,9 @@ namespace ngraph { namespace test { + std::shared_ptr function_from_ir(const std::string& xml_path, + const std::string& bin_path = {}); + template class TestCase { diff --git a/ngraph/test/visitors/op/mod.cpp b/ngraph/test/visitors/op/mod.cpp index 7f6e6ab5688604..dce8ef15a0740b 100644 --- a/ngraph/test/visitors/op/mod.cpp +++ b/ngraph/test/visitors/op/mod.cpp @@ -30,5 +30,5 @@ TEST(attributes, mod_op) NodeBuilder builder(mod); auto g_mod = as_type_ptr(builder.create()); - EXPECT_EQ(g_mod->get_auto_broadcast(), mod->get_auto_broadcast()); + EXPECT_EQ(g_mod->get_autob(), mod->get_autob()); } diff --git a/scripts/demo/demo_benchmark_app.sh b/scripts/demo/demo_benchmark_app.sh index 5420a0d7bc140d..8c84e8a724ef4c 100755 --- a/scripts/demo/demo_benchmark_app.sh +++ b/scripts/demo/demo_benchmark_app.sh @@ -3,7 +3,7 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" . "$ROOT_DIR/utils.sh" diff --git a/scripts/demo/demo_security_barrier_camera.sh b/scripts/demo/demo_security_barrier_camera.sh index 5116cd4b7f1f04..4913aaf1d314ca 100755 --- a/scripts/demo/demo_security_barrier_camera.sh +++ b/scripts/demo/demo_security_barrier_camera.sh @@ -3,7 +3,7 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" . 
"$ROOT_DIR/utils.sh" diff --git a/scripts/demo/demo_squeezenet_download_convert_run.sh b/scripts/demo/demo_squeezenet_download_convert_run.sh index 0aa4ad64ef2b2c..00daead548e10c 100755 --- a/scripts/demo/demo_squeezenet_download_convert_run.sh +++ b/scripts/demo/demo_squeezenet_download_convert_run.sh @@ -3,7 +3,7 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" . "$ROOT_DIR/utils.sh" diff --git a/scripts/install_dependencies/install_NEO_OCL_driver.sh b/scripts/install_dependencies/install_NEO_OCL_driver.sh index 81deadd3d8a039..b9dfa1d71edbe2 100755 --- a/scripts/install_dependencies/install_NEO_OCL_driver.sh +++ b/scripts/install_dependencies/install_NEO_OCL_driver.sh @@ -19,7 +19,7 @@ CENTOS_MINOR= RHEL_VERSION= UBUNTU_VERSION= DISTRO= -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" >/dev/null 2>&1 && pwd )" INSTALL_DRIVER_VERSION='19.41.14441' AVAILABLE_DRIVERS=("19.41.14441" "20.35.17767") diff --git a/scripts/install_dependencies/install_openvino_dependencies.sh b/scripts/install_dependencies/install_openvino_dependencies.sh index 4b4a729c92f5da..802ecbd0fa961e 100755 --- a/scripts/install_dependencies/install_openvino_dependencies.sh +++ b/scripts/install_dependencies/install_openvino_dependencies.sh @@ -59,7 +59,7 @@ if [ -n "$selftest" ] ; then echo "||" echo "|| Test $image / '$opt'" echo "||" - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" >/dev/null 2>&1 && pwd )" docker run -it --rm \ --volume ${SCRIPT_DIR}:/scripts:ro,Z \ --volume yum-cache:/var/cache/yum \ diff --git a/scripts/setupvars/setupvars.sh b/scripts/setupvars/setupvars.sh index 958a3a00781e52..62f9b5f8ec8337 100755 --- a/scripts/setupvars/setupvars.sh +++ b/scripts/setupvars/setupvars.sh @@ -3,7 +3,7 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" >/dev/null 2>&1 && pwd )" BASE_DIR="$( dirname "$SCRIPT_DIR" )" INSTALLDIR="${BASE_DIR}" diff --git a/tests/conditional_compilation/conftest.py b/tests/conditional_compilation/conftest.py index 04618ea9946c9d..764cb497e8fd9f 100644 --- a/tests/conditional_compilation/conftest.py +++ b/tests/conditional_compilation/conftest.py @@ -14,23 +14,22 @@ """ import sys +import pytest +import yaml + from inspect import getsourcefile from pathlib import Path -import pytest -import yaml +from tests_utils import write_session_info, SESSION_INFO_FILE # add ../lib to imports -sys.path.insert( - 0, str((Path(getsourcefile(lambda: 0)) / ".." / ".." / "lib").resolve(strict=True)) -) +sys.path.insert(0, str((Path(getsourcefile(lambda: 0)) / ".." / ".." 
/ "lib").resolve(strict=True))) from path_utils import expand_env_vars # pylint: disable=import-error def pytest_addoption(parser): - """ Define extra options for pytest options - """ + """Define extra options for pytest options.""" parser.addoption( "--test_conf", type=Path, @@ -67,8 +66,7 @@ def pytest_addoption(parser): def pytest_generate_tests(metafunc): - """ Generate tests depending on command line options - """ + """Generate tests depending on command line options.""" params = [] ids = [] @@ -87,6 +85,25 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("test_id, model", params, ids=ids) +@pytest.fixture(scope="function") +def test_info(request, pytestconfig): + """Fixture function for getting the additional attributes of the current test.""" + setattr(request.node._request, "test_info", {}) + if not hasattr(pytestconfig, "session_info"): + setattr(pytestconfig, "session_info", []) + + yield request.node._request.test_info + + pytestconfig.session_info.append(request.node._request.test_info) + + +@pytest.fixture(scope="session", autouse=True) +def save_session_info(pytestconfig, artifacts): + """Fixture function for saving additional attributes to configuration file.""" + yield + write_session_info(path=artifacts / SESSION_INFO_FILE, data=pytestconfig.session_info) + + @pytest.fixture(scope="session") def sea_runtool(request): """Fixture function for command-line option.""" diff --git a/tests/conditional_compilation/test_collect.py b/tests/conditional_compilation/test_collect.py index 055b1cd7b28cac..6014f16e7950bd 100644 --- a/tests/conditional_compilation/test_collect.py +++ b/tests/conditional_compilation/test_collect.py @@ -13,16 +13,19 @@ from proc_utils import cmd_exec # pylint: disable=import-error -def test_cc_collect(test_id, model, sea_runtool, benchmark_app, collector_dir, artifacts): +def test_cc_collect(test_id, model, sea_runtool, benchmark_app, collector_dir, artifacts, test_info): """ Test conditional compilation statistics collection + :param test_info: custom `test_info` field of built-in `request` pytest fixture. + Contains a dictionary to store test metadata. """ out = artifacts / test_id + test_info["test_id"] = test_id # cleanup old data if any - prev_results = glob.glob(f"{out}.pid*.csv") - for path in prev_results: + prev_result = glob.glob(f"{out}.pid*.csv") + for path in prev_result: os.remove(path) # run use case - returncode, output = cmd_exec( + return_code, output = cmd_exec( [ sys.executable, str(sea_runtool), @@ -37,7 +40,8 @@ def test_cc_collect(test_id, model, sea_runtool, benchmark_app, collector_dir, a "-nireq=1", ] ) - assert returncode == 0, f"Command exited with non-zero status {returncode}:\n {output}" - assert ( - len(glob.glob(f"{out}.pid*.csv")) == 1 - ), f'Multiple or none "{out}.pid*.csv" files' + out_csv = glob.glob(f"{out}.pid*.csv") + test_info["out_csv"] = out_csv + + assert return_code == 0, f"Command exited with non-zero status {return_code}:\n {output}" + assert (len(out_csv) == 1), f'Multiple or none "{out}.pid*.csv" files' diff --git a/tests/conditional_compilation/tests_utils.py b/tests/conditional_compilation/tests_utils.py new file mode 100644 index 00000000000000..2b186726843017 --- /dev/null +++ b/tests/conditional_compilation/tests_utils.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# Copyright (C) 2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +""" Utility functions for working with the JSON test configuration file.
+""" +import json + +from inspect import getsourcefile +from pathlib import Path + + +SESSION_INFO_FILE = "cc_tests.json" + + +def read_session_info(path: Path = Path(getsourcefile(lambda: 0)).parent / SESSION_INFO_FILE): + with open(path, 'r') as json_file: + cc_tests_ids = json.load(json_file) + return cc_tests_ids + + +def write_session_info(path: Path = Path(getsourcefile(lambda: 0)).parent / SESSION_INFO_FILE, + data: dict = None): + with open(path, "w") as json_file: + json.dump(data, json_file, indent=4) diff --git a/tests/stress_tests/.automation/memcheck_tests/precommit_configs/desktop_references_config.xml b/tests/stress_tests/.automation/memcheck_tests/precommit_configs/desktop_references_config.xml index ef04b7ef8a4267..5bc91171c9d68d 100644 --- a/tests/stress_tests/.automation/memcheck_tests/precommit_configs/desktop_references_config.xml +++ b/tests/stress_tests/.automation/memcheck_tests/precommit_configs/desktop_references_config.xml @@ -3,37 +3,37 @@ - + - + - + - + - + - + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 # values from {"commit_id": 
"af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 - # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 + # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3 diff --git a/tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml index 8025c9e04ec9b0..060fffc897c0e7 100644 --- a/tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml +++ b/tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml @@ -13,6 +13,7 @@ GPU + diff --git a/tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml index 1d688686773beb..95bced89a1f2cd 100644 --- a/tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml +++ b/tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml @@ -13,6 +13,7 @@ GPU + diff --git a/tests/stress_tests/memleaks_tests/tests.cpp b/tests/stress_tests/memleaks_tests/tests.cpp index c1eea833c81bd9..b00f24f8552aa2 100644 --- a/tests/stress_tests/memleaks_tests/tests.cpp +++ b/tests/stress_tests/memleaks_tests/tests.cpp @@ -110,6 +110,14 @@ TEST_P(MemLeaksTestSuite, reinfer_request_inference) { }; test_runner(test_params.numthreads, test); } + +TEST_P(MemLeaksTestSuite, infer_request_inference) { + auto test_params = GetParam(); + auto test = [&] { + return test_infer_request_inference(test_params.model, test_params.device, test_params.numiters); + }; + test_runner(test_params.numthreads, test); +} // tests_pipelines/tests_pipelines.cpp INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuiteNoModel, diff --git a/tests/time_tests/scripts/run_timetest.py b/tests/time_tests/scripts/run_timetest.py index 73ec6c1ce0afd0..03ecad9b5b43e2 100644 --- a/tests/time_tests/scripts/run_timetest.py +++ b/tests/time_tests/scripts/run_timetest.py @@ -11,16 +11,19 @@ # pylint: disable=redefined-outer-name import statistics -from pathlib import Path import tempfile import subprocess import logging import argparse import sys import os -from pprint import pprint import yaml +from pathlib import Path +from pprint import pprint + +from test_runner.utils import filter_timetest_result + def run_cmd(args: list, log=None, verbose=True): """ Run command @@ -95,11 +98,14 @@ def run_timetest(args: dict, log=None): stats = dict((step_name, stats.get(step_name, []) + [duration]) for step_name, duration in raw_data.items()) + # Remove outliers + filtered_stats = filter_timetest_result(stats) + # Aggregate results - aggregated_stats = aggregate_stats(stats) + aggregated_stats = aggregate_stats(filtered_stats) log.debug("Aggregated statistics after full run: {}".format(aggregated_stats)) - return 0, aggregated_stats + return 0, aggregated_stats, stats def check_positive_int(val): @@ -129,7 +135,7 @@ def cli_parser(): type=str, help='target device to infer on') parser.add_argument('-niter', - default=3, + default=10, type=check_positive_int, help='number of times to execute binary to aggregate statistics of') parser.add_argument('-s', @@ -148,7 +154,7 @@ def cli_parser(): logging.basicConfig(format="[ %(levelname)s ] %(message)s", level=logging.DEBUG, stream=sys.stdout) - exit_code, aggr_stats = 
run_timetest(dict(args._get_kwargs()), log=logging) # pylint: disable=protected-access + exit_code, aggr_stats, _ = run_timetest(dict(args._get_kwargs()), log=logging) # pylint: disable=protected-access if args.stats_path: # Save aggregated results to a file diff --git a/tests/time_tests/src/timetests/timetest_infer_cache.cpp b/tests/time_tests/src/timetests/timetest_infer_cache.cpp new file mode 100644 index 00000000000000..95b733758b7e61 --- /dev/null +++ b/tests/time_tests/src/timetests/timetest_infer_cache.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "common.h" +#include "timetests_helper/timer.h" +#include "timetests_helper/utils.h" +using namespace InferenceEngine; + + +/** + * @brief Function that contain executable pipeline which will be called from + * main(). The function should not throw any exceptions and responsible for + * handling it by itself. + */ +int runPipeline(const std::string &model, const std::string &device) { + auto pipeline = [](const std::string &model, const std::string &device) { + Core ie; + CNNNetwork cnnNetwork; + ExecutableNetwork exeNetwork; + InferRequest inferRequest; + + { + SCOPED_TIMER(first_inference_latency); + { + SCOPED_TIMER(load_plugin); + ie.GetVersions(device); + } + { + SCOPED_TIMER(load_network); + ie.SetConfig({{"CACHE_DIR", "models_cache"}}); + exeNetwork = ie.LoadNetwork(model, device); + } + { + SCOPED_TIMER(first_inference); + inferRequest = exeNetwork.CreateInferRequest(); + { + SCOPED_TIMER(fill_inputs) + const InferenceEngine::ConstInputsDataMap inputsInfo(exeNetwork.GetInputsInfo()); + fillBlobs(inferRequest, inputsInfo, 1); + } + inferRequest.Infer(); + } + } + }; + + try { + pipeline(model, device); + } catch (const InferenceEngine::Exception &iex) { + std::cerr + << "Inference Engine pipeline failed with Inference Engine exception:\n" + << iex.what(); + return 1; + } catch (const std::exception &ex) { + std::cerr << "Inference Engine pipeline failed with exception:\n" + << ex.what(); + return 2; + } catch (...) 
{ + std::cerr << "Inference Engine pipeline failed\n"; + return 3; + } + return 0; +} diff --git a/tests/time_tests/test_runner/conftest.py b/tests/time_tests/test_runner/conftest.py index 05346e6c5a2a8a..16cd358d1c7c70 100644 --- a/tests/time_tests/test_runner/conftest.py +++ b/tests/time_tests/test_runner/conftest.py @@ -23,10 +23,10 @@ import shutil import sys import tempfile -from pathlib import Path - import pytest import yaml + +from pathlib import Path from jsonschema import validate, ValidationError from scripts.run_timetest import check_positive_int @@ -161,6 +161,7 @@ def test_info(request, pytestconfig): """ setattr(request.node._request, "test_info", {"orig_instance": request.node.funcargs["instance"], "results": {}, + "raw_results": {}, "db_info": {}}) if not hasattr(pytestconfig, "session_info"): setattr(pytestconfig, "session_info", []) @@ -391,6 +392,7 @@ def pytest_runtest_makereport(item, call): data = item._request.test_info["db_info"].copy() data["results"] = item._request.test_info["results"].copy() + data["raw_results"] = item._request.test_info["raw_results"].copy() data["status"] = "not_finished" data["error_msg"] = "" diff --git a/tests/time_tests/test_runner/requirements.txt b/tests/time_tests/test_runner/requirements.txt index bdb557e84976b0..7477a75942f9ff 100644 --- a/tests/time_tests/test_runner/requirements.txt +++ b/tests/time_tests/test_runner/requirements.txt @@ -2,4 +2,5 @@ pytest==4.0.1 attrs==19.1.0 # required for pytest==4.0.1 to resolve compatibility issues PyYAML==5.4.1 jsonschema==3.2.0 -distro==1.5.0 \ No newline at end of file +distro==1.5.0 +numpy==1.18.5 diff --git a/tests/time_tests/test_runner/test_timetest.py b/tests/time_tests/test_runner/test_timetest.py index d97313e78ef20f..42dab034c5b909 100644 --- a/tests/time_tests/test_runner/test_timetest.py +++ b/tests/time_tests/test_runner/test_timetest.py @@ -63,11 +63,12 @@ def test_timetest(instance, executable, niter, cl_cache_dir, test_info, temp_dir run_timetest(_exe_args, log=logging) assert os.listdir(cl_cache_dir), "cl_cache isn't generated" - retcode, aggr_stats = run_timetest(exe_args, log=logging) + retcode, aggr_stats, raw_stats = run_timetest(exe_args, log=logging) assert retcode == 0, "Run of executable failed" # Add timetest results to submit to database and save in new test conf as references test_info["results"] = aggr_stats + test_info["raw_results"] = raw_stats # Compare with references comparison_status = 0 diff --git a/tests/time_tests/test_runner/utils.py b/tests/time_tests/test_runner/utils.py index 40c90edb28bdc3..0290462f3cc812 100644 --- a/tests/time_tests/test_runner/utils.py +++ b/tests/time_tests/test_runner/utils.py @@ -6,11 +6,12 @@ import os import platform import sys -from enum import Enum -from pathlib import Path - import distro import yaml +import numpy as np + +from enum import Enum +from pathlib import Path from pymongo import MongoClient # constants @@ -18,6 +19,10 @@ DB_COLLECTIONS = ["commit", "nightly", "weekly"] PRODUCT_NAME = 'dldt' # product name from build manifest +# Define a range to cut outliers which are < Q1 − IQR_CUTOFF * IQR, and > Q3 + IQR_CUTOFF * IQR +# https://en.wikipedia.org/wiki/Interquartile_range +IQR_CUTOFF = 1.5 + def expand_env_vars(obj): """Expand environment variables in provided object.""" @@ -34,16 +39,14 @@ def expand_env_vars(obj): def upload_timetest_data(data, db_url, db_collection): - """ Upload timetest data to database - """ + """ Upload timetest data to database.""" client = MongoClient(db_url) collection = 
client[DATABASE][db_collection] collection.replace_one({'_id': data['_id']}, data, upsert=True) def metadata_from_manifest(manifest: Path): - """ Extract commit metadata from manifest - """ + """ Extract commit metadata from manifest.""" with open(manifest, 'r') as manifest_file: manifest = yaml.safe_load(manifest_file) repo_trigger = next( @@ -58,11 +61,27 @@ def metadata_from_manifest(manifest: Path): } -class UnsupportedOsError(Exception): - """ - Exception for unsupported OS type - """ +def calculate_iqr(stats: list): + """IQR is calculated as the difference between the 3th and the 1th quantile of the data.""" + q1 = np.quantile(stats, 0.25) + q3 = np.quantile(stats, 0.75) + iqr = q3 - q1 + return iqr, q1, q3 + + +def filter_timetest_result(stats: dict): + """Identify and remove outliers from time_results.""" + filtered_stats = {} + for step_name, time_results in stats.items(): + iqr, q1, q3 = calculate_iqr(time_results) + cut_off = iqr * IQR_CUTOFF + upd_time_results = [x for x in time_results if (q1 - cut_off < x < q3 + cut_off)] + filtered_stats.update({step_name: upd_time_results}) + return filtered_stats + +class UnsupportedOsError(Exception): + """Exception for unsupported OS type.""" def __init__(self, *args, **kwargs): error_message = f'OS type "{get_os_type()}" is not currently supported' if args or kwargs: @@ -72,9 +91,7 @@ def __init__(self, *args, **kwargs): class OsType(Enum): - """ - Container for supported os types - """ + """Container for supported os types.""" WINDOWS = 'Windows' LINUX = 'Linux' DARWIN = 'Darwin' @@ -91,17 +108,17 @@ def get_os_type(): def os_type_is_windows(): - """Returns True if OS type is Windows. Otherwise returns False""" + """Returns True if OS type is Windows. Otherwise returns False.""" return get_os_type() == OsType.WINDOWS.value def os_type_is_linux(): - """Returns True if OS type is Linux. Otherwise returns False""" + """Returns True if OS type is Linux. Otherwise returns False.""" return get_os_type() == OsType.LINUX.value def os_type_is_darwin(): - """Returns True if OS type is Darwin. Otherwise returns False""" + """Returns True if OS type is Darwin. Otherwise returns False.""" return get_os_type() == OsType.DARWIN.value diff --git a/tools/benchmark/README.md b/tools/benchmark/README.md index 1e27271e8ef29f..215d16bb47a166 100644 --- a/tools/benchmark/README.md +++ b/tools/benchmark/README.md @@ -110,16 +110,17 @@ Options: "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size. -nstreams NUMBER_STREAMS, --number_streams NUMBER_STREAMS - Optional. Number of streams to use for inference on the CPU/GPU/MYRIAD - (for HETERO and MULTI device cases use format :,: or just ). - Default value is determined automatically for a device. - Please note that although the automatic selection usually provides a reasonable performance, - it still may be non-optimal for some cases, especially for very small networks. - Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency - estimations the number of streams should be set to 1. --enforcebf16 [ENFORCE_BFLOAT16], --enforce_bfloat16 [ENFORCE_BFLOAT16] - Optional. Enforcing of floating point operations - execution in bfloat16 precision where it is acceptable. + Optional. Number of streams to use for inference on the CPU/GPU/MYRIAD + (for HETERO and MULTI device cases use format :,: or just ). + Default value is determined automatically for a device. 
+ Please note that although the automatic selection usually provides a reasonable performance, + it still may be non-optimal for some cases, especially for very small networks. + Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency + estimations the number of streams should be set to 1. + -enforcebf16 [{true,false}], --enforce_bfloat16 [{true,false}] + Optional. By default floating point operations execution in bfloat16 precision are enforced if supported by platform. + 'true' - enable bfloat16 regardless of platform support + 'false' - disable bfloat16 regardless of platform support. -nthreads NUMBER_THREADS, --number_threads NUMBER_THREADS Number of threads to use for inference on the CPU (including HETERO and MULTI cases). diff --git a/tools/benchmark/benchmark.py b/tools/benchmark/benchmark.py index 6ba7a0de91f6e0..754bdd194e711e 100644 --- a/tools/benchmark/benchmark.py +++ b/tools/benchmark/benchmark.py @@ -27,20 +27,19 @@ def __del__(self): def add_extension(self, path_to_extension: str=None, path_to_cldnn_config: str=None): if path_to_cldnn_config: self.ie.set_config({'CONFIG_FILE': path_to_cldnn_config}, GPU_DEVICE_NAME) - logger.info('GPU extensions is loaded {}'.format(path_to_cldnn_config)) + logger.info(f'GPU extensions is loaded {path_to_cldnn_config}') if path_to_extension: self.ie.add_extension(extension_path=path_to_extension, device_name=CPU_DEVICE_NAME) - logger.info('CPU extensions is loaded {}'.format(path_to_extension)) + logger.info(f'CPU extensions is loaded {path_to_extension}') def get_version_info(self) -> str: - logger.info('InferenceEngine:\n{: <9}{:.<24} {}'.format('', 'API version', get_version())) + logger.info(f"InferenceEngine:\n{'': <9}{'API version':.<24} {get_version()}") version_string = 'Device info\n' for device, version in self.ie.get_versions(self.device).items(): - version_string += '{: <9}{}\n'.format('', device) - version_string += '{: <9}{:.<24}{} {}.{}\n'.format('', version.description, ' version', version.major, - version.minor) - version_string += '{: <9}{:.<24} {}\n'.format('', 'Build', version.build_number) + version_string += f"{'': <9}{device}\n" + version_string += f"{'': <9}{version.description:.<24}{' version'} {version.major}.{version.minor}\n" + version_string += f"{'': <9}{'Build':.<24} {version.build_number}\n" return version_string def set_config(self, config = {}): @@ -83,7 +82,7 @@ def first_infer(self, exe_network): infer_request.async_infer() status = exe_network.wait() if status != StatusCode.OK: - raise Exception("Wait for all requests is failed with status code {}!".format(status)) + raise Exception(f"Wait for all requests is failed with status code {status}!") return infer_request.latency def infer(self, exe_network, batch_size, progress_bar=None): @@ -137,7 +136,7 @@ def infer(self, exe_network, batch_size, progress_bar=None): # wait the latest inference executions status = exe_network.wait() if status != StatusCode.OK: - raise Exception("Wait for all requests is failed with status code {}!".format(status)) + raise Exception(f"Wait for all requests is failed with status code {status}!") total_duration_sec = (datetime.utcnow() - start_time).total_seconds() for infer_request_id in in_fly: diff --git a/tools/benchmark/main.py b/tools/benchmark/main.py index c3d40ea0d6b159..bdfe296e33195f 100644 --- a/tools/benchmark/main.py +++ b/tools/benchmark/main.py @@ -93,14 +93,14 @@ def is_flag_set_in_command_line(flag): ## set to user defined value config[device]['PERF_COUNT'] = 'YES' if 
args.perf_counts else 'NO' elif 'PERF_COUNT' in config[device].keys() and config[device]['PERF_COUNT'] == 'YES': - logger.warning("Performance counters for {} device is turned on. ".format(device) + + logger.warning(f"Performance counters for {device} device is turned on. " + "To print results use -pc option.") elif args.report_type in [ averageCntReport, detailedCntReport ]: - logger.warning("Turn on performance counters for {} device ".format(device) + - "since report type is {}.".format(args.report_type)) + logger.warning(f"Turn on performance counters for {device} device " + + f"since report type is {args.report_type}.") config[device]['PERF_COUNT'] = 'YES' elif args.exec_graph_path is not None: - logger.warning("Turn on performance counters for {} device ".format(device) + + logger.warning(f"Turn on performance counters for {device} device " + "due to execution graph dumping.") config[device]['PERF_COUNT'] = 'YES' else: @@ -114,11 +114,11 @@ def set_throughput_streams(): ## set to user defined value supported_config_keys = benchmark.ie.get_metric(device, 'SUPPORTED_CONFIG_KEYS') if key not in supported_config_keys: - raise Exception("Device {} doesn't support config key '{}'! ".format(device, key) + + raise Exception(f"Device {device} doesn't support config key '{key}'! " + "Please specify -nstreams for correct devices in format :,:") config[device][key] = device_number_streams[device] elif key not in config[device].keys() and args.api_type == "async": - logger.warning("-nstreams default value is determined automatically for {} device. ".format(device) + + logger.warning(f"-nstreams default value is determined automatically for {device} device. " + "Although the automatic selection usually provides a reasonable performance," "but it still may be non-optimal for some cases, for more information look at README.") if device != MYRIAD_DEVICE_NAME: ## MYRIAD sets the default number of streams implicitly @@ -139,7 +139,7 @@ def set_throughput_streams(): config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning elif 'CPU_BIND_THREAD' not in config[device].keys(): if MULTI_DEVICE_NAME in device_name and GPU_DEVICE_NAME in device_name: - logger.warning("Turn off threads pinning for {}".format(device) + + logger.warning(f"Turn off threads pinning for {device} " + "device since multi-scenario with GPU device is used.") config[device]['CPU_BIND_THREAD'] = 'NO' else: @@ -185,8 +185,8 @@ def set_throughput_streams(): start_time = datetime.utcnow() ie_network = benchmark.read_network(args.path_to_model) - duration_ms = "{:.2f}".format((datetime.utcnow() - start_time).total_seconds() * 1000) - logger.info("Read network took {} ms".format(duration_ms)) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Read network took {duration_ms} ms") if statistics: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ @@ -203,8 +203,8 @@ def set_throughput_streams(): logger.info( 'Reshaping network: {}'.format(', '.join("'{}': {}".format(k, v) for k, v in shapes.items()))) ie_network.reshape(shapes) - duration_ms = "{:.2f}".format((datetime.utcnow() - start_time).total_seconds() * 1000) - logger.info("Reshape network took {} ms".format(duration_ms)) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Reshape network took {duration_ms} ms") if statistics: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ @@ -214,7 +214,7 @@ def set_throughput_streams(): # use batch size 
according to provided layout and shapes batch_size = get_batch_size(app_inputs_info) if args.layout else ie_network.batch_size - logger.info('Network batch size: {}'.format(batch_size)) + logger.info(f'Network batch size: {batch_size}') # --------------------- 6. Configuring inputs and outputs of the model -------------------------------------------------- next_step() @@ -227,8 +227,8 @@ def set_throughput_streams(): start_time = datetime.utcnow() exe_network = benchmark.load_network(ie_network) - duration_ms = "{:.2f}".format((datetime.utcnow() - start_time).total_seconds() * 1000) - logger.info("Load network took {} ms".format(duration_ms)) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Load network took {duration_ms} ms") if statistics: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ @@ -247,8 +247,8 @@ def set_throughput_streams(): start_time = datetime.utcnow() exe_network = benchmark.import_network(args.path_to_model) - duration_ms = "{:.2f}".format((datetime.utcnow() - start_time).total_seconds() * 1000) - logger.info("Import network took {} ms".format(duration_ms)) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Import network took {duration_ms} ms") if statistics: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ @@ -297,7 +297,7 @@ def set_throughput_streams(): for nstreams in device_number_streams.items(): statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG, [ - ("number of {} streams".format(nstreams[0]), str(nstreams[1])), + (f"number of {nstreams[0]} streams", str(nstreams[1])), ]) # ------------------------------------ 10. Measuring performance ----------------------------------------------- @@ -311,8 +311,8 @@ def set_throughput_streams(): progress_bar = ProgressBar(progress_bar_total_count, args.stream_output, args.progress) if args.progress else None - duration_ms = "{:.2f}".format(benchmark.first_infer(exe_network)) - logger.info("First inference took {} ms".format(duration_ms)) + duration_ms = f"{benchmark.first_infer(exe_network):.2f}" + logger.info(f"First inference took {duration_ms} ms") if statistics: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ @@ -325,7 +325,7 @@ def set_throughput_streams(): if args.dump_config: dump_config(args.dump_config, config) - logger.info("Inference Engine configuration settings were dumped to {}".format(args.dump_config)) + logger.info(f"Inference Engine configuration settings were dumped to {args.dump_config}") if args.exec_graph_path: dump_exec_graph(exe_network, args.exec_graph_path) @@ -342,28 +342,28 @@ def set_throughput_streams(): if statistics: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ - ('total execution time (ms)', '{:.2f}'.format(get_duration_in_milliseconds(total_duration_sec))), + ('total execution time (ms)', f'{get_duration_in_milliseconds(total_duration_sec):.2f}'), ('total number of iterations', str(iteration)), ]) if MULTI_DEVICE_NAME not in device_name: statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ - ('latency (ms)', '{:.2f}'.format(latency_ms)), + ('latency (ms)', f'{latency_ms:.2f}'), ]) statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, [ - ('throughput', '{:.2f}'.format(fps)), + ('throughput', f'{fps:.2f}'), ]) if statistics: statistics.dump() - print('Count: {} iterations'.format(iteration)) - print('Duration: {:.2f} 
+        print(f'Count: {iteration} iterations')
+        print(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
         if MULTI_DEVICE_NAME not in device_name:
-            print('Latency: {:.2f} ms'.format(latency_ms))
-        print('Throughput: {:.2f} FPS'.format(fps))
+            print(f'Latency: {latency_ms:.2f} ms')
+        print(f'Throughput: {fps:.2f} FPS')
         del exe_network
diff --git a/tools/benchmark/parameters.py b/tools/benchmark/parameters.py
index 0b9289d89569de..1a4d8b84d6a7cb 100644
--- a/tools/benchmark/parameters.py
+++ b/tools/benchmark/parameters.py
@@ -17,7 +17,7 @@ def str2bool(v):
 def check_positive(value):
     ivalue = int(value)
     if ivalue <= 0:
-        raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value)
+        raise argparse.ArgumentTypeError(f"{value} is an invalid positive int value")
     return ivalue
 
 class print_help(argparse.Action):
@@ -84,8 +84,10 @@ def parse_args():
                            'Also, using nstreams>1 is inherently throughput-oriented option, while for the best-latency '
                            'estimations the number of streams should be set to 1. '
                            'See samples README for more details.')
-    args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True,
-                      help='Optional. Enforcing of floating point operations execution in bfloat16 precision where it is acceptable.')
+    args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True, choices=[True, False],
+                      help='Optional. By default, bfloat16 precision is enforced for floating point operations if the platform supports it. '
+                           '\'true\' - enable bfloat16 regardless of platform support. '
+                           '\'false\' - disable bfloat16 regardless of platform support.')
     args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
                       help='Number of threads to use for inference on the CPU, GNA '
                            '(including HETERO and MULTI cases).')
diff --git a/tools/benchmark/utils/inputs_filling.py b/tools/benchmark/utils/inputs_filling.py
index bbb1fdf63b2396..ed06ca43afa51f 100644
--- a/tools/benchmark/utils/inputs_filling.py
+++ b/tools/benchmark/utils/inputs_filling.py
@@ -15,7 +15,7 @@ def set_inputs(paths_to_input, batch_size, app_input_info, requests):
         inputs = requests[i].input_blobs
         for k, v in requests_input_data[i].items():
             if k not in inputs.keys():
-                raise Exception("No input with name {} found!".format(k))
+                raise Exception(f"No input with name {k} found!")
             inputs[k].buffer[:] = v
 
 def get_inputs(paths_to_input, batch_size, app_input_info, requests):
@@ -42,33 +42,34 @@ def get_inputs(paths_to_input, batch_size, app_input_info, requests):
     else:
         binary_to_be_used = binaries_count * batch_size * len(requests)
         if binary_to_be_used > 0 and len(binary_files) == 0:
-            logger.warning("No supported binary inputs found! Please check your file extensions: {}".format(
-                ",".join(BINARY_EXTENSIONS)))
+            logger.warning(f"No supported binary inputs found! "
" + f"Please check your file extensions: {','.join(BINARY_EXTENSIONS)}") elif binary_to_be_used > len(binary_files): logger.warning( - "Some binary input files will be duplicated: {} files are required, but only {} were provided".format( - binary_to_be_used, len(binary_files))) + f"Some binary input files will be duplicated: " + f"{binary_to_be_used} files are required, " + f"but only {len(binary_files)} were provided") elif binary_to_be_used < len(binary_files): logger.warning( - "Some binary input files will be ignored: only {} files are required from {}".format(binary_to_be_used, - len(binary_files))) + f"Some binary input files will be ignored: only {binary_to_be_used} " + f"files are required from {len(binary_files)}") images_to_be_used = images_count * batch_size * len(requests) if images_to_be_used > 0 and len(image_files) == 0: - logger.warning("No supported image inputs found! Please check your file extensions: {}".format( - ",".join(IMAGE_EXTENSIONS))) + logger.warning(f"No supported image inputs found! Please check your " + f"file extensions: {','.join(IMAGE_EXTENSIONS)}") elif images_to_be_used > len(image_files): logger.warning( - "Some image input files will be duplicated: {} files are required, but only {} were provided".format( - images_to_be_used, len(image_files))) + f"Some image input files will be duplicated: {images_to_be_used} " + f"files are required, but only {len(image_files)} were provided") elif images_to_be_used < len(image_files): logger.warning( - "Some image input files will be ignored: only {} files are required from {}".format(images_to_be_used, - len(image_files))) + f"Some image input files will be ignored: only {images_to_be_used} " + f"files are required from {len(image_files)}") requests_input_data = [] for request_id in range(0, len(requests)): - logger.info("Infer Request {} filling".format(request_id)) + logger.info(f"Infer Request {request_id} filling") input_data = {} keys = list(sorted(app_input_info.keys())) for key in keys: @@ -95,8 +96,8 @@ def get_inputs(paths_to_input, batch_size, app_input_info, requests): continue # fill with random data - logger.info("Fill input '{}' with random values ({} is expected)".format(key, "image" - if info.is_image else "some binary data")) + logger.info(f"Fill input '{key}' with random values " + f"({'image' if info.is_image else 'some binary data'} is expected)") input_data[key] = fill_blob_with_random(info) requests_input_data.append(input_data) @@ -128,11 +129,11 @@ def fill_blob_with_image(image_paths, request_id, batch_size, input_id, input_si for b in range(batch_size): image_index %= len(image_paths) image_filename = image_paths[image_index] - logger.info('Prepare image {}'.format(image_filename)) + logger.info(f'Prepare image {image_filename}') image = cv2.imread(image_filename) new_im_size = tuple((info.width, info.height)) if image.shape[:-1] != new_im_size: - logger.warning("Image is resized from ({}) to ({})".format(image.shape[:-1], new_im_size)) + logger.warning(f"Image is resized from ({image.shape[:-1]}) to ({new_im_size})") image = cv2.resize(image, new_im_size) if info.layout in ['NCHW', 'CHW']: image = image.transpose((2, 0, 1)) @@ -173,7 +174,7 @@ def fill_blob_with_binary(binary_paths, request_id, batch_size, input_id, input_ blob_size = dtype().nbytes * int(np.prod(shape)) if blob_size != binary_file_size: raise Exception( - "File {} contains {} bytes but network expects {}".format(binary_filename, binary_file_size, blob_size)) + f"File {binary_filename} contains {binary_file_size} bytes 
         binaries[b] = np.reshape(np.fromfile(binary_filename, dtype), shape)
         binary_index += input_size
diff --git a/tools/benchmark/utils/statistics_report.py b/tools/benchmark/utils/statistics_report.py
index d8791d4ba8083f..553c1fe1f616a1 100644
--- a/tools/benchmark/utils/statistics_report.py
+++ b/tools/benchmark/utils/statistics_report.py
@@ -38,7 +38,7 @@ def add_parameters(self, category, parameters):
     def dump(self):
         def dump_parameters(f, parameters):
             for k, v in parameters:
-                f.write('{}{}{}\n'.format(k, self.csv_separator, v))
+                f.write(f'{k}{self.csv_separator}{v}\n')
 
         with open(os.path.join(self.config.report_folder, 'benchmark_report.csv'), 'w') as f:
             if self.Category.COMMAND_LINE_PARAMETERS in self.parameters.keys():
@@ -56,7 +56,7 @@ def dump_parameters(f, parameters):
                 dump_parameters(f, self.parameters[self.Category.EXECUTION_RESULTS])
                 f.write('\n')
-        logger.info("Statistics report is stored to {}".format(f.name))
+        logger.info(f"Statistics report is stored to {f.name}")
 
     def dump_performance_counters_request(self, f, perf_counts):
         total = 0
@@ -79,7 +79,7 @@ def dump_performance_counters(self, perf_counts):
             logger.info('Performance counters are empty. No reports are dumped.')
             return
-        filename = os.path.join(self.config.report_folder, 'benchmark_{}_report.csv'.format(self.config.report_type))
+        filename = os.path.join(self.config.report_folder, f'benchmark_{self.config.report_type}_report.csv')
         with open(filename, 'w') as f:
             if self.config.report_type == detailedCntReport:
                 for pc in perf_counts:
@@ -104,4 +104,4 @@ def get_average_performance_counters(perf_counts):
             else:
                 raise Exception('PM data can only be collected for average or detailed report types')
-        logger.info('Performance counters report is stored to {}'.format(filename))
+        logger.info(f'Performance counters report is stored to {filename}')
diff --git a/tools/benchmark/utils/utils.py b/tools/benchmark/utils/utils.py
index 7fea6b1a46e46e..758147fec0e17e 100644
--- a/tools/benchmark/utils/utils.py
+++ b/tools/benchmark/utils/utils.py
@@ -40,10 +40,10 @@ def next_step(additional_info='', step_id=0):
         next_step.step_id += 1
     if next_step.step_id not in step_names.keys():
-        raise Exception('Step ID {} is out of total steps number '.format(next_step.step_id, str(len(step_names))))
+        raise Exception(f'Step ID {next_step.step_id} is out of total steps number {str(len(step_names))}')
 
     step_info_template = '[Step {}/{}] {}'
-    step_name = step_names[next_step.step_id] + (' ({})'.format(additional_info) if additional_info else '')
+    step_name = step_names[next_step.step_id] + (f' ({additional_info})' if additional_info else '')
     step_info_template = step_info_template.format(next_step.step_id, len(step_names), step_name)
     print(step_info_template)
@@ -57,9 +57,8 @@ def _configure_network_inputs(ie_network: IENetwork, app_inputs_info, input_prec
     input_info = ie_network.input_info
     for key in input_info.keys():
-        if app_inputs_info[key].is_image:
-            app_inputs_info[key].precision = input_precision
-            input_info[key].precision = input_precision
+        app_inputs_info[key].precision = input_precision
+        input_info[key].precision = input_precision
 
 def _configure_network_outputs(ie_network: IENetwork, output_precision: str):
     output_info = ie_network.outputs
@@ -82,7 +81,7 @@ def _configure_network_inputs_and_outputs(ie_network: IENetwork, input_output_pr
         elif key in output_info:
             output_info[key].precision = value
         else:
-            raise Exception("Element '{}' does not exist in network".format(key))
+            raise Exception(f"Element '{key}' does not exist in network")
 
 def _parse_arg_map(arg_map: str):
     arg_map = arg_map.replace(" ", "")
@@ -99,19 +98,15 @@ def print_inputs_and_outputs_info(ie_network: IENetwork):
     input_info = ie_network.input_info
     for key in input_info.keys():
         tensor_desc = input_info[key].tensor_desc
-        logger.info("Network input '{}' precision {}, dimensions ({}): {}".format(key,
-                                                                                  tensor_desc.precision,
-                                                                                  tensor_desc.layout,
-                                                                                  " ".join(str(x) for x in
-                                                                                           tensor_desc.dims)))
+        logger.info(f"Network input '{key}' precision {tensor_desc.precision}, "
+                    f"dimensions ({tensor_desc.layout}): "
+                    f"{' '.join(str(x) for x in tensor_desc.dims)}")
     output_info = ie_network.outputs
     for key in output_info.keys():
         info = output_info[key]
-        logger.info("Network output '{}' precision {}, dimensions ({}): {}".format(key,
-                                                                                   info.precision,
-                                                                                   info.layout,
-                                                                                   " ".join(str(x) for x in
-                                                                                            info.shape)))
+        logger.info(f"Network output '{key}' precision {info.precision}, "
+                    f"dimensions ({info.layout}): "
+                    f"{' '.join(str(x) for x in info.shape)}")
 
 def get_number_iterations(number_iterations: int, nireq: int, api_type: str):
     niter = number_iterations
@@ -120,7 +115,7 @@ def get_number_iterations(number_iterations: int, nireq: int, api_type: str):
         niter = int((niter + nireq - 1) / nireq) * nireq
         if number_iterations != niter:
             logger.warning('Number of iterations was aligned by request number '
-                           'from {} to {} using number of requests {}'.format(number_iterations, niter, nireq))
+                           f'from {number_iterations} to {niter} using number of requests {nireq}')
     return niter
@@ -147,7 +142,7 @@ def get_duration_in_secs(target_device):
     if duration == 0:
         duration = DEVICE_DURATION_IN_SECS[UNKNOWN_DEVICE_TYPE]
-        logger.warning('Default duration {} seconds is used for unknown device {}'.format(duration, target_device))
+        logger.warning(f'Default duration {duration} seconds is used for unknown device {target_device}')
 
     return duration
@@ -188,18 +183,18 @@ def parse_nstreams_value_per_device(devices, values_string):
 
 def process_help_inference_string(benchmark_app):
-    output_string = 'Start inference {}hronously'.format(benchmark_app.api_type)
+    output_string = f'Start inference {benchmark_app.api_type}hronously'
     if benchmark_app.api_type == 'async':
-        output_string += ', {} inference requests'.format(benchmark_app.nireq)
+        output_string += f', {benchmark_app.nireq} inference requests'
         device_ss = ''
         if CPU_DEVICE_NAME in benchmark_app.device:
             device_ss += str(benchmark_app.ie.get_config(CPU_DEVICE_NAME, 'CPU_THROUGHPUT_STREAMS'))
-            device_ss += ' streams for {}'.format(CPU_DEVICE_NAME)
+            device_ss += f' streams for {CPU_DEVICE_NAME}'
         if GPU_DEVICE_NAME in benchmark_app.device:
             device_ss += ', ' if device_ss else ''
             device_ss += str(benchmark_app.ie.get_config(GPU_DEVICE_NAME, 'GPU_THROUGHPUT_STREAMS'))
-            device_ss += ' streams for {}'.format(GPU_DEVICE_NAME)
+            device_ss += f' streams for {GPU_DEVICE_NAME}'
 
         if device_ss:
             output_string += ' using ' + device_ss
@@ -207,10 +202,10 @@ def process_help_inference_string(benchmark_app):
     limits = ''
 
     if benchmark_app.niter and not benchmark_app.duration_seconds:
-        limits += '{} iterations'.format(benchmark_app.niter)
+        limits += f'{benchmark_app.niter} iterations'
 
     if benchmark_app.duration_seconds:
-        limits += '{} ms duration'.format(get_duration_in_milliseconds(benchmark_app.duration_seconds))
+        limits += f'{get_duration_in_milliseconds(benchmark_app.duration_seconds)} ms duration'
 
     if limits:
         output_string += ', limits: ' + limits
@@ -221,7 +216,7 @@ def dump_exec_graph(exe_network, exec_graph_path):
     try:
         exec_graph_info = exe_network.get_exec_graph_info()
         exec_graph_info.serialize(exec_graph_path)
-        logger.info('Executable graph is stored to {}'.format(exec_graph_path))
+        logger.info(f'Executable graph is stored to {exec_graph_path}')
         del exec_graph_info
     except Exception as e:
         logger.exception(e)
@@ -232,20 +227,19 @@ def print_perf_counters(perf_counts_list):
         perf_counts = perf_counts_list[ni]
         total_time = 0
         total_time_cpu = 0
-        logger.info("Performance counts for {}-th infer request".format(ni))
+        logger.info(f"Performance counts for {ni}-th infer request")
         for layer, stats in sorted(perf_counts.items(), key=lambda x: x[1]['execution_index']):
             max_layer_name = 30
-            print("{:<30}{:<15}{:<30}{:<20}{:<20}{:<20}".format(
-                layer[:max_layer_name - 4] + '...' if (len(layer) >= max_layer_name) else layer,
-                stats['status'],
-                'layerType: ' + str(stats['layer_type']),
-                'realTime: ' + str(stats['real_time']),
-                'cpu: ' + str(stats['cpu_time']),
-                'execType: ' + str(stats['exec_type'])))
+            print(f"{layer[:max_layer_name - 4] + '...' if (len(layer) >= max_layer_name) else layer:<30}"
+                  f"{stats['status']:<15}"
+                  f"{'layerType: ' + str(stats['layer_type']):<30}"
+                  f"{'realTime: ' + str(stats['real_time']):<20}"
+                  f"{'cpu: ' + str(stats['cpu_time']):<20}"
+                  f"{'execType: ' + str(stats['exec_type']):<20}")
             total_time += stats['real_time']
             total_time_cpu += stats['cpu_time']
-        print('Total time: {} microseconds'.format(total_time))
-        print('Total CPU time: {} microseconds\n'.format(total_time_cpu))
+        print(f'Total time: {total_time} microseconds')
+        print(f'Total CPU time: {total_time_cpu} microseconds\n')
 
 def get_command_line_arguments(argv):
     parameters = []
@@ -283,7 +277,7 @@ def parse_input_parameters(parameter_string, input_info):
                 return_value = { k:value for k in input_info.keys() }
                 break
         else:
-            raise Exception("Can't parse input parameter: {}".format(parameter_string))
+            raise Exception(f"Can't parse input parameter: {parameter_string}")
     return return_value
 
 class InputInfo:
@@ -306,7 +300,7 @@ def is_image_info(self):
     def getDimentionByLayout(self, character):
         if character not in self.layout:
-            raise Exception("Error: Can't get {} from layout {}".format(character, self.layout))
+            raise Exception(f"Error: Can't get {character} from layout {self.layout}")
         return self.shape[self.layout.index(character)]
 
     @property