From fcc07f80cd51b68ef760c002a6aef604adae619f Mon Sep 17 00:00:00 2001 From: Egor Shulman Date: Wed, 23 Jun 2021 14:22:10 +0300 Subject: [PATCH] [IE CPU] Reimplement extension nodes via MKLDNNNode API (#5784) --- .../src/mkldnn_plugin/cpu_types.h | 23 +- .../src/mkldnn_plugin/mkldnn_node.cpp | 21 + .../src/mkldnn_plugin/mkldnn_node.h | 44 +- .../src/mkldnn_plugin/nodes/bucketize.cpp | 242 ------- .../nodes/ctc_greedy_decoder.cpp | 183 ----- .../nodes/ctc_greedy_decoder_seq_len.cpp | 203 ------ .../src/mkldnn_plugin/nodes/ctc_loss.cpp | 302 -------- .../src/mkldnn_plugin/nodes/cum_sum.cpp | 271 ------- .../mkldnn_plugin/nodes/detectionoutput.cpp | 663 ------------------ .../nodes/detectionoutput_onnx.cpp | 402 ----------- .../src/mkldnn_plugin/nodes/gather_tree.cpp | 184 ----- .../src/mkldnn_plugin/nodes/grn.cpp | 91 --- .../src/mkldnn_plugin/nodes/list_tbl.hpp | 21 - .../src/mkldnn_plugin/nodes/log_softmax.cpp | 136 ---- .../nodes/mkldnn_bucketize_node.cpp | 218 ++++++ .../nodes/mkldnn_bucketize_node.h | 43 ++ .../nodes/mkldnn_concat_node.cpp | 2 +- .../nodes/mkldnn_ctc_greedy_decoder_node.cpp | 167 +++++ .../nodes/mkldnn_ctc_greedy_decoder_node.h | 32 + ...mkldnn_ctc_greedy_decoder_seq_len_node.cpp | 170 +++++ .../mkldnn_ctc_greedy_decoder_seq_len_node.h | 35 + .../nodes/mkldnn_ctc_loss_node.cpp | 279 ++++++++ .../nodes/mkldnn_ctc_loss_node.h | 32 + .../nodes/mkldnn_cum_sum_node.cpp | 279 ++++++++ .../mkldnn_plugin/nodes/mkldnn_cum_sum_node.h | 50 ++ .../nodes/mkldnn_def_conv_node.cpp | 2 +- .../nodes/mkldnn_detection_output_node.cpp | 601 ++++++++++++++++ .../nodes/mkldnn_detection_output_node.h | 86 +++ .../mkldnn_plugin/nodes/mkldnn_eltwise_node.h | 1 - .../mkldnn_embedding_bag_offset_sum_node.cpp | 2 +- .../mkldnn_embedding_bag_packed_sum_node.cpp | 2 +- .../mkldnn_embedding_segments_sum_node.cpp | 2 +- ...mental_detectron_detection_output_node.cpp | 369 ++++++++++ ...rimental_detectron_detection_output_node.h | 46 ++ 
...n_generate_proposals_single_image_node.cpp | 429 ++++++++++++ ...ron_generate_proposals_single_image_node.h | 50 ++ ...ntal_detectron_priorgridgenerator_node.cpp | 95 +++ ...mental_detectron_priorgridgenerator_node.h | 46 ++ ...tal_detectron_roifeatureextractor_node.cpp | 413 +++++++++++ ...ental_detectron_roifeatureextractor_node.h | 41 ++ ...n_experimental_detectron_topkrois_node.cpp | 82 +++ ...dnn_experimental_detectron_topkrois_node.h | 40 ++ ... => mkldnn_extract_image_patches_node.cpp} | 278 ++++---- ...pp => mkldnn_extract_image_patches_node.h} | 36 +- .../nodes/mkldnn_gather_elements_node.cpp | 2 +- .../nodes/mkldnn_gather_nd_node.cpp | 2 +- .../nodes/mkldnn_gather_node.cpp | 4 +- .../nodes/mkldnn_gather_tree_node.cpp | 148 ++++ .../nodes/mkldnn_gather_tree_node.h | 38 + .../mkldnn_plugin/nodes/mkldnn_grn_node.cpp | 81 +++ .../src/mkldnn_plugin/nodes/mkldnn_grn_node.h | 30 + .../nodes/mkldnn_log_softmax_node.cpp | 116 +++ .../nodes/mkldnn_log_softmax_node.h | 34 + .../mkldnn_plugin/nodes/mkldnn_math_node.cpp | 10 +- .../mkldnn_plugin/nodes/mkldnn_math_node.h | 1 - .../nodes/mkldnn_non_max_suppression_node.cpp | 406 +++++++++++ .../nodes/mkldnn_non_max_suppression_node.h | 102 +++ .../nodes/mkldnn_proposal_node.cpp | 198 ++++++ .../nodes/mkldnn_proposal_node.h | 42 ++ .../mkldnn_plugin/nodes/mkldnn_range_node.cpp | 140 ++++ .../mkldnn_plugin/nodes/mkldnn_range_node.h | 34 + .../nodes/mkldnn_reorg_yolo_node.cpp | 93 +++ .../nodes/mkldnn_reorg_yolo_node.h | 30 + .../nodes/mkldnn_reverse_sequence_node.cpp | 182 +++++ .../nodes/mkldnn_reverse_sequence_node.h | 38 + .../nodes/mkldnn_softmax_node.cpp | 2 +- .../mkldnn_plugin/nodes/mkldnn_topk_node.cpp | 478 +++++++++++++ .../mkldnn_plugin/nodes/mkldnn_topk_node.h | 114 +++ .../nodes/mkldnn_transpose_node.cpp | 2 +- .../nodes/non_max_suppression.cpp | 464 ------------ .../nodes/priorgridgenerator_onnx.cpp | 121 ---- .../src/mkldnn_plugin/nodes/proposal.cpp | 227 ------ 
.../src/mkldnn_plugin/nodes/proposal_onnx.cpp | 450 ------------ .../src/mkldnn_plugin/nodes/range.cpp | 164 ----- .../src/mkldnn_plugin/nodes/reorg_yolo.cpp | 99 --- .../mkldnn_plugin/nodes/reverse_sequence.cpp | 209 ------ .../nodes/roifeatureextractor_onnx.cpp | 433 ------------ .../src/mkldnn_plugin/nodes/topk.cpp | 572 --------------- .../src/mkldnn_plugin/nodes/topkrois_onnx.cpp | 101 --- .../extract_image_patches.cpp | 2 +- 80 files changed, 6169 insertions(+), 5714 deletions(-) delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/grn.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h 
create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h rename inference-engine/src/mkldnn_plugin/nodes/{extract_image_patches.cpp => mkldnn_extract_image_patches_node.cpp} (66%) rename inference-engine/src/mkldnn_plugin/nodes/{extract_image_patches.hpp => mkldnn_extract_image_patches_node.h} (64%) create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h create mode 100644 
inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.h delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/non_max_suppression.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/priorgridgenerator_onnx.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/proposal.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/proposal_onnx.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/range.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/reorg_yolo.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/reverse_sequence.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp delete mode 
100644 inference-engine/src/mkldnn_plugin/nodes/topk.cpp delete mode 100644 inference-engine/src/mkldnn_plugin/nodes/topkrois_onnx.cpp diff --git a/inference-engine/src/mkldnn_plugin/cpu_types.h b/inference-engine/src/mkldnn_plugin/cpu_types.h index d7f55446024c8c..e5bc8af0b5c745 100644 --- a/inference-engine/src/mkldnn_plugin/cpu_types.h +++ b/inference-engine/src/mkldnn_plugin/cpu_types.h @@ -64,7 +64,28 @@ enum Type { Reference, ShuffleChannels, DFT, - Math + Math, + CTCLoss, + Bucketize, + CTCGreedyDecoder, + CTCGreedyDecoderSeqLen, + CumSum, + DetectionOutput, + ExperimentalDetectronDetectionOutput, + LogSoftmax, + TopK, + GatherTree, + GRN, + Range, + Proposal, + ReorgYolo, + ReverseSequence, + ExperimentalDetectronTopKROIs, + ExperimentalDetectronROIFeatureExtractor, + ExperimentalDetectronPriorGridGenerator, + ExperimentalDetectronGenerateProposalsSingleImage, + ExtractImagePatches, + NonMaxSuppression }; enum Algorithm { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index e2e2a3276b8c78..e46c7a7b0bdf9e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -203,6 +203,27 @@ static const InferenceEngine::details::caseless_unordered_map { "SoftPlus", Math}, { "Softsign", Math}, { "Tan", Math}, + { "CTCLoss", CTCLoss}, + { "Bucketize", Bucketize}, + { "CTCGreedyDecoder", CTCGreedyDecoder}, + { "CTCGreedyDecoderSeqLen", CTCGreedyDecoderSeqLen}, + { "CumSum", CumSum}, + { "DetectionOutput", DetectionOutput}, + { "ExperimentalDetectronDetectionOutput", ExperimentalDetectronDetectionOutput}, + { "LogSoftmax", LogSoftmax}, + { "TopK", TopK}, + { "GatherTree", GatherTree}, + { "GRN", GRN}, + { "Range", Range}, + { "Proposal", Proposal}, + { "ReorgYolo", ReorgYolo}, + { "ReverseSequence", ReverseSequence}, + { "ExperimentalDetectronTopKROIs", ExperimentalDetectronTopKROIs}, + { "ExperimentalDetectronROIFeatureExtractor", 
ExperimentalDetectronROIFeatureExtractor}, + { "ExperimentalDetectronPriorGridGenerator", ExperimentalDetectronPriorGridGenerator}, + { "ExperimentalDetectronGenerateProposalsSingleImage", ExperimentalDetectronGenerateProposalsSingleImage}, + { "ExtractImagePatches", ExtractImagePatches}, + { "NonMaxSuppressionIEInternal", NonMaxSuppression} }; Type TypeFromName(const std::string type) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index e5f86f03ea0c4a..29618d51fdbaf5 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -129,7 +129,7 @@ static std::string NameFromType(Type type) { case EmbeddingBagPackedSum: return "EmbeddingBagPackedSum"; case EmbeddingBagOffsetsSum: - return "EmbeddingBagPackedSum"; + return "EmbeddingBagOffsetsSum"; case Gather: return "Gather"; case GatherElements: @@ -150,6 +150,48 @@ static std::string NameFromType(Type type) { return "DFT"; case Math: return "Math"; + case CTCLoss: + return "CTCLoss"; + case Bucketize: + return "Bucketize"; + case CTCGreedyDecoder: + return "CTCGreedyDecoder"; + case CTCGreedyDecoderSeqLen: + return "CTCGreedyDecoderSeqLen"; + case CumSum: + return "CumSum"; + case DetectionOutput: + return "DetectionOutput"; + case ExperimentalDetectronDetectionOutput: + return "ExperimentalDetectronDetectionOutput"; + case LogSoftmax: + return "LogSoftmax"; + case TopK: + return "TopK"; + case GatherTree: + return "GatherTree"; + case GRN: + return "GRN"; + case Range: + return "Range"; + case Proposal: + return "Proposal"; + case ReorgYolo: + return "ReorgYolo"; + case ReverseSequence: + return "ReverseSequence"; + case ExperimentalDetectronTopKROIs: + return "ExperimentalDetectronTopKROIs"; + case ExperimentalDetectronROIFeatureExtractor: + return "ExperimentalDetectronROIFeatureExtractor"; + case ExperimentalDetectronPriorGridGenerator: + return "ExperimentalDetectronPriorGridGenerator"; 
+ case ExperimentalDetectronGenerateProposalsSingleImage: + return "ExperimentalDetectronGenerateProposalsSingleImage"; + case ExtractImagePatches: + return "ExtractImagePatches"; + case NonMaxSuppression: + return "NonMaxSuppression"; default: return "Unknown"; } diff --git a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp b/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp deleted file mode 100644 index febdf1a8dfd0f2..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class BucketizeImpl : public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto bucketsize = std::dynamic_pointer_cast(op); - if (!bucketsize) { - errorMessage = "Only opset3 Bucketize operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - std::string errorPrefix; - -public: - explicit BucketizeImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' "; - const auto bucketsize = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 2 || op->get_output_size() != 1) { - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - } - - // check one attribute - with_right = bucketsize->get_with_right_bound(); - - // check precisions for input and output tensors - input_precision = details::convertPrecision(op->get_input_element_type(INPUT_TENSOR_PORT)); - if (input_precision != Precision::FP32 && input_precision != Precision::I32 && - input_precision != Precision::I64) { - input_precision = Precision::FP32; - } - boundaries_precision = details::convertPrecision(op->get_input_element_type(INPUT_BINS_PORT)); - if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 && - boundaries_precision != Precision::I64) { - boundaries_precision = Precision::FP32; - } - output_precision = details::convertPrecision(op->get_output_element_type(OUTPUT_TENSOR_PORT)); - if (output_precision != Precision::I32 && output_precision != Precision::I64) { - output_precision = Precision::I32; - } - - // check dimensions of input tensors - SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT); - if (input_tensor_dims.size() < 1) { - IE_THROW() << errorPrefix << " has incorrect dimensions of the input."; - } - SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT); - if (input_bin_dims.size() != 1) { - IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor."; - } - if (input_bin_dims[0] != 0) { - with_bins = true; - } - num_bin_values = input_bin_dims[0]; - - num_values = 
std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, input_precision}, - {TensorDescCreatorTypes::ncsp, boundaries_precision}}, - {{TensorDescCreatorTypes::ncsp, output_precision}}); - } - catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - auto precision_mask = getPrecisionMask(input_precision, boundaries_precision, output_precision); - - switch (precision_mask) { - case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], 
outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - 
PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - default: - return GENERAL_ERROR; - } - - return OK; - } - -private: - template - void bucketize(Blob::Ptr input, Blob::Ptr boundaries, Blob::Ptr output) { - const auto *input_data = input->cbuffer().as(); - const auto *boundaries_data = boundaries->cbuffer().as(); - auto *output_data = output->buffer().as(); - - if (with_bins == false) { - memset(output_data, 0, num_values * sizeof(T_IND)); - return; - } - - // boundaries are assumed to be sorted and to have unique elements - parallel_for(num_values, [&](size_t ind) { - T value = input_data[ind]; - if (with_right) { - auto low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); - output_data[ind] = static_cast(low - boundaries_data); - } else { - auto up = std::upper_bound(boundaries_data, boundaries_data + num_bin_values, value); - output_data[ind] = static_cast(up - boundaries_data); - } - }); - } - - const size_t INPUT_TENSOR_PORT = 0; - const size_t INPUT_BINS_PORT = 1; - const size_t OUTPUT_TENSOR_PORT = 0; - - size_t num_values = 0; - size_t num_bin_values = 0; - bool with_right = false; - bool with_bins = false; - - Precision input_precision; - Precision boundaries_precision; - Precision output_precision; -}; - -REG_FACTORY_FOR(BucketizeImpl, Bucketize); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp deleted file mode 100644 index 0ba6ca7e960230..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: 
Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCGreedyDecoderImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto greedyDecOp = ngraph::as_type_ptr(op); - if (!greedyDecOp) { - errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCGreedyDecoderImpl(const std::shared_ptr& op) : mergeRepeated_(true) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' "; - if (op->get_input_size() != 2) - IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size(); - if (op->get_output_size() != 1) - IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size(); - - if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] && - op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1]) - IE_THROW() << errPrefix << "has invalid input shapes."; - - Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX)); - if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision; - - Precision seqLenPrecision = details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX)); - if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: 
" << seqLenPrecision; - - auto greedyDecOp = ngraph::as_type_ptr(op); - mergeRepeated_ = greedyDecOp->get_ctc_merge_repeated(); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::FP32}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const float* probabilities = inputs[DATA_INDEX]->cbuffer().as() + - inputs[DATA_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const float* sequenceMask = inputs[SEQUENCE_LENGTH_INDEX]->cbuffer().as() + - inputs[SEQUENCE_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* outputSequences = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const size_t T = inputs[DATA_INDEX]->getTensorDesc().getDims()[0]; - const size_t B = inputs[DATA_INDEX]->getTensorDesc().getDims()[1]; - const int C = inputs[DATA_INDEX]->getTensorDesc().getDims()[2]; - const size_t BC = B * C; - const size_t CB1 = C * (B - 1); - - const int blankIndex = C - 1; - - std::vector sequenceLengths(B, 0); - parallel_for(B, [&](size_t b) { - size_t t = 0; - for (; t < T; t++) { - if (sequenceMask[B * t + b] == 0.f) - break; - } - sequenceLengths[b] = t; - }); - - size_t workAmount = 0; - for (size_t b = 0; b < B; b++) { - workAmount += sequenceLengths[b]; - } - - // Parallelization could not be made directly by T due to output index depends on merged classes and - // blank index, thus could not be shared between threads. Better to divide operation on two steps. - // At the first stage find the maximum index. At second stage merge if needed. - // Such approach makes parallelization more efficient. 
- auto threadBody = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(workAmount, nthr, ithr, start, end); - if (start >= end) - return; - size_t tStart = 0lu, bStart = 0lu; - for (; bStart < B; bStart++) { - tStart += sequenceLengths[bStart]; - if (tStart >= start) { - tStart = start - (tStart - sequenceLengths[bStart]); - break; - } - } - - size_t workCounter = start; - - for (size_t b = bStart; b < B; ++b) { - size_t outputIndex = b * T + tStart; - const float* probs = probabilities + b * C + BC * tStart; - size_t sequenceLength = sequenceLengths[b]; - - for (size_t t = tStart; t < sequenceLength; ++t) { - int maxClassIdx = 0; - - float maxProb = probs[0]; - ++probs; - - for (int c = 1; c < C; ++c, ++probs) { - if (*probs > maxProb) { - maxClassIdx = c; - maxProb = *probs; - } - } - probs += CB1; - outputSequences[outputIndex++] = static_cast(maxClassIdx); - - if (++workCounter >= end) { - return; - } - } - tStart = 0lu; - } - }; // thread body - - parallel_nt(0, threadBody); - - parallel_for(B, [&](size_t b) { - int prevClassIdx = -1; - size_t outputIndex = b * T; - const size_t sequenceLength = sequenceLengths[b]; - float* shiftedOut = outputSequences + b * T; - for (size_t t = 0; t < sequenceLength; ++t) { - if (*shiftedOut < blankIndex && - !(mergeRepeated_ && *shiftedOut == prevClassIdx)) { - outputSequences[outputIndex++] = *shiftedOut; - } - prevClassIdx = *shiftedOut; - shiftedOut++; - } - std::fill(outputSequences + outputIndex, outputSequences + (b + 1) * T, -1.f); - }); - - return OK; - } - -private: - const size_t DATA_INDEX = 0lu; - const size_t SEQUENCE_LENGTH_INDEX = 1lu; - bool mergeRepeated_; -}; - -REG_FACTORY_FOR(CTCGreedyDecoderImpl, CTCGreedyDecoder); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp deleted file mode 100644 
index c60684ee0af3f8..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCGreedyDecoderSeqLenImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto greedyDecOp = ngraph::as_type_ptr(op); - if (!greedyDecOp) { - errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCGreedyDecoderSeqLenImpl(const std::shared_ptr& op) : mergeRepeated_(true) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' "; - if (op->get_input_size() < 2 || op->get_input_size() > 3) - IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size(); - if (op->get_output_size() != 2) - IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size(); - - if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0]) - IE_THROW() << errPrefix << "has invalid input shapes."; - - Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX)); - if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision; - - Precision seqLenPrecision = 
details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX)); - if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64) - IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; - - auto greedyDecOp = ngraph::as_type_ptr(op); - mergeRepeated_ = greedyDecOp->get_merge_repeated(); - - if (op->get_input_size() == BLANK_INDEX) { - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}, - {{TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}); - } else { - Precision blIdxPrecision = details::convertPrecision(op->get_input_element_type(BLANK_INDEX)); - if (blIdxPrecision != Precision::I32 && blIdxPrecision != Precision::I64) - IE_THROW() << errPrefix << "has unsupported 'blank_index' input precision: " << blIdxPrecision; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}, - {{TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}); - } - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const float* probabilities = inputs[DATA_INDEX]->cbuffer().as() + - inputs[DATA_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* sequenceLengths = inputs[SEQUENCE_LENGTH_INDEX]->cbuffer().as() + - inputs[SEQUENCE_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - int* decodedClasses = outputs[DECODED_CLASSES_INDEX]->buffer().as() + - outputs[DECODED_CLASSES_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - int* decodedClassesLength = outputs[DECODED_CLASSES_LENGTH_INDEX]->buffer().as() + - 
outputs[DECODED_CLASSES_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const auto& inDims = inputs[DATA_INDEX]->getTensorDesc().getDims(); - const size_t B = inDims[0]; - const size_t T = inDims[1]; - const int C = inDims[2]; - const size_t TC = T * C; - - int blankIndex = C - 1; - if (inputs.size() > BLANK_INDEX) - blankIndex = (inputs[BLANK_INDEX]->cbuffer().as() + - inputs[BLANK_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - - size_t workAmount = 0; - for (size_t b = 0; b < B; b++) { - if (sequenceLengths[b] > T) { - if (resp) { - std::string errorMsg = errPrefix - + ". Sequence length " + std::to_string(sequenceLengths[b]) - + " cannot be greater than according decoded classes dimension size " - + std::to_string(outputs[DECODED_CLASSES_INDEX]->getTensorDesc().getDims()[1]); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - workAmount += sequenceLengths[b]; - } - // Parallelization could not be made directly by T due to output index depends on merged classes and - // blank index, thus could not be shared between threads. Better to divide operation on two steps. - // At the first stage find the maximum index. At second stage merge if needed. - // Such approach makes parallelization more efficient. 
- auto threadBody = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(workAmount, nthr, ithr, start, end); - if (start >= end) - return; - size_t tStart = 0lu, bStart = 0lu; - for (; bStart < B; bStart++) { - tStart += sequenceLengths[bStart]; - if (tStart >= start) { - tStart = start - (tStart - sequenceLengths[bStart]); - break; - } - } - - size_t workCounter = start; - - for (size_t b = bStart; b < B; ++b) { - size_t outputIndex = b * T + tStart; - const float* probs = probabilities + b * TC + C * tStart; - const size_t actualSeqLen = sequenceLengths[b]; - - for (size_t t = tStart; t < actualSeqLen; ++t) { - int maxClassIdx = 0; - float maxProb = probs[0]; - probs++; - - for (int c = 1; c < C; c++, probs++) { - if (*probs > maxProb) { - maxClassIdx = c; - maxProb = *probs; - } - } - decodedClasses[outputIndex++] = maxClassIdx; - - if (++workCounter >= end) { - return; - } - } - tStart = 0lu; - } - }; // thread body - - parallel_nt(0, threadBody); - - parallel_for(B, [&](size_t b) { - int prevClassIdx = -1; - size_t outputIndex = b * T; - const size_t actualSeqLen = sequenceLengths[b]; - int* shiftedOut = decodedClasses + b * T; - - for (size_t t = 0; t < actualSeqLen; ++t) { - if (*shiftedOut != blankIndex && - !(mergeRepeated_ && *shiftedOut == prevClassIdx)) { - decodedClasses[outputIndex++] = *shiftedOut; - } - prevClassIdx = *shiftedOut; - shiftedOut++; - } - std::fill(decodedClasses + outputIndex, decodedClasses + (b + 1) * T, -1); - decodedClassesLength[b] = outputIndex - b * T; - }); - - return OK; - } - -private: - const size_t DATA_INDEX = 0lu; - const size_t SEQUENCE_LENGTH_INDEX = 1lu; - const size_t BLANK_INDEX = 2lu; - const size_t DECODED_CLASSES_INDEX = 0lu; - const size_t DECODED_CLASSES_LENGTH_INDEX = 1lu; - bool mergeRepeated_; - std::string errPrefix; -}; - -REG_FACTORY_FOR(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff 
--git a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp deleted file mode 100644 index 84d6b55a1a47e9..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp +++ /dev/null @@ -1,302 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include - - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCLossImpl : public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto ctcLossOp = ngraph::as_type_ptr(op); - if (!ctcLossOp) { - errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCLossImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - _logPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'"; - - if (op->get_input_size() != 4 && op->get_input_size() != 5) - IE_THROW() << _logPrefix << " has invalid inputs number."; - - auto ctcLossOp = ngraph::as_type_ptr(op); - _ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated(); - _preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated(); - _unique = ctcLossOp->get_unique(); - - std::vector inDataConfigurators; - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::FP32}); - for (int i = 1; i < op->get_input_size(); i++) { - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32}); - } - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = 
ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, - std::vector& outputs, - ResponseDesc *resp) noexcept override { - StatusCode returnCode = OK; - - const float* logits = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* logitsLength = inputs[1]->cbuffer().as() + - inputs[1]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* labels = inputs[2]->cbuffer().as() + - inputs[2]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* labelsLength = inputs[3]->cbuffer().as() + - inputs[3]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dstData = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const auto& logitsShape = inputs[0]->getTensorDesc().getDims(); - const size_t batchNum = logitsShape[0]; - const size_t maxTime = logitsShape[1]; - const size_t classesNum = logitsShape[2]; - - int blankIndex = classesNum - 1; - if (inputs.size() > 4) { - blankIndex = inputs[4]->cbuffer().as()[0]; - } - - std::vector decodedTargetLenB(batchNum, 0); - std::vector> targetDB(batchNum); - std::vector>> logProbabilitiesB(batchNum); - std::vector errorMsgB(parallel_get_max_threads()); - - auto threadBody_1 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(batchNum, nthr, ithr, start, end); - if (start >= end) - return; - - for (size_t b = start; b < end; b++) { - if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > maxTime || labelsLength[b] > logitsLength[b]) { - errorMsgB[ithr] = _logPrefix + ". Logit length cannot be greater than max sequence length. 
" - + "Label length cannot be greater than a logit length" - + " and both cannot be negative.\nMaxSeqLen: " - + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) - + "; Label len: " + std::to_string(labelsLength[b]); - returnCode = GENERAL_ERROR; - return; - } - const size_t actualLogitLen = logitsLength[b]; - const size_t actualTargetLen = labelsLength[b]; - size_t decodedTargetLen = 0lu; - - // Decoding target: merge repeated characters if preprocess_collapse_repeated == True, - // find unique elemnts if unique == True. - // Inserts blanks before each index and a blank at the end. - const int* target = &labels[b * maxTime]; - targetDB[b].resize(actualTargetLen * 2 + 1); - auto& targetD = targetDB[b]; - if (_unique) { - std::unordered_set uniqVals; - for (size_t t = 0lu; t < actualTargetLen; t++) { - if (uniqVals.find(target[t]) != uniqVals.end()) { - continue; - } - uniqVals.insert(target[t]); - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } else if (_preprocessCollapseRepeated) { - auto prevValue = target[0]; - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[0]; - for (size_t t = 1lu; t < actualTargetLen; t++) { - if (target[t] == prevValue) { - continue; - } - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = prevValue = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } else { - for (size_t t = 0lu; t < actualTargetLen; t++) { - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } - decodedTargetLenB[b] = decodedTargetLen; - - auto& logProbabilities = logProbabilitiesB[b]; - logProbabilities.resize(actualLogitLen); - for (size_t ll = 0; ll < actualLogitLen; ll++) { - logProbabilities[ll].resize(decodedTargetLen); - } - } // for batch - }; // threadBody_1 - - parallel_nt(0, threadBody_1); - if 
(returnCode != OK) { - std::string resErr(""); - for (auto& err : errorMsgB) { - if (!err.empty()) - resErr += err + "\n"; - resErr.copy(resp->msg, sizeof(resp->msg) - 1); - } - return returnCode; - } - - const size_t TC = maxTime * classesNum; - - size_t workAmount2 = 0lu; - for (size_t b = 0; b < batchNum; b++) { - workAmount2 += logitsLength[b]; - } - - auto threadBody_2 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - size_t sB(0lu), sT(0lu); - splitter(workAmount2, nthr, ithr, start, end); - if (start >= end) - return; - int64_t cw = 0, st = start; - for (; sB < batchNum; sB++) { - cw += logitsLength[sB]; - if (cw >= st) { - sT = logitsLength[sB] + st - cw; - break; - } - } - size_t workCounter = start; - - for (size_t b = sB; b < batchNum; b++) { - const size_t actualLogitLen = logitsLength[b]; - const size_t decodedTargetLen = decodedTargetLenB[b]; - auto& logProbabilities = logProbabilitiesB[b]; - auto& targetD = targetDB[b]; - - double expSum = 0.0; - size_t btcT = b * TC + sT * classesNum; - // logProbabilities = logSoftmax = logits[b][t][c] - ln(sum_c(exp(logits[b][t]))) - for (size_t t = sT; t < actualLogitLen; t++) { - expSum = 0.0; - for (size_t c = 0lu; c < classesNum; c++) { - expSum += std::exp(logits[btcT + c]); - } - for (size_t s = 0lu; s < decodedTargetLen; s++) { - logProbabilities[t][s] = logits[btcT + targetD[s]] - std::log(expSum); - } - btcT += classesNum; - if (++workCounter >= end) { - return; - } - } - sT = 0lu; - } // for batch - }; // threadBody_2 - - parallel_nt(0, threadBody_2); - - const auto float_inf = std::numeric_limits::infinity(); - - auto sumLogs = [&float_inf](float log1, float log2) { - if (log1 == -float_inf) { - return log2; - } else if (log2 == -float_inf) { - return log1; - } else { - if (log1 > log2) - return log1 + std::log1pf(std::exp(log2 - log1)); - else - return log2 + std::log1pf(std::exp(log1 - log2)); - } - }; - - auto threadBody_3 = [&](const int ithr, const int nthr) { - size_t 
start(0lu), end(0lu); - splitter(batchNum, nthr, ithr, start, end); - if (start >= end) - return; - - // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: - // Graves et al., 2016, paragraph 4.1 (10) - for (size_t b = start; b < end; b++) { - auto& targetD = targetDB[b]; - auto& logProbabilities = logProbabilitiesB[b]; - const int actualLogitLen = logitsLength[b]; - const int decodedTargetLen = decodedTargetLenB[b]; - std::vector> logBwd(decodedTargetLen, std::vector(actualLogitLen, -float_inf)); - for (int s = decodedTargetLen - 2; s < decodedTargetLen; s++) - logBwd[s][actualLogitLen - 1] = 0.f; - - for (int t = actualLogitLen - 2; t >= 0; t--) { - const int t_1 = t + 1; - for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); - s < std::min(decodedTargetLen, 2 * (t_1)); s++) { - if (_ctcMergeRepeated || targetD[s] == blankIndex) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s][t_1] + logProbabilities[t_1][s]); - } - - if (s + 1 < decodedTargetLen) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); - } - - if (s + 2 < decodedTargetLen) { - if (targetD[s] != blankIndex && (!_ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); - } - } - } - } - - logBwd[0][0] += logProbabilities[0][0]; - logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 
1 : 0]; - - dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); - } // for batch - }; // threadBody_3 - - parallel_nt(0, threadBody_3); - - return returnCode; - } // execute - -protected: - bool _ctcMergeRepeated; - bool _preprocessCollapseRepeated; - bool _unique; - - std::string _logPrefix; -}; - -REG_FACTORY_FOR(CTCLossImpl, CTCLoss); -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp b/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp deleted file mode 100644 index 8940527713cd36..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "list.hpp" -#include "base.hpp" - -#include -#include -#include "ie_parallel.hpp" -#include "ie_precision.hpp" -#include -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CumSumImpl: public ExtLayerBase { - enum { CUM_SUM_DATA, AXIS, numOfInputs }; - bool exclusive; - bool reverse; - size_t numOfDims; - size_t axis = 0; - std::vector shape; - - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto cumsum = std::dynamic_pointer_cast(op); - if (!cumsum) { - errorMessage = "Only opset3 CumSum operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - -public: - explicit CumSumImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - layerName = op->get_friendly_name(); - if ((op->get_input_size() != numOfInputs && op->get_input_size() != (numOfInputs - 1)) || op->get_output_size() != 1) - IE_THROW() << "CumSum layer with name '" << layerName << "' has incorrect number of input/output edges!"; - - const auto &dataShape = op->get_input_shape(CUM_SUM_DATA); - if (dataShape.size() < 1) { - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'data' input tensor with rank: " << dataShape.size(); - } - numOfDims = dataShape.size(); - - const auto cumsum = std::dynamic_pointer_cast(op); - exclusive = cumsum->is_exclusive(); - reverse = cumsum->is_reverse(); - - auto dataPrecision = details::convertPrecision(cumsum->get_input_element_type(CUM_SUM_DATA)); - if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 && - dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16) - IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'data' input precision: " << dataPrecision.name(); - - if (cumsum->get_input_size() == numOfInputs) { - const auto& axisTensorPrec = details::convertPrecision(cumsum->get_input_element_type(AXIS)); - if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64) - IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'axis' input precision: " << axisTensorPrec.name(); - - if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS))) - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'axis' input tensor with non scalar rank"; - } - - if (dataShape != cumsum->get_output_shape(0)) - 
IE_THROW() << "CumSum layer with name '" << layerName << "' has different 'data' input and output dimensions"; - - shape = dataShape; - - std::vector inDataConfigurators; - if (dataPrecision == Precision::BF16) - dataPrecision = Precision::FP32; - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, dataPrecision}); - if (op->get_input_size() > 1) - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32}); - addConfig(op, inDataConfigurators, {{TensorDescCreatorTypes::ncsp, dataPrecision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - if (inputs.size() == numOfInputs) - axis = getAxis(inputs[AXIS], inputs[CUM_SUM_DATA]); - - const auto &dataPrecision = inputs[CUM_SUM_DATA]->getTensorDesc().getPrecision(); - switch (dataPrecision) { - case Precision::I8 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::U8 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I16 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I32 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::FP32 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I64 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::U64 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - default : { - if (resp) { - std::string errorMsg = "CumSum layer with name '" + layerName + "' has unsupported 'data' input precision: " + dataPrecision.name(); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - } - return OK; - } - -private: - template - void execImpl(const Blob::CPtr& _input, const Blob::Ptr& _output) { - const auto *input = _input->cbuffer().as() + _input->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto *output = _output->buffer().as() + 
_output->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const std::vector strides = _input->getTensorDesc().getBlockingDesc().getStrides(); - - if (reverse) { - if (exclusive) { - cumSum(input, output, strides); - } else { - cumSum(input, output, strides); - } - } else { - if (exclusive) { - cumSum(input, output, strides); - } else { - cumSum(input, output, strides); - } - } - } - - template - void cumSum(const dataType *input, dataType *output, const std::vector &strides) { - SizeVector iterationRange(numOfDims - 1); - size_t j = 0; - for (size_t i = 0; i < shape.size(); i++) { - if (i == axis) - continue; - iterationRange[j++] = shape[i]; - } - size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies()); - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - SizeVector counters(numOfDims - 1, 0); - splitter(work_amount_dst, nthr, ithr, start, end); - - parallelItInit(start, counters, iterationRange); - - for (size_t iwork = start; iwork < end; ++iwork) { - std::vector forStartOffset(numOfDims); - forStartOffset[axis] = 0; - for (size_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) { - if (offsetIdx == axis) { - continue; - } - forStartOffset[offsetIdx] = counters[countersIdx++]; - } - - size_t startOffset = getStartOffset(forStartOffset, strides); - - const dataType *inputStart = input + startOffset; - dataType *outputStart = output + startOffset; - - size_t offset = strides[axis]; - if (reverse) { - if (exclusive) { - outputStart[offset*(shape[axis] - 1)] = 0; - for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; - } - } else { - outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; - for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; - } - } - } else { - if (exclusive) { - 
outputStart[0] = 0; - for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; - } - } else { - outputStart[0] = inputStart[0]; - for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; - } - } - } - - parallelItStep(counters, iterationRange); - } - }); - } - - void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange) { - auto itCounter = counters.rbegin(); - auto itWork = iterationRange.rbegin(); - while (itCounter != counters.rend() && itWork != iterationRange.rend()) { - *itCounter = start % *itWork; - start /= *itWork; - ++itCounter; - ++itWork; - } - } - - inline void parallelItStep(std::vector& counters, const std::vector& iterationRange) { - auto itCounter = counters.rbegin(); - auto itWork = iterationRange.rbegin(); - - while (itCounter != counters.rend() && itWork != iterationRange.rend()) { - *itCounter = (*itCounter + 1) % *itWork; - if (*itCounter != 0) { - break; - } - ++itCounter; - ++itWork; - } - } - - inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { - size_t startOffset = 0; - for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { - startOffset += forStartOffset[idx] * strides[idx]; - } - return startOffset; - } - - size_t getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const { - const auto& axisPrecision = _axis->getTensorDesc().getPrecision(); - const int64_t dataShapeSize = static_cast(_data->getTensorDesc().getDims().size()); - int64_t axisValueFromBlob; - switch (axisPrecision) { - case Precision::I32 : { - const auto *axisPtr = _axis->cbuffer().as(); - axisValueFromBlob = static_cast(axisPtr[0]); - break; - } - case Precision::I64 : { - const auto *axisPtr = _axis->cbuffer().as(); - axisValueFromBlob = axisPtr[0]; - break; - } - default : { - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 
'axis' input with precision: " << axisPrecision.name(); - } - } - if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) - IE_THROW() << "CumSum layer with name '" << layerName << "' has axis with a value out of range: " << axisValueFromBlob; - return axisValueFromBlob >= 0 ? axisValueFromBlob : (axisValueFromBlob + dataShapeSize); - } - -private: - std::string layerName; -}; - -REG_FACTORY_FOR(CumSumImpl, CumSum); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp deleted file mode 100644 index bd3b1da8fc878c..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp +++ /dev/null @@ -1,663 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include "caseless.hpp" -#include "ie_parallel.hpp" -#include "common/tensor_desc_creator.h" -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -template -static bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -class DetectionOutputImpl: public ExtLayerBase { -public: - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto doOp = ngraph::as_type_ptr(op); - if (!doOp) { - errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0."; - return false; - } - if (!details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") && - !details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) { - errorMessage = "Unsupported code_type attribute."; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - explicit DetectionOutputImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - if (op->get_input_size() != 3 && op->get_input_size() != 5) - IE_THROW() << "Invalid number of input edges."; - - if (op->get_output_size() != 1) - IE_THROW() << "Invalid number of output edges."; - - auto doOp = ngraph::as_type_ptr(op); - auto attributes = doOp->get_attrs(); - - _num_classes = attributes.num_classes; - _background_label_id = attributes.background_label_id; - _top_k = attributes.top_k; - _variance_encoded_in_target = attributes.variance_encoded_in_target; - _keep_top_k = attributes.keep_top_k[0]; - _nms_threshold = attributes.nms_threshold; - _confidence_threshold = attributes.confidence_threshold; - _share_location = attributes.share_location; - _clip_before_nms = attributes.clip_before_nms; - _clip_after_nms = attributes.clip_after_nms; - _decrease_label_id = attributes.decrease_label_id; - _normalized = attributes.normalized; - _image_height = attributes.input_height; - _image_width = attributes.input_width; - _prior_size = _normalized ? 4 : 5; - _offset = _normalized ? 0 : 1; - _num_loc_classes = _share_location ? 1 : _num_classes; - - with_add_box_pred = op->get_input_size() == 5; - _objectness_score = attributes.objectness_score; - - _code_type = (details::CaselessEq()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ? 
- CodeType::CENTER_SIZE : CodeType::CORNER); - - _num_priors = static_cast(op->get_input_shape(idx_priors).back() / _prior_size); - _priors_batches = op->get_input_shape(idx_priors).front() != 1; - - if (_num_priors * _num_loc_classes * 4 != static_cast(op->get_input_shape(idx_location)[1])) - IE_THROW() << "Number of priors must match number of location predictions (" - << _num_priors * _num_loc_classes * 4 << " vs " - << op->get_input_shape(idx_location)[1] << ")"; - - if (_num_priors * _num_classes != static_cast(op->get_input_shape(idx_confidence).back())) - IE_THROW() << "Number of priors must match number of confidence predictions."; - - if (_decrease_label_id && _background_label_id != 0) - IE_THROW() << "Cannot use decrease_label_id and background_label_id parameter simultaneously."; - - _num = static_cast(op->get_input_shape(idx_confidence)[0]); - - _decoded_bboxes.resize(_num * _num_classes * _num_priors * 4); - _buffer.resize(_num * _num_classes * _num_priors); - _indices.resize(_num * _num_classes * _num_priors); - _detections_count.resize(_num * _num_classes); - _bbox_sizes.resize(_num * _num_classes * _num_priors); - _num_priors_actual.resize(_num); - - const auto &confSize = op->get_input_shape(idx_confidence); - _reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies())); - - std::vector inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32}); - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - float *dst_data = outputs[0]->buffer(); - - const float *loc_data = inputs[idx_location]->buffer().as(); - const float *conf_data = inputs[idx_confidence]->buffer().as(); - const float *prior_data = inputs[idx_priors]->buffer().as(); - const float 
*arm_conf_data = inputs.size() > 3 ? inputs[idx_arm_confidence]->buffer().as() : nullptr; - const float *arm_loc_data = inputs.size() > 4 ? inputs[idx_arm_location]->buffer().as() : nullptr; - - const int N = inputs[idx_confidence]->getTensorDesc().getDims()[0]; - - float *decoded_bboxes_data = _decoded_bboxes.data(); - float *reordered_conf_data = _reordered_conf.data(); - float *bbox_sizes_data = _bbox_sizes.data(); - int *detections_data = _detections_count.data(); - int *buffer_data = _buffer.data(); - int *indices_data = _indices.data(); - int *num_priors_actual = _num_priors_actual.data(); - - for (int n = 0; n < N; ++n) { - const float *ppriors = prior_data; - const float *prior_variances = prior_data + _num_priors*_prior_size; - if (_priors_batches) { - ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; - prior_variances += _variance_encoded_in_target ? 0 : 2*n*_num_priors*_prior_size; - } - - if (_share_location) { - const float *ploc = loc_data + n*4*_num_priors; - float *pboxes = decoded_bboxes_data + n*4*_num_priors; - float *psizes = bbox_sizes_data + n*_num_priors; - - if (with_add_box_pred) { - const float *p_arm_loc = arm_loc_data + n*4*_num_priors; - decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); - } else { - decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - } - } else { - for (int c = 0; c < _num_loc_classes; ++c) { - if (c == _background_label_id) { - continue; - } - const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4; - float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors; - float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors; - if (with_add_box_pred) { - const float *p_arm_loc = arm_loc_data + 
n*4*_num_loc_classes*_num_priors + c*4; - decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); - } else { - decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - } - } - } - } - - if (with_add_box_pred) { - for (int n = 0; n < N; ++n) { - for (int p = 0; p < _num_priors; ++p) { - if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) { - for (int c = 0; c < _num_classes; ++c) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 1.0f : 0.0f; - } - } else { - for (int c = 0; c < _num_classes; ++c) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; - } - } - } - } - } else { - for (int n = 0; n < N; ++n) { - for (int c = 0; c < _num_classes; ++c) { - for (int p = 0; p < _num_priors; ++p) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; - } - } - } - } - - memset(detections_data, 0, N*_num_classes*sizeof(int)); - - for (int n = 0; n < N; ++n) { - int detections_total = 0; - - if (!_decrease_label_id) { - // Caffe style - parallel_for(_num_classes, [&](int c) { - if (c != _background_label_id) { // Ignore background class - int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; - int *pbuffer = buffer_data + c*_num_priors; - int *pdetections = detections_data + n*_num_classes + c; - - const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; - const float *pboxes; - const float *psizes; - if (_share_location) { - pboxes = decoded_bboxes_data + n*4*_num_priors; - psizes = bbox_sizes_data + n*_num_priors; - } else { - pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + 
c*4*_num_priors; - psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors; - } - - nms_cf(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]); - } - }); - } else { - // MXNet style - int *pindices = indices_data + n*_num_classes*_num_priors; - int *pbuffer = buffer_data; - int *pdetections = detections_data + n*_num_classes; - - const float *pconf = reordered_conf_data + n*_num_classes*_num_priors; - const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors; - const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors; - - nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors); - } - - for (int c = 0; c < _num_classes; ++c) { - detections_total += detections_data[n*_num_classes + c]; - } - - if (_keep_top_k > -1 && detections_total > _keep_top_k) { - std::vector>> conf_index_class_map; - - for (int c = 0; c < _num_classes; ++c) { - int detections = detections_data[n*_num_classes + c]; - int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; - - float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; - - for (int i = 0; i < detections; ++i) { - int idx = pindices[i]; - conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx))); - } - } - - std::sort(conf_index_class_map.begin(), conf_index_class_map.end(), - SortScorePairDescend>); - conf_index_class_map.resize(_keep_top_k); - - // Store the new indices. 
- memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); - - for (size_t j = 0; j < conf_index_class_map.size(); ++j) { - int label = conf_index_class_map[j].second.first; - int idx = conf_index_class_map[j].second.second; - int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; - pindices[detections_data[n*_num_classes + label]] = idx; - detections_data[n*_num_classes + label]++; - } - } - } - - const int num_results = outputs[0]->getTensorDesc().getDims()[2]; - const int DETECTION_SIZE = outputs[0]->getTensorDesc().getDims()[3]; - if (DETECTION_SIZE != 7) { - return NOT_IMPLEMENTED; - } - - int dst_data_size = 0; - if (_keep_top_k > 0) - dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float); - else if (_top_k > 0) - dst_data_size = N * _top_k * _num_classes * DETECTION_SIZE * sizeof(float); - else - dst_data_size = N * _num_classes * _num_priors * DETECTION_SIZE * sizeof(float); - - if (dst_data_size > outputs[0]->byteSize()) { - return OUT_OF_BOUNDS; - } - memset(dst_data, 0, dst_data_size); - - int count = 0; - for (int n = 0; n < N; ++n) { - const float *pconf = reordered_conf_data + n * _num_priors * _num_classes; - const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes; - const int *pindices = indices_data + n*_num_classes*_num_priors; - - for (int c = 0; c < _num_classes; ++c) { - for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { - int idx = pindices[c*_num_priors + i]; - - dst_data[count * DETECTION_SIZE + 0] = static_cast(n); - dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); - dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; - - float xmin = _share_location ? pboxes[idx*4 + 0] : - pboxes[c*4*_num_priors + idx*4 + 0]; - float ymin = _share_location ? pboxes[idx*4 + 1] : - pboxes[c*4*_num_priors + idx*4 + 1]; - float xmax = _share_location ? 
pboxes[idx*4 + 2] : - pboxes[c*4*_num_priors + idx*4 + 2]; - float ymax = _share_location ? pboxes[idx*4 + 3] : - pboxes[c*4*_num_priors + idx*4 + 3]; - - if (_clip_after_nms) { - xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); - ymin = (std::max)(0.0f, (std::min)(1.0f, ymin)); - xmax = (std::max)(0.0f, (std::min)(1.0f, xmax)); - ymax = (std::max)(0.0f, (std::min)(1.0f, ymax)); - } - - dst_data[count * DETECTION_SIZE + 3] = xmin; - dst_data[count * DETECTION_SIZE + 4] = ymin; - dst_data[count * DETECTION_SIZE + 5] = xmax; - dst_data[count * DETECTION_SIZE + 6] = ymax; - - ++count; - } - } - } - - if (count < num_results) { - // marker at end of boxes list - dst_data[count * DETECTION_SIZE + 0] = -1; - } - - return OK; - } - -private: - const int idx_location = 0; - const int idx_confidence = 1; - const int idx_priors = 2; - const int idx_arm_confidence = 3; - const int idx_arm_location = 4; - - int _num_classes = 0; - int _background_label_id = 0; - int _top_k = 0; - int _variance_encoded_in_target = 0; - int _keep_top_k = 0; - int _code_type = 0; - - bool _share_location = false; - bool _clip_before_nms = false; // clip bounding boxes before nms step - bool _clip_after_nms = false; // clip bounding boxes after nms step - bool _decrease_label_id = false; - - bool with_add_box_pred = false; - - int _image_width = 0; - int _image_height = 0; - int _prior_size = 4; - bool _normalized = true; - int _offset = 0; - - float _nms_threshold = 0.0f; - float _confidence_threshold = 0.0f; - float _objectness_score = 0.0f; - - int _num = 0; - int _num_loc_classes = 0; - int _num_priors = 0; - bool _priors_batches = false; - - enum CodeType { - CORNER = 1, - CENTER_SIZE = 2, - }; - - void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data, - float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, - bool decodeType = true); // after ARM = false - - void nms_cf(const float 
*conf_data, const float *bboxes, const float *sizes, - int *buffer, int *indices, int &detections, int num_priors_actual); - - void nms_mx(const float *conf_data, const float *bboxes, const float *sizes, - int *buffer, int *indices, int *detections, int num_priors_actual); - - std::vector _decoded_bboxes; - std::vector _buffer; - std::vector _indices; - std::vector _detections_count; - std::vector _reordered_conf; - std::vector _bbox_sizes; - std::vector _num_priors_actual; -}; - -struct ConfidenceComparator { - explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} - - bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; - return idx1 < idx2; - } - - const float* _conf_data; -}; - -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, - const int idx1, - const int idx2) { - float xmin1 = decoded_bbox[idx1*4 + 0]; - float ymin1 = decoded_bbox[idx1*4 + 1]; - float xmax1 = decoded_bbox[idx1*4 + 2]; - float ymax1 = decoded_bbox[idx1*4 + 3]; - - float xmin2 = decoded_bbox[idx2*4 + 0]; - float ymin2 = decoded_bbox[idx2*4 + 1]; - float xmax2 = decoded_bbox[idx2*4 + 2]; - float ymax2 = decoded_bbox[idx2*4 + 3]; - - if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { - return 0.0f; - } - - float intersect_xmin = (std::max)(xmin1, xmin2); - float intersect_ymin = (std::max)(ymin1, ymin2); - float intersect_xmax = (std::min)(xmax1, xmax2); - float intersect_ymax = (std::min)(ymax1, ymax2); - - float intersect_width = intersect_xmax - intersect_xmin; - float intersect_height = intersect_ymax - intersect_ymin; - - if (intersect_width <= 0 || intersect_height <= 0) { - return 0.0f; - } - - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bbox_sizes[idx1]; - float bbox2_size = bbox_sizes[idx2]; - - return intersect_size / (bbox1_size + bbox2_size - intersect_size); -} - 
-void DetectionOutputImpl::decodeBBoxes(const float *prior_data, - const float *loc_data, - const float *variance_data, - float *decoded_bboxes, - float *decoded_bbox_sizes, - int* num_priors_actual, - int n, - const int& offs, - const int& pr_size, - bool decodeType) { - num_priors_actual[n] = _num_priors; - if (!_normalized && decodeType) { - int num = 0; - for (; num < _num_priors; ++num) { - float batch_id = prior_data[num * pr_size + 0]; - if (batch_id == -1.f) { - num_priors_actual[n] = num; - break; - } - } - } - parallel_for(num_priors_actual[n], [&](int p) { - float new_xmin = 0.0f; - float new_ymin = 0.0f; - float new_xmax = 0.0f; - float new_ymax = 0.0f; - - float prior_xmin = prior_data[p*pr_size + 0 + offs]; - float prior_ymin = prior_data[p*pr_size + 1 + offs]; - float prior_xmax = prior_data[p*pr_size + 2 + offs]; - float prior_ymax = prior_data[p*pr_size + 3 + offs]; - - float loc_xmin = loc_data[4*p*_num_loc_classes + 0]; - float loc_ymin = loc_data[4*p*_num_loc_classes + 1]; - float loc_xmax = loc_data[4*p*_num_loc_classes + 2]; - float loc_ymax = loc_data[4*p*_num_loc_classes + 3]; - - if (!_normalized) { - prior_xmin /= _image_width; - prior_ymin /= _image_height; - prior_xmax /= _image_width; - prior_ymax /= _image_height; - } - - if (_code_type == CodeType::CORNER) { - if (_variance_encoded_in_target) { - // variance is encoded in target, we simply need to add the offset predictions. 
- new_xmin = prior_xmin + loc_xmin; - new_ymin = prior_ymin + loc_ymin; - new_xmax = prior_xmax + loc_xmax; - new_ymax = prior_ymax + loc_ymax; - } else { - new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin; - new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin; - new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax; - new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax; - } - } else if (_code_type == CodeType::CENTER_SIZE) { - float prior_width = prior_xmax - prior_xmin; - float prior_height = prior_ymax - prior_ymin; - float prior_center_x = (prior_xmin + prior_xmax) / 2.0f; - float prior_center_y = (prior_ymin + prior_ymax) / 2.0f; - - float decode_bbox_center_x, decode_bbox_center_y; - float decode_bbox_width, decode_bbox_height; - - if (_variance_encoded_in_target) { - // variance is encoded in target, we simply need to restore the offset predictions. - decode_bbox_center_x = loc_xmin * prior_width + prior_center_x; - decode_bbox_center_y = loc_ymin * prior_height + prior_center_y; - decode_bbox_width = std::exp(loc_xmax) * prior_width; - decode_bbox_height = std::exp(loc_ymax) * prior_height; - } else { - // variance is encoded in bbox, we need to scale the offset accordingly. 
- decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x; - decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y; - decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width; - decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height; - } - - new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; - new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; - new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; - new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; - } - - if (_clip_before_nms) { - new_xmin = (std::max)(0.0f, (std::min)(1.0f, new_xmin)); - new_ymin = (std::max)(0.0f, (std::min)(1.0f, new_ymin)); - new_xmax = (std::max)(0.0f, (std::min)(1.0f, new_xmax)); - new_ymax = (std::max)(0.0f, (std::min)(1.0f, new_ymax)); - } - - decoded_bboxes[p*4 + 0] = new_xmin; - decoded_bboxes[p*4 + 1] = new_ymin; - decoded_bboxes[p*4 + 2] = new_xmax; - decoded_bboxes[p*4 + 3] = new_ymax; - - decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin); - }); -} - -void DetectionOutputImpl::nms_cf(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int& detections, - int num_priors_actual) { - int count = 0; - for (int i = 0; i < num_priors_actual; ++i) { - if (conf_data[i] > _confidence_threshold) { - indices[count] = i; - count++; - } - } - - int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - - bool keep = true; - for (int k = 0; k < detections; ++k) { - const int kept_idx = indices[k]; - float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); - if (overlap > _nms_threshold) { - keep = false; - break; - } - } - if (keep) { - indices[detections] = idx; - detections++; - } - } -} - -void DetectionOutputImpl::nms_mx(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int* detections, - int num_priors_actual) { - int count = 0; - for (int i = 0; i < num_priors_actual; ++i) { - float conf = -1; - int id = 0; - for (int c = 1; c < _num_classes; ++c) { - float temp = conf_data[c*_num_priors + i]; - if (temp > conf) { - conf = temp; - id = c; - } - } - - if (id > 0 && conf >= _confidence_threshold) { - indices[count++] = id*_num_priors + i; - } - } - - int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - const int cls = idx/_num_priors; - const int prior = idx%_num_priors; - - int &ndetection = detections[cls]; - int *pindices = indices + cls*_num_priors; - - bool keep = true; - for (int k = 0; k < ndetection; ++k) { - const int kept_idx = pindices[k]; - float overlap = 0.0f; - if (_share_location) { - overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx); - } else { - overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx); - } - if (overlap > _nms_threshold) { - keep = false; - break; - } - } - if (keep) { - pindices[ndetection++] = prior; - } - } -} - -REG_FACTORY_FOR(DetectionOutputImpl, DetectionOutput); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp deleted file mode 100644 index fefcee872cea4f..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include "common/tensor_desc_creator.h" -#include - - -namespace { -struct Indexer { - const std::vector dims_; - int total_{1}; - - explicit Indexer(const std::vector& dims) : dims_(dims) { - total_ = 1; - for (size_t i = 0; i < dims_.size(); ++i) { - total_ *= dims_[i]; - } - } - - int operator()(const std::vector& idx) const { - int flat_idx = 0; - assert(idx.size() == dims_.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - assert(0 <= idx[i] && idx[i] < dims_[i]); - flat_idx 
= flat_idx * dims_[i] + idx[i]; - } - assert(flat_idx < total_); - return flat_idx; - } -}; -} // namespace - - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -static -void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, - float* refined_boxes, float* refined_boxes_areas, float* refined_scores, - const int rois_num, const int classes_num, - const float img_H, const float img_W, - const float max_delta_log_wh, - float coordinates_offset) { - Indexer box_idx({rois_num, 4}); - Indexer delta_idx({rois_num, classes_num, 4}); - Indexer score_idx({rois_num, classes_num}); - - Indexer refined_box_idx({classes_num, rois_num, 4}); - Indexer refined_score_idx({classes_num, rois_num}); - - for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { - float x0 = boxes[box_idx({roi_idx, 0})]; - float y0 = boxes[box_idx({roi_idx, 1})]; - float x1 = boxes[box_idx({roi_idx, 2})]; - float y1 = boxes[box_idx({roi_idx, 3})]; - - if (x1 - x0 <= 0 || y1 - y0 <= 0) { - continue; - } - - // width & height of box - const float ww = x1 - x0 + coordinates_offset; - const float hh = y1 - y0 + coordinates_offset; - // center location of box - const float ctr_x = x0 + 0.5f * ww; - const float ctr_y = y0 + 0.5f * hh; - - for (int class_idx = 1; class_idx < classes_num; ++class_idx) { - const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; - const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; - const float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; - const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; - - // new center location according to deltas (dx, dy) - const float pred_ctr_x = dx * ww + ctr_x; - const float pred_ctr_y = dy * hh + ctr_y; - // new width & height according to deltas d(log w), d(log h) - const float pred_w = std::exp((std::min)(d_log_w, max_delta_log_wh)) * ww; - const 
float pred_h = std::exp((std::min)(d_log_h, max_delta_log_wh)) * hh; - - // update upper-left corner location - float x0_new = pred_ctr_x - 0.5f * pred_w; - float y0_new = pred_ctr_y - 0.5f * pred_h; - // update lower-right corner location - float x1_new = pred_ctr_x + 0.5f * pred_w - coordinates_offset; - float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; - - // adjust new corner locations to be within the image region, - x0_new = std::max(0.0f, x0_new); - y0_new = std::max(0.0f, y0_new); - x1_new = std::max(0.0f, x1_new); - y1_new = std::max(0.0f, y1_new); - - // recompute new width & height - const float box_w = x1_new - x0_new + coordinates_offset; - const float box_h = y1_new - y0_new + coordinates_offset; - - refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; - - refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; - - refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; - } - } -} - -template -static bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - - -struct ConfidenceComparator { - explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} - - bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; - return idx1 < idx2; - } - - const float* _conf_data; -}; - -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, - const int idx1, - const int idx2, - const float coordinates_offset = 1) { - float xmin1 = decoded_bbox[idx1 * 4 + 0]; - float ymin1 = decoded_bbox[idx1 * 4 + 1]; - float xmax1 = decoded_bbox[idx1 * 4 + 2]; - float ymax1 = 
decoded_bbox[idx1 * 4 + 3]; - - float xmin2 = decoded_bbox[idx2 * 4 + 0]; - float ymin2 = decoded_bbox[idx2 * 4 + 1]; - float ymax2 = decoded_bbox[idx2 * 4 + 3]; - float xmax2 = decoded_bbox[idx2 * 4 + 2]; - - if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { - return 0.0f; - } - - float intersect_xmin = (std::max)(xmin1, xmin2); - float intersect_ymin = (std::max)(ymin1, ymin2); - float intersect_xmax = (std::min)(xmax1, xmax2); - float intersect_ymax = (std::min)(ymax1, ymax2); - - float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; - float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; - - if (intersect_width <= 0 || intersect_height <= 0) { - return 0.0f; - } - - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bbox_sizes[idx1]; - float bbox2_size = bbox_sizes[idx2]; - - return intersect_size / (bbox1_size + bbox2_size - intersect_size); -} - - -static void nms_cf(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int& detections, - const int boxes_num, - const int pre_nms_topn, - const int post_nms_topn, - const float confidence_threshold, - const float nms_threshold) { - int count = 0; - for (int i = 0; i < boxes_num; ++i) { - if (conf_data[i] > confidence_threshold) { - indices[count] = i; - count++; - } - } - - int num_output_scores = (pre_nms_topn == -1 ? 
count : (std::min)(pre_nms_topn, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - detections = 0; - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - - bool keep = true; - for (int k = 0; k < detections; ++k) { - const int kept_idx = indices[k]; - float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); - if (overlap > nms_threshold) { - keep = false; - break; - } - } - if (keep) { - indices[detections] = idx; - detections++; - } - } - - detections = (post_nms_topn == -1 ? detections : (std::min)(post_nms_topn, detections)); -} - - -class ExperimentalDetectronDetectionOutputImpl: public ExtLayerBase { -private: - const int INPUT_ROIS {0}; - const int INPUT_DELTAS {1}; - const int INPUT_SCORES {2}; - const int INPUT_IM_INFO {3}; - - const int OUTPUT_BOXES {0}; - const int OUTPUT_CLASSES {1}; - const int OUTPUT_SCORES {2}; - -public: - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto doOp = ngraph::as_type_ptr(op); - if (!doOp) { - errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - explicit ExperimentalDetectronDetectionOutputImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - auto doOp = ngraph::as_type_ptr(op); - auto attributes = doOp->get_attrs(); - - score_threshold_ = attributes.score_threshold; - nms_threshold_ = attributes.nms_threshold; - max_delta_log_wh_ = attributes.max_delta_log_wh; - classes_num_ = attributes.num_classes; - max_detections_per_class_ = attributes.post_nms_count; - max_detections_per_image_ = attributes.max_detections_per_image; - class_agnostic_box_regression_ = attributes.class_agnostic_box_regression; - deltas_weights_ = attributes.deltas_weights; - - std::vector inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32}); - - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const int rois_num = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0]; - assert(classes_num_ == static_cast(inputs[INPUT_SCORES]->getTensorDesc().getDims()[1])); - assert(4 * classes_num_ == static_cast(inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1])); - - const auto* boxes = inputs[INPUT_ROIS]->buffer().as(); - const auto* deltas = inputs[INPUT_DELTAS]->buffer().as(); - const auto* scores = inputs[INPUT_SCORES]->buffer().as(); - const auto* im_info = inputs[INPUT_IM_INFO]->buffer().as(); - - auto* output_boxes = outputs[OUTPUT_BOXES]->buffer().as(); - auto* output_scores = outputs[OUTPUT_SCORES]->buffer().as(); - auto* output_classes = outputs[OUTPUT_CLASSES]->buffer().as(); - - const float img_H = im_info[0]; - 
const float img_W = im_info[1]; - - // Apply deltas. - std::vector refined_boxes(classes_num_ * rois_num * 4, 0); - std::vector refined_scores(classes_num_ * rois_num, 0); - std::vector refined_boxes_areas(classes_num_ * rois_num, 0); - Indexer refined_box_idx({classes_num_, rois_num, 4}); - Indexer refined_score_idx({classes_num_, rois_num}); - - refine_boxes(boxes, deltas, &deltas_weights_[0], scores, - &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], - rois_num, classes_num_, - img_H, img_W, - max_delta_log_wh_, - 1.0f); - - // Apply NMS class-wise. - std::vector buffer(rois_num, 0); - std::vector indices(classes_num_ * rois_num, 0); - std::vector detections_per_class(classes_num_, 0); - int total_detections_num = 0; - - for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { - nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], - &refined_boxes[refined_box_idx({class_idx, 0, 0})], - &refined_boxes_areas[refined_score_idx({class_idx, 0})], - &buffer[0], - &indices[total_detections_num], - detections_per_class[class_idx], - rois_num, - -1, - max_detections_per_class_, - score_threshold_, - nms_threshold_); - total_detections_num += detections_per_class[class_idx]; - } - - // Leave only max_detections_per_image_ detections. 
- // confidence, - std::vector>> conf_index_class_map; - - int indices_offset = 0; - for (int c = 0; c < classes_num_; ++c) { - int n = detections_per_class[c]; - for (int i = 0; i < n; ++i) { - int idx = indices[indices_offset + i]; - float score = refined_scores[refined_score_idx({c, idx})]; - conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); - } - indices_offset += n; - } - - assert(max_detections_per_image_ > 0); - if (total_detections_num > max_detections_per_image_) { - std::partial_sort(conf_index_class_map.begin(), - conf_index_class_map.begin() + max_detections_per_image_, - conf_index_class_map.end(), - SortScorePairDescend>); - conf_index_class_map.resize(max_detections_per_image_); - total_detections_num = max_detections_per_image_; - } - - // Fill outputs. - memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(output_boxes[0])); - memset(output_scores, 0, max_detections_per_image_ * sizeof(output_scores[0])); - memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); - - int i = 0; - for (const auto & detection : conf_index_class_map) { - float score = detection.first; - int cls = detection.second.first; - int idx = detection.second.second; - output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; - output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; - output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; - output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; - output_scores[i] = score; - output_classes[i] = cls; - ++i; - } - - return OK; - } - -private: - float score_threshold_; - float nms_threshold_; - float max_delta_log_wh_; - int classes_num_; - int max_detections_per_class_; - int max_detections_per_image_; - bool class_agnostic_box_regression_; - std::vector deltas_weights_; -}; - - - -REG_FACTORY_FOR(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); - -} // namespace Cpu -} 
// namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp b/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp deleted file mode 100644 index 4ea74721adca49..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class GatherTreeImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto gatherElementsOp = ngraph::as_type_ptr(op); - if (!gatherElementsOp) { - errorMessage = "Node is not an instance of the GatherTree operation from operation set v1."; - return false; - } - - auto precision = op->get_input_element_type(GATHER_TREE_STEP_IDX); - if (!MKLDNNPlugin::one_of(precision, ngraph::element::f32, ngraph::element::i32)) - precision = ngraph::element::f32; - if (op->get_input_element_type(GATHER_TREE_PARENT_IDX) != precision || - op->get_input_element_type(GATHER_TREE_MAX_SEQ_LEN) != precision || - op->get_input_element_type(GATHER_TREE_END_TOKEN) != precision || - op->get_output_element_type(0) != precision) { - errorMessage = "Node has incorrect input/output data precision. Must be the same."; - return false; - } - } catch (...) 
{ - return false; - } - - return true; - } - - explicit GatherTreeImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'"; - if (op->get_input_size() != 4) - IE_THROW() << errorPrefix << " has incorrect number of input edges."; - if (op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of output edges."; - - precision = details::convertPrecision(op->get_input_element_type(GATHER_TREE_STEP_IDX)); - if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32)) - precision = Precision::FP32; - - if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3) - IE_THROW() << errorPrefix << " step_idx vector should be 3 dimension"; - if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3) - IE_THROW() << errorPrefix << " parent_idx vector should be 3 dimension"; - if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1) - IE_THROW() << errorPrefix << " max_seq_len vector should be 1 dimension"; - if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0) - IE_THROW() << errorPrefix << " end_token should be 1 dimension"; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - if (precision == Precision::FP32) - return execute_impl(inputs, outputs, resp); - else - return execute_impl(inputs, outputs, resp); - } - - template - StatusCode execute_impl(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept { - const auto 
*step_idx = inputs[GATHER_TREE_STEP_IDX]->cbuffer().as() + - inputs[GATHER_TREE_STEP_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const auto * const parent_idx = inputs[GATHER_TREE_PARENT_IDX]->cbuffer().as() + - inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const size_t parent_idx_size = inputs[GATHER_TREE_PARENT_IDX]->size() - - inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const auto *max_seq_len = inputs[GATHER_TREE_MAX_SEQ_LEN]->cbuffer().as() + - inputs[GATHER_TREE_MAX_SEQ_LEN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto end_token = (inputs[GATHER_TREE_END_TOKEN]->cbuffer().as() + - inputs[GATHER_TREE_END_TOKEN]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - auto * final_idx = outputs[0]->cbuffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - SizeVector step_idx_dims = inputs[GATHER_TREE_STEP_IDX]->getTensorDesc().getDims(); - SizeVector parent_idx_dims = inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getDims(); - SizeVector max_seq_len_dims = inputs[GATHER_TREE_MAX_SEQ_LEN]->getTensorDesc().getDims(); - SizeVector final_idx_dims = outputs[0]->getTensorDesc().getDims(); - int32_t max_time = step_idx_dims[0]; - const size_t batch_size = step_idx_dims[1]; - const size_t beam_width = step_idx_dims[2]; - const size_t bb_size = batch_size * beam_width; - - if (max_time != static_cast(parent_idx_dims[0]) || max_time != static_cast(final_idx_dims[0]) || - batch_size != parent_idx_dims[1] || batch_size != final_idx_dims[1] || batch_size != max_seq_len_dims[0] || - beam_width != parent_idx_dims[2] || beam_width != final_idx_dims[2]) { - if (resp) { - std::string errorMsg = "Input/Output tensors dimensions mismatch"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - - bool incorrect_result = false; - parallel_for2d(batch_size, beam_width, [&](size_t batch, size_t 
beam) { - int32_t max_sequence_in_beam = std::min(max_time, static_cast(max_seq_len[batch])); - if (max_sequence_in_beam > 0) { - int32_t time, idx = (max_time - 1) * bb_size + batch * beam_width; - for (time = (max_time - 1); time >= max_sequence_in_beam; time--, idx -= bb_size) - final_idx[idx + beam] = end_token; - - for (int32_t parent = static_cast(beam); time >= 0; time--, idx -= bb_size) { - if (parent < 0 - || parent >= static_cast(beam_width) - || idx + parent >= parent_idx_size) { - incorrect_result = true; - break; - } - final_idx[idx + beam] = step_idx[idx + parent]; - parent = static_cast(parent_idx[idx + parent]); - } - - bool finished = false; - auto *final = &final_idx[batch * beam_width + beam]; - for (time = 0; time < max_sequence_in_beam; time++, final += bb_size) { - if (finished) - (*final) = end_token; - else if ((*final) == end_token) - finished = true; - } - } - }); - - if (incorrect_result) { - if (resp) { - std::string errorMsg = "Wrong parent index, result is incorrect"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return OUT_OF_BOUNDS; - } - - return OK; - } - -private: - static const size_t GATHER_TREE_STEP_IDX = 0; - static const size_t GATHER_TREE_PARENT_IDX = 1; - static const size_t GATHER_TREE_MAX_SEQ_LEN = 2; - static const size_t GATHER_TREE_END_TOKEN = 3; - - InferenceEngine::Precision precision; -}; - -REG_FACTORY_FOR(GatherTreeImpl, GatherTree); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp b/inference-engine/src/mkldnn_plugin/nodes/grn.cpp deleted file mode 100644 index 6ee077fd52ff1e..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace 
InferenceEngine { -namespace Extensions { -namespace Cpu { - -class GRNImpl: public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto grn = std::dynamic_pointer_cast(op); - if (!grn) { - errorMessage = "Only opset1 GRN operation is supported"; - return false; - } - } catch (...) { - return false; - } - return true; - } - - std::string errorPrefix; - -public: - explicit GRNImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'"; - const auto grn = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - - bias = grn->get_bias(); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - float* src_data = inputs[0]->buffer(); - float* dst_data = outputs[0]->buffer(); - - SizeVector dims = inputs[0]->getTensorDesc().getDims(); - - int N = static_cast((dims.size() > 0) ? dims[0] : 1); - int C = static_cast((dims.size() > 1) ? dims[1] : 1); - int H = static_cast((dims.size() > 2) ? dims[2] : 1); - int W = static_cast((dims.size() > 3) ? 
dims[3] : 1); - - parallel_for3d(N, H, W, [&](int b, int h, int w) { - double variance = 0; - for (int c = 0; c < C; c++) { - variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); - } - variance = std::pow(variance + bias, 0.5f); - for (int c = 0; c < C; c++) { - dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); - } - }); - return OK; - } - -private: - float bias = 1.0f; -}; - -REG_FACTORY_FOR(GRNImpl, GRN); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp index d06cefa7985ac2..d005c1e16b630d 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp @@ -7,24 +7,3 @@ # define MKLDNN_EXTENSION_NODE(__prim, __type) #endif -MKLDNN_EXTENSION_NODE(CTCLossImpl, CTCLoss); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronTopKROIsImpl, ExperimentalDetectronTopKROIs); -MKLDNN_EXTENSION_NODE(ExtractImagePatchesImpl, ExtractImagePatches); -MKLDNN_EXTENSION_NODE(ReverseSequenceImpl, ReverseSequence); -MKLDNN_EXTENSION_NODE(DetectionOutputImpl, DetectionOutput); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); -MKLDNN_EXTENSION_NODE(LogSoftmaxImpl, LogSoftmax); -MKLDNN_EXTENSION_NODE(ReorgYoloImpl, ReorgYolo); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronROIFeatureExtractorImpl, ExperimentalDetectronROIFeatureExtractor); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronGenerateProposalsSingleImageImpl, ExperimentalDetectronGenerateProposalsSingleImage); -MKLDNN_EXTENSION_NODE(NonMaxSuppressionImpl, NonMaxSuppressionIEInternal); -MKLDNN_EXTENSION_NODE(TopKImpl, TopK); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronPriorGridGeneratorImpl, ExperimentalDetectronPriorGridGenerator); -MKLDNN_EXTENSION_NODE(GRNImpl, GRN); -MKLDNN_EXTENSION_NODE(BucketizeImpl, Bucketize); 
-MKLDNN_EXTENSION_NODE(CTCGreedyDecoderImpl, CTCGreedyDecoder); -MKLDNN_EXTENSION_NODE(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen); -MKLDNN_EXTENSION_NODE(ProposalImpl, Proposal); -MKLDNN_EXTENSION_NODE(RangeImpl, Range); -MKLDNN_EXTENSION_NODE(GatherTreeImpl, GatherTree); -MKLDNN_EXTENSION_NODE(CumSumImpl, CumSum); diff --git a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp deleted file mode 100644 index 337549e3434be0..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class LogSoftmaxImpl: public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto logSoftMax = std::dynamic_pointer_cast(op); - if (!logSoftMax) { - errorMessage = "Only opset5 LogSoftmax operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - -public: - explicit LogSoftmaxImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'"; - const auto logSoftMax = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - - SizeVector dims = op->get_input_shape(0); - if (!dims.size()) - dims = SizeVector(1, 1); - int axis = logSoftMax->get_axis(); - if (axis < 0) - axis += dims.size(); - - if (dims.size() < static_cast((size_t)(1) + axis)) - IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!"; - - int j; - for (j = dims.size() - 1; j >= 0; j--) { - if (dims[j] != 1) break; - } - if (j == axis) is_last_dim = true; - - for (int i = 0; i < axis; i++) - axis_step *= dims[i]; - reduced_axis_size = dims[axis]; - for (size_t i = (axis + 1); i < dims.size(); i++) - reduced_axis_stride *= dims[i]; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - const float *src_data = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dst_data = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - if (is_last_dim) { - parallel_for(axis_step, [&](size_t i) { - const float *src_dataPtr = &src_data[i * reduced_axis_size]; - float *dst_dataPtr = &dst_data[i * reduced_axis_size]; - - float reduce_prod = 0.0f; - const float max = *std::max_element(src_dataPtr, src_dataPtr + reduced_axis_size); - for (size_t 
j = 0; j < reduced_axis_size; ++j) - reduce_prod += expf(src_dataPtr[j] - max); - - reduce_prod = logf(reduce_prod); - for (size_t j = 0; j < reduced_axis_size; ++j) - dst_dataPtr[j] = src_dataPtr[j] - max - reduce_prod; - }); - } else { - parallel_for2d(axis_step, reduced_axis_stride, [&](size_t k, size_t i) { - const float *src_dataPtr = &src_data[k * reduced_axis_stride * reduced_axis_size + i]; - float *dst_dataPtr = &dst_data[k * reduced_axis_stride * reduced_axis_size + i]; - - float reduce_prod = 0.0f; - float max = std::numeric_limits::min(); - for (size_t j = 0; j < reduced_axis_size; ++j) { - if (src_dataPtr[j * reduced_axis_stride] > max) - max = src_dataPtr[j * reduced_axis_stride]; - } - - for (size_t j = 0; j < reduced_axis_size; ++j) - reduce_prod += expf(src_dataPtr[j * reduced_axis_stride] - max); - - reduce_prod = logf(reduce_prod); - for (size_t j = 0; j < reduced_axis_size; ++j) - dst_dataPtr[j * reduced_axis_stride] = src_dataPtr[j * reduced_axis_stride] - max - reduce_prod; - }); - } - - return OK; - } - -private: - size_t reduced_axis_size; - size_t reduced_axis_stride = 1; - size_t axis_step = 1; - bool is_last_dim = false; - - std::string errorPrefix; -}; - -REG_FACTORY_FOR(LogSoftmaxImpl, LogSoftmax); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp new file mode 100644 index 00000000000000..c6c327a1993f3d --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp @@ -0,0 +1,218 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_bucketize_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNBucketizeNode::isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto bucketsize = std::dynamic_pointer_cast(op); + if (!bucketsize) { + errorMessage = "Only opset3 Bucketize operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNBucketizeNode::MKLDNNBucketizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' "; + const auto bucketsize = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) { + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + } + + // check one attribute + with_right = bucketsize->get_with_right_bound(); + + // check dimensions of input tensors + SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT); + if (input_tensor_dims.size() < 1) { + IE_THROW() << errorPrefix << " has incorrect dimensions of the input."; + } + SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT); + if (input_bin_dims.size() != 1) { + IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor."; + } + if (input_bin_dims[0] != 0) { + with_bins = true; + } + num_bin_values = input_bin_dims[0]; + + num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); +} + +void MKLDNNBucketizeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + // check precisions for input and output tensors + input_precision = getOriginalInputPrecisionAtPort(INPUT_TENSOR_PORT); + if (input_precision != Precision::FP32 && input_precision != Precision::I32 && + input_precision != Precision::I64) { + input_precision = Precision::FP32; + } 
+ boundaries_precision = getOriginalInputPrecisionAtPort(INPUT_BINS_PORT); + if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 && + boundaries_precision != Precision::I64) { + boundaries_precision = Precision::FP32; + } + output_precision = getOriginalOutputPrecisionAtPort(OUTPUT_TENSOR_PORT); + if (output_precision != Precision::I32 && output_precision != Precision::I64) { + output_precision = Precision::I32; + } + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, input_precision}, + {TensorDescCreatorTypes::ncsp, boundaries_precision}}, + {{TensorDescCreatorTypes::ncsp, output_precision}}, + impl_desc_type::ref_any); +} + +void MKLDNNBucketizeNode::execute(mkldnn::stream strm) { + auto precision_mask = getPrecisionMask(input_precision, boundaries_precision, output_precision); + + switch (precision_mask) { + case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I32): + bucketize::value_type, + 
PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + default: + IE_THROW() << errorPrefix << " has 
unsupported precision: " << precision_mask; + } +} + +template +void MKLDNNBucketizeNode::bucketize() { + const auto *input_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const auto *boundaries_data = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + auto *output_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + if (!with_bins) { + memset(output_data, 0, num_values * sizeof(T_IND)); + return; + } + + // boundaries are assumed to be sorted and to have unique elements + parallel_for(num_values, [&](size_t ind) { + T value = input_data[ind]; + if (with_right) { + auto low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); + output_data[ind] = static_cast(low - boundaries_data); + } else { + auto up = std::upper_bound(boundaries_data, boundaries_data + num_bin_values, value); + output_data[ind] = static_cast(up - boundaries_data); + } + }); +} + +bool MKLDNNBucketizeNode::created() const { + return getType() == Bucketize; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNBucketizeNode, Bucketize) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h new file mode 100644 index 00000000000000..472e6aee3cfb03 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNBucketizeNode : public MKLDNNNode { +public: + MKLDNNBucketizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + template + void bucketize(); + + const size_t INPUT_TENSOR_PORT = 0; + const size_t INPUT_BINS_PORT = 1; + const size_t OUTPUT_TENSOR_PORT = 0; + + size_t num_values = 0; + size_t num_bin_values = 0; + bool with_right = false; + bool with_bins = false; + + InferenceEngine::Precision input_precision; + InferenceEngine::Precision boundaries_precision; + InferenceEngine::Precision output_precision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index ba760cae535806..4990a658d61f1c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -33,7 +33,7 @@ namespace { bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto concatOp = ngraph::as_type_ptr(op); + const auto concatOp = ngraph::as_type_ptr(op); if (!concatOp) { errorMessage = "Node is not an instance of the Concat operation."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp new file mode 100644 index 00000000000000..34c9aaf191e697 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_greedy_decoder_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCGreedyDecoderNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto greedyDecOp = 
ngraph::as_type_ptr(op); + if (!greedyDecOp) { + errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNCTCGreedyDecoderNode::MKLDNNCTCGreedyDecoderNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' "; + if (getOriginalInputsNumber() != 2) + IE_THROW() << errorPrefix << "has invalid number of input edges: " << getOriginalInputsNumber(); + if (getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << "has invalid number of outputs edges: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] && + op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1]) + IE_THROW() << errorPrefix << "has invalid input shapes."; + + auto greedyDecOp = ngraph::as_type_ptr(op); + mergeRepeated = greedyDecOp->get_ctc_merge_repeated(); +} + +void MKLDNNCTCGreedyDecoderNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision inDataPrecision = getOriginalInputPrecisionAtPort(DATA_INDEX); + if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'data' input precision: " << inDataPrecision; + + Precision seqLenPrecision = getOriginalInputPrecisionAtPort(SEQUENCE_LENGTH_INDEX); + if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + 
{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCGreedyDecoderNode::execute(mkldnn::stream strm) { + const float* probabilities = reinterpret_cast(getParentEdgeAt(DATA_INDEX)->getMemoryPtr()->GetPtr()); + const float* sequenceMask = reinterpret_cast(getParentEdgeAt(SEQUENCE_LENGTH_INDEX)->getMemoryPtr()->GetPtr()); + float* outputSequences = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const size_t T = getParentEdgeAt(DATA_INDEX)->getDims()[0]; + const size_t B = getParentEdgeAt(DATA_INDEX)->getDims()[1]; + const int C = getParentEdgeAt(DATA_INDEX)->getDims()[2]; + const size_t BC = B * C; + const size_t CB1 = C * (B - 1); + + const int blankIndex = C - 1; + + std::vector sequenceLengths(B, 0); + parallel_for(B, [&](size_t b) { + size_t t = 0; + for (; t < T; t++) { + if (sequenceMask[B * t + b] == 0.f) + break; + } + sequenceLengths[b] = t; + }); + + size_t workAmount = 0; + for (size_t b = 0; b < B; b++) { + workAmount += sequenceLengths[b]; + } + + // Parallelization could not be made directly by T due to output index depends on merged classes and + // blank index, thus could not be shared between threads. Better to divide operation on two steps. + // At the first stage find the maximum index. At second stage merge if needed. + // Such approach makes parallelization more efficient. 
+ auto threadBody = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(workAmount, nthr, ithr, start, end); + if (start >= end) + return; + size_t tStart = 0lu, bStart = 0lu; + for (; bStart < B; bStart++) { + tStart += sequenceLengths[bStart]; + if (tStart >= start) { + tStart = start - (tStart - sequenceLengths[bStart]); + break; + } + } + + size_t workCounter = start; + + for (size_t b = bStart; b < B; ++b) { + size_t outputIndex = b * T + tStart; + const float* probs = probabilities + b * C + BC * tStart; + size_t sequenceLength = sequenceLengths[b]; + + for (size_t t = tStart; t < sequenceLength; ++t) { + int maxClassIdx = 0; + + float maxProb = probs[0]; + ++probs; + + for (int c = 1; c < C; ++c, ++probs) { + if (*probs > maxProb) { + maxClassIdx = c; + maxProb = *probs; + } + } + probs += CB1; + outputSequences[outputIndex++] = static_cast(maxClassIdx); + + if (++workCounter >= end) { + return; + } + } + tStart = 0lu; + } + }; // thread body + + parallel_nt(0, threadBody); + + parallel_for(B, [&](size_t b) { + int prevClassIdx = -1; + size_t outputIndex = b * T; + const size_t sequenceLength = sequenceLengths[b]; + float* shiftedOut = outputSequences + b * T; + for (size_t t = 0; t < sequenceLength; ++t) { + if (*shiftedOut < blankIndex && + !(mergeRepeated && *shiftedOut == prevClassIdx)) { + outputSequences[outputIndex++] = *shiftedOut; + } + prevClassIdx = *shiftedOut; + shiftedOut++; + } + std::fill(outputSequences + outputIndex, outputSequences + (b + 1) * T, -1.f); + }); +} + +bool MKLDNNCTCGreedyDecoderNode::created() const { + return getType() == CTCGreedyDecoder; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCGreedyDecoderNode, CTCGreedyDecoder) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h new file mode 100644 index 00000000000000..26554ae7333dca --- /dev/null +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCGreedyDecoderNode : public MKLDNNNode { +public: + MKLDNNCTCGreedyDecoderNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t DATA_INDEX = 0lu; + const size_t SEQUENCE_LENGTH_INDEX = 1lu; + bool mergeRepeated; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp new file mode 100644 index 00000000000000..0eccdbfa1b5b07 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_greedy_decoder_seq_len_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCGreedyDecoderSeqLenNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto greedyDecOp = ngraph::as_type_ptr(op); + if (!greedyDecOp) { + errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNCTCGreedyDecoderSeqLenNode::MKLDNNCTCGreedyDecoderSeqLenNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' "; + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 3) + IE_THROW() << errorPrefix << "has invalid number of input edges: " << getOriginalInputsNumber(); + if (getOriginalOutputsNumber() != 2) + IE_THROW() << errorPrefix << "has invalid number of outputs edges: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0]) + IE_THROW() << errorPrefix << "has invalid input shapes."; + + auto greedyDecOp = ngraph::as_type_ptr(op); + mergeRepeated = greedyDecOp->get_merge_repeated(); +} + +void MKLDNNCTCGreedyDecoderSeqLenNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision inDataPrecision = getOriginalInputPrecisionAtPort(DATA_INDEX); + if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'data' input precision: " << inDataPrecision; + + Precision seqLenPrecision = getOriginalInputPrecisionAtPort(SEQUENCE_LENGTH_INDEX); + if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64) + IE_THROW() << errorPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + 
{{TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCGreedyDecoderSeqLenNode::execute(mkldnn::stream strm) { + const float* probabilities = reinterpret_cast(getParentEdgeAt(DATA_INDEX)->getMemoryPtr()->GetPtr()); + const int* sequenceLengths = reinterpret_cast(getParentEdgeAt(SEQUENCE_LENGTH_INDEX)->getMemoryPtr()->GetPtr()); + int* decodedClasses = reinterpret_cast(getChildEdgesAtPort(DECODED_CLASSES_INDEX)[0]->getMemoryPtr()->GetPtr()); + int* decodedClassesLength = reinterpret_cast(getChildEdgesAtPort(DECODED_CLASSES_LENGTH_INDEX)[0]->getMemoryPtr()->GetPtr()); + + const size_t B = getParentEdgeAt(DATA_INDEX)->getDims()[0];; + const size_t T = getParentEdgeAt(DATA_INDEX)->getDims()[1];; + const int C = getParentEdgeAt(DATA_INDEX)->getDims()[2];; + const size_t TC = T * C; + + int blankIndex = C - 1; + if (inDims.size() > BLANK_INDEX) + blankIndex = (reinterpret_cast(getParentEdgeAt(BLANK_INDEX)->getMemoryPtr()->GetPtr()))[0]; + + size_t workAmount = 0; + for (size_t b = 0; b < B; b++) { + if (sequenceLengths[b] > T) { + std::string errorMsg = errorPrefix + + ". Sequence length " + std::to_string(sequenceLengths[b]) + + " cannot be greater than according decoded classes dimension size " + + std::to_string(getChildEdgesAtPort(DECODED_CLASSES_INDEX)[0]->getDims()[1]); + IE_THROW() << errorMsg; + } + workAmount += sequenceLengths[b]; + } + // Parallelization could not be made directly by T due to output index depends on merged classes and + // blank index, thus could not be shared between threads. Better to divide operation on two steps. + // At the first stage find the maximum index. At second stage merge if needed. + // Such approach makes parallelization more efficient. 
+ auto threadBody = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(workAmount, nthr, ithr, start, end); + if (start >= end) + return; + size_t tStart = 0lu, bStart = 0lu; + for (; bStart < B; bStart++) { + tStart += sequenceLengths[bStart]; + if (tStart >= start) { + tStart = start - (tStart - sequenceLengths[bStart]); + break; + } + } + + size_t workCounter = start; + + for (size_t b = bStart; b < B; ++b) { + size_t outputIndex = b * T + tStart; + const float* probs = probabilities + b * TC + C * tStart; + const size_t actualSeqLen = sequenceLengths[b]; + + for (size_t t = tStart; t < actualSeqLen; ++t) { + int maxClassIdx = 0; + float maxProb = probs[0]; + probs++; + + for (int c = 1; c < C; c++, probs++) { + if (*probs > maxProb) { + maxClassIdx = c; + maxProb = *probs; + } + } + decodedClasses[outputIndex++] = maxClassIdx; + + if (++workCounter >= end) { + return; + } + } + tStart = 0lu; + } + }; // thread body + + parallel_nt(0, threadBody); + + parallel_for(B, [&](size_t b) { + int prevClassIdx = -1; + size_t outputIndex = b * T; + const size_t actualSeqLen = sequenceLengths[b]; + int* shiftedOut = decodedClasses + b * T; + + for (size_t t = 0; t < actualSeqLen; ++t) { + if (*shiftedOut != blankIndex && + !(mergeRepeated && *shiftedOut == prevClassIdx)) { + decodedClasses[outputIndex++] = *shiftedOut; + } + prevClassIdx = *shiftedOut; + shiftedOut++; + } + std::fill(decodedClasses + outputIndex, decodedClasses + (b + 1) * T, -1); + decodedClassesLength[b] = outputIndex - b * T; + }); +} + +bool MKLDNNCTCGreedyDecoderSeqLenNode::created() const { + return getType() == CTCGreedyDecoderSeqLen; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCGreedyDecoderSeqLenNode, CTCGreedyDecoderSeqLen) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h new file mode 100644 index 00000000000000..b1d5ab6d9ffef3 --- /dev/null +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCGreedyDecoderSeqLenNode : public MKLDNNNode { +public: + MKLDNNCTCGreedyDecoderSeqLenNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t DATA_INDEX = 0lu; + const size_t SEQUENCE_LENGTH_INDEX = 1lu; + const size_t BLANK_INDEX = 2lu; + const size_t DECODED_CLASSES_INDEX = 0lu; + const size_t DECODED_CLASSES_LENGTH_INDEX = 1lu; + bool mergeRepeated; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp new file mode 100644 index 00000000000000..b355dcaefcd4b0 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_loss_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCLossNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto ctcLossOp = ngraph::as_type_ptr(op); + if (!ctcLossOp) { + errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNCTCLossNode::MKLDNNCTCLossNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'"; + + if (getOriginalInputsNumber() != 4 && getOriginalInputsNumber() != 5) + IE_THROW() << errorPrefix << " has invalid inputs number."; + + auto ctcLossOp = ngraph::as_type_ptr(op); + ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated(); + preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated(); + unique = ctcLossOp->get_unique(); +} + +void MKLDNNCTCLossNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCLossNode::execute(mkldnn::stream strm) { + StatusCode returnCode = OK; + + const float* logits = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int* logitsLength = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + const int* labels = reinterpret_cast(getParentEdgeAt(2)->getMemoryPtr()->GetPtr()); + const int* labelsLength = reinterpret_cast(getParentEdgeAt(3)->getMemoryPtr()->GetPtr()); + float* dstData = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const size_t batchNum = getParentEdgeAt(0)->getDims()[0]; + const size_t maxTime = getParentEdgeAt(0)->getDims()[1]; + const size_t classesNum = 
getParentEdgeAt(0)->getDims()[2]; + + int blankIndex = classesNum - 1; + if (inDims.size() > 4) { + blankIndex = reinterpret_cast(getParentEdgeAt(4)->getMemoryPtr()->GetPtr())[0]; + } + + std::vector decodedTargetLenB(batchNum, 0); + std::vector> targetDB(batchNum); + std::vector>> logProbabilitiesB(batchNum); + std::vector errorMsgB(parallel_get_max_threads()); + + auto threadBody_1 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(batchNum, nthr, ithr, start, end); + if (start >= end) + return; + + for (size_t b = start; b < end; b++) { + if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > maxTime || labelsLength[b] > logitsLength[b]) { + errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. " + + "Label length cannot be greater than a logit length" + + " and both cannot be negative.\nMaxSeqLen: " + + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) + + "; Label len: " + std::to_string(labelsLength[b]); + returnCode = GENERAL_ERROR; + return; + } + const size_t actualLogitLen = logitsLength[b]; + const size_t actualTargetLen = labelsLength[b]; + size_t decodedTargetLen = 0lu; + + // Decoding target: merge repeated characters if preprocess_collapse_repeated == True, + // find unique elements if unique == True. + // Inserts blanks before each index and a blank at the end. 
+ const int* target = &labels[b * maxTime]; + targetDB[b].resize(actualTargetLen * 2 + 1); + auto& targetD = targetDB[b]; + if (unique) { + std::unordered_set uniqVals; + for (size_t t = 0lu; t < actualTargetLen; t++) { + if (uniqVals.find(target[t]) != uniqVals.end()) { + continue; + } + uniqVals.insert(target[t]); + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } else if (preprocessCollapseRepeated) { + auto prevValue = target[0]; + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[0]; + for (size_t t = 1lu; t < actualTargetLen; t++) { + if (target[t] == prevValue) { + continue; + } + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = prevValue = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } else { + for (size_t t = 0lu; t < actualTargetLen; t++) { + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } + decodedTargetLenB[b] = decodedTargetLen; + + auto& logProbabilities = logProbabilitiesB[b]; + logProbabilities.resize(actualLogitLen); + for (size_t ll = 0; ll < actualLogitLen; ll++) { + logProbabilities[ll].resize(decodedTargetLen); + } + } // for batch + }; // threadBody_1 + + parallel_nt(0, threadBody_1); + if (returnCode != OK) { + std::string resErr(""); + for (auto& err : errorMsgB) { + if (!err.empty()) + resErr += err + "\n"; + } + IE_THROW() << resErr; + } + + const size_t TC = maxTime * classesNum; + + size_t workAmount2 = 0lu; + for (size_t b = 0; b < batchNum; b++) { + workAmount2 += logitsLength[b]; + } + + auto threadBody_2 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + size_t sB(0lu), sT(0lu); + splitter(workAmount2, nthr, ithr, start, end); + if (start >= end) + return; + int64_t cw = 0, st = start; + for (; sB < batchNum; sB++) { + cw += logitsLength[sB]; + if (cw >= st) { + sT = 
logitsLength[sB] + st - cw; + break; + } + } + size_t workCounter = start; + + for (size_t b = sB; b < batchNum; b++) { + const size_t actualLogitLen = logitsLength[b]; + const size_t decodedTargetLen = decodedTargetLenB[b]; + auto& logProbabilities = logProbabilitiesB[b]; + auto& targetD = targetDB[b]; + + double expSum = 0.0; + size_t btcT = b * TC + sT * classesNum; + // logProbabilities = logSoftmax = logits[b][t][c] - ln(sum_c(exp(logits[b][t]))) + for (size_t t = sT; t < actualLogitLen; t++) { + expSum = 0.0; + for (size_t c = 0lu; c < classesNum; c++) { + expSum += std::exp(logits[btcT + c]); + } + for (size_t s = 0lu; s < decodedTargetLen; s++) { + logProbabilities[t][s] = logits[btcT + targetD[s]] - std::log(expSum); + } + btcT += classesNum; + if (++workCounter >= end) { + return; + } + } + sT = 0lu; + } // for batch + }; // threadBody_2 + + parallel_nt(0, threadBody_2); + + const auto float_inf = std::numeric_limits::infinity(); + + auto sumLogs = [&float_inf](float log1, float log2) { + if (log1 == -float_inf) { + return log2; + } else if (log2 == -float_inf) { + return log1; + } else { + if (log1 > log2) + return log1 + std::log1pf(std::exp(log2 - log1)); + else + return log2 + std::log1pf(std::exp(log1 - log2)); + } + }; + + auto threadBody_3 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(batchNum, nthr, ithr, start, end); + if (start >= end) + return; + + // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: + // Graves et al., 2006, paragraph 4.1 (10) + for (size_t b = start; b < end; b++) { + auto& targetD = targetDB[b]; + auto& logProbabilities = logProbabilitiesB[b]; + const int actualLogitLen = logitsLength[b]; + const int decodedTargetLen = decodedTargetLenB[b]; + std::vector> logBwd(decodedTargetLen, std::vector(actualLogitLen, -float_inf)); + for (int s = decodedTargetLen - 2; s < decodedTargetLen; s++) + logBwd[s][actualLogitLen - 1] = 0.f; + + 
for (int t = actualLogitLen - 2; t >= 0; t--) { + const int t_1 = t + 1; + for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); + s < std::min(decodedTargetLen, 2 * (t_1)); s++) { + if (ctcMergeRepeated || targetD[s] == blankIndex) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s][t_1] + logProbabilities[t_1][s]); + } + + if (s + 1 < decodedTargetLen) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); + } + + if (s + 2 < decodedTargetLen) { + if (targetD[s] != blankIndex && (!ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); + } + } + } + } + + logBwd[0][0] += logProbabilities[0][0]; + logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 1 : 0]; + + dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); + } // for batch + }; // threadBody_3 + + parallel_nt(0, threadBody_3); +} + +bool MKLDNNCTCLossNode::created() const { + return getType() == CTCLoss; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCLossNode, CTCLoss) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h new file mode 100644 index 00000000000000..b46ff413e829be --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCLossNode : public MKLDNNNode { +public: + MKLDNNCTCLossNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, 
std::string& errorMessage) noexcept; + +private: + bool ctcMergeRepeated; + bool preprocessCollapseRepeated; + bool unique; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp new file mode 100644 index 00000000000000..3f6c8f903482ce --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "list.hpp" +#include "base.hpp" + +#include +#include + +#include +#include +#include "ie_parallel.hpp" +#include "ie_precision.hpp" +#include +#include "mkldnn_cum_sum_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCumSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto cumsum = std::dynamic_pointer_cast(op); + if (!cumsum) { + errorMessage = "Only opset3 CumSum operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNCumSumNode::MKLDNNCumSumNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CumSum layer with name '" + op->get_friendly_name() + "' "; + + if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + const auto &dataShape = op->get_input_shape(CUM_SUM_DATA); + if (dataShape.size() < 1) { + IE_THROW() << errorPrefix << " doesn't support 'data' input tensor with rank: " << dataShape.size(); + } + numOfDims = dataShape.size(); + + const auto cumsum = std::dynamic_pointer_cast(op); + exclusive = cumsum->is_exclusive(); + reverse = cumsum->is_reverse(); + + if (getOriginalInputsNumber() == numOfInputs) { + if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS))) + IE_THROW() << errorPrefix << " doesn't support 'axis' input tensor with non scalar rank"; + } + + if (dataShape != cumsum->get_output_shape(0)) + IE_THROW() << errorPrefix << " has different 'data' input and output dimensions"; + + shape = dataShape; +} + +void MKLDNNCumSumNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + dataPrecision = getOriginalInputPrecisionAtPort(CUM_SUM_DATA); + if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 && + dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << " has unsupported 'data' input precision: " << dataPrecision.name(); + + if (getOriginalInputsNumber() == numOfInputs) { + const auto &axisTensorPrec = 
getOriginalInputPrecisionAtPort(AXIS); + if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64) + IE_THROW() << errorPrefix << " has unsupported 'axis' input precision: " << axisTensorPrec.name(); + } + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, dataPrecision); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, dataPrecision}}, + impl_desc_type::ref_any); +} + +void MKLDNNCumSumNode::execute(mkldnn::stream strm) { + if (inDims.size() == numOfInputs) + axis = getAxis(getParentEdgeAt(AXIS)->getBlob(), getParentEdgeAt(CUM_SUM_DATA)->getBlob()); + + switch (dataPrecision) { + case Precision::I8 : { + exec(); + break; + } + case Precision::U8 : { + exec(); + break; + } + case Precision::I16 : { + exec(); + break; + } + case Precision::I32 : { + exec(); + break; + } + case Precision::FP32 : { + exec(); + break; + } + case Precision::I64 : { + exec(); + break; + } + case Precision::U64 : { + exec(); + break; + } + default : { + std::string errorMsg = errorPrefix + " has unsupported 'data' input precision: " + dataPrecision.name(); + IE_THROW() << errorMsg; + } + } +} + + +template +void MKLDNNCumSumNode::exec() { + const auto *input = reinterpret_cast(getParentEdgeAt(CUM_SUM_DATA)->getMemoryPtr()->GetPtr()); + auto *output = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + const std::vector strides = getParentEdgeAt(CUM_SUM_DATA)->getDesc().getBlockingDesc().getStrides(); + + if (reverse) { + if (exclusive) { + cumSum(input, output, strides); + } else { + cumSum(input, output, strides); + } + } else { + if (exclusive) { + cumSum(input, output, strides); + } else { + cumSum(input, output, strides); + } + } +} + +template +void MKLDNNCumSumNode::cumSum(const dataType *input, dataType *output, const 
std::vector &strides) { + SizeVector iterationRange(numOfDims - 1); + size_t j = 0; + for (size_t i = 0; i < shape.size(); i++) { + if (i == axis) + continue; + iterationRange[j++] = shape[i]; + } + size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies()); + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + SizeVector counters(numOfDims - 1, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + + parallelItInit(start, counters, iterationRange); + + for (size_t iwork = start; iwork < end; ++iwork) { + std::vector forStartOffset(numOfDims); + forStartOffset[axis] = 0; + for (size_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) { + if (offsetIdx == axis) { + continue; + } + forStartOffset[offsetIdx] = counters[countersIdx++]; + } + + size_t startOffset = getStartOffset(forStartOffset, strides); + + const dataType *inputStart = input + startOffset; + dataType *outputStart = output + startOffset; + + size_t offset = strides[axis]; + if (reverse) { + if (exclusive) { + outputStart[offset*(shape[axis] - 1)] = 0; + for (int64_t i = shape[axis] - 2; i >= 0; i--) { + outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; + } + } else { + outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; + for (int64_t i = shape[axis] - 2; i >= 0; i--) { + outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; + } + } + } else { + if (exclusive) { + outputStart[0] = 0; + for (size_t i = 1; i < shape[axis]; i++) { + outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; + } + } else { + outputStart[0] = inputStart[0]; + for (size_t i = 1; i < shape[axis]; i++) { + outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; + } + } + } + + parallelItStep(counters, iterationRange); + } + }); +} + +void MKLDNNCumSumNode::parallelItInit(size_t start, std::vector& counters, const 
std::vector& iterationRange) { + auto itCounter = counters.rbegin(); + auto itWork = iterationRange.rbegin(); + while (itCounter != counters.rend() && itWork != iterationRange.rend()) { + *itCounter = start % *itWork; + start /= *itWork; + ++itCounter; + ++itWork; + } +} + +inline void MKLDNNCumSumNode::parallelItStep(std::vector& counters, const std::vector& iterationRange) { + auto itCounter = counters.rbegin(); + auto itWork = iterationRange.rbegin(); + + while (itCounter != counters.rend() && itWork != iterationRange.rend()) { + *itCounter = (*itCounter + 1) % *itWork; + if (*itCounter != 0) { + break; + } + ++itCounter; + ++itWork; + } +} + +inline size_t MKLDNNCumSumNode::getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { + size_t startOffset = 0; + for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { + startOffset += forStartOffset[idx] * strides[idx]; + } + return startOffset; +} + +size_t MKLDNNCumSumNode::getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const { + const auto& axisPrecision = _axis->getTensorDesc().getPrecision(); + const int64_t dataShapeSize = static_cast(_data->getTensorDesc().getDims().size()); + int64_t axisValueFromBlob; + switch (axisPrecision) { + case Precision::I32 : { + const auto *axisPtr = _axis->cbuffer().as(); + axisValueFromBlob = static_cast(axisPtr[0]); + break; + } + case Precision::I64 : { + const auto *axisPtr = _axis->cbuffer().as(); + axisValueFromBlob = axisPtr[0]; + break; + } + default : { + IE_THROW() << errorPrefix << " doesn't support 'axis' input with precision: " << axisPrecision.name(); + } + } + if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) + IE_THROW() << errorPrefix << " has axis with a value out of range: " << axisValueFromBlob; + return axisValueFromBlob >= 0 ? 
axisValueFromBlob : (axisValueFromBlob + dataShapeSize); +} + +bool MKLDNNCumSumNode::created() const { + return getType() == CumSum; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCumSumNode, CumSum) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h new file mode 100644 index 00000000000000..794d6bc73f1722 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCumSumNode : public MKLDNNNode { +public: + MKLDNNCumSumNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + template + void exec(); + + template + void cumSum(const dataType *input, dataType *output, const std::vector &strides); + + void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange); + + inline void parallelItStep(std::vector& counters, const std::vector& iterationRange); + + inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const; + + size_t getAxis(const InferenceEngine::Blob::CPtr& _axis, const InferenceEngine::Blob::CPtr& _data) const; + + enum { CUM_SUM_DATA, AXIS, numOfInputs }; + bool exclusive; + bool reverse; + size_t numOfDims; + size_t axis = 0; + std::vector shape; + + InferenceEngine::Precision dataPrecision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp index dde4d960c5897e..a2fae182a52f70 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp @@ -741,7 +741,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ bool MKLDNNDeformableConvolutionNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto defConvNode = ngraph::as_type_ptr(op); + const auto defConvNode = ngraph::as_type_ptr(op); if (!defConvNode) { errorMessage = "Node is not an instance of DeformableConvolution form the operation set v1."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp new file mode 100644 index 00000000000000..4b8c695a987315 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp @@ -0,0 +1,601 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_detection_output_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +bool MKLDNNDetectionOutputNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto doOp = ngraph::as_type_ptr(op); + if (!doOp) { + errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0."; + return false; + } + if (!details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") && + !details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) { + errorMessage = "Unsupported 
code_type attribute: " + doOp->get_attrs().code_type; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNDetectionOutputNode::MKLDNNDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "DetectionOutput layer with name '" + op->get_friendly_name() + "' "; + + if (getOriginalInputsNumber() != 3 && getOriginalInputsNumber() != 5) + IE_THROW() << errorPrefix << " has incorrect number of input edges."; + + if (getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of output edges."; + + auto doOp = ngraph::as_type_ptr(op); + auto attributes = doOp->get_attrs(); + + _num_classes = attributes.num_classes; + _background_label_id = attributes.background_label_id; + _top_k = attributes.top_k; + _variance_encoded_in_target = attributes.variance_encoded_in_target; + _keep_top_k = attributes.keep_top_k[0]; + _nms_threshold = attributes.nms_threshold; + _confidence_threshold = attributes.confidence_threshold; + _share_location = attributes.share_location; + _clip_before_nms = attributes.clip_before_nms; + _clip_after_nms = attributes.clip_after_nms; + _decrease_label_id = attributes.decrease_label_id; + _normalized = attributes.normalized; + _image_height = attributes.input_height; + _image_width = attributes.input_width; + _prior_size = _normalized ? 4 : 5; + _offset = _normalized ? 0 : 1; + _num_loc_classes = _share_location ? 1 : _num_classes; + + with_add_box_pred = getOriginalInputsNumber() == 5; + _objectness_score = attributes.objectness_score; + + _code_type = (details::CaselessEq()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ? 
+ CodeType::CENTER_SIZE : CodeType::CORNER); + + _num_priors = static_cast(op->get_input_shape(idx_priors).back() / _prior_size); + _priors_batches = op->get_input_shape(idx_priors).front() != 1; + + if (_num_priors * _num_loc_classes * 4 != static_cast(op->get_input_shape(idx_location)[1])) + IE_THROW() << errorPrefix << " has incorrect number of priors must match number of location predictions (" + << _num_priors * _num_loc_classes * 4 << " vs " + << op->get_input_shape(idx_location)[1] << ")"; + + if (_num_priors * _num_classes != static_cast(op->get_input_shape(idx_confidence).back())) + IE_THROW() << " has incorrect number of priors must match number of confidence predictions."; + + if (_decrease_label_id && _background_label_id != 0) + IE_THROW() << errorPrefix << " cannot use decrease_label_id and background_label_id parameter simultaneously."; + + _num = static_cast(op->get_input_shape(idx_confidence)[0]); + + _decoded_bboxes.resize(_num * _num_classes * _num_priors * 4); + _buffer.resize(_num * _num_classes * _num_priors); + _indices.resize(_num * _num_classes * _num_priors); + _detections_count.resize(_num * _num_classes); + _bbox_sizes.resize(_num * _num_classes * _num_priors); + _num_priors_actual.resize(_num); + + const auto &confSize = op->get_input_shape(idx_confidence); + _reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies())); +} + +void MKLDNNDetectionOutputNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNDetectionOutputNode::execute(mkldnn::stream strm) { + float *dst_data = 
reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const float *loc_data = reinterpret_cast(getParentEdgeAt(idx_location)->getMemoryPtr()->GetPtr()); + const float *conf_data = reinterpret_cast(getParentEdgeAt(idx_confidence)->getMemoryPtr()->GetPtr()); + const float *prior_data = reinterpret_cast(getParentEdgeAt(idx_priors)->getMemoryPtr()->GetPtr()); + const float *arm_conf_data = inDims.size() > 3 ? + reinterpret_cast(getParentEdgeAt(idx_arm_confidence)->getMemoryPtr()->GetPtr()) : nullptr; + const float *arm_loc_data = inDims.size() > 4 ? + reinterpret_cast(getParentEdgeAt(idx_arm_location)->getMemoryPtr()->GetPtr()) : nullptr; + + const int N = getParentEdgeAt(idx_confidence)->getDims()[0]; + + float *decoded_bboxes_data = _decoded_bboxes.data(); + float *reordered_conf_data = _reordered_conf.data(); + float *bbox_sizes_data = _bbox_sizes.data(); + int *detections_data = _detections_count.data(); + int *buffer_data = _buffer.data(); + int *indices_data = _indices.data(); + int *num_priors_actual = _num_priors_actual.data(); + + for (int n = 0; n < N; ++n) { + const float *ppriors = prior_data; + const float *prior_variances = prior_data + _num_priors*_prior_size; + if (_priors_batches) { + ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; + prior_variances += _variance_encoded_in_target ? 
0 : 2*n*_num_priors*_prior_size; + } + + if (_share_location) { + const float *ploc = loc_data + n*4*_num_priors; + float *pboxes = decoded_bboxes_data + n*4*_num_priors; + float *psizes = bbox_sizes_data + n*_num_priors; + + if (with_add_box_pred) { + const float *p_arm_loc = arm_loc_data + n*4*_num_priors; + decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); + } else { + decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + } + } else { + for (int c = 0; c < _num_loc_classes; ++c) { + if (c == _background_label_id) { + continue; + } + const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4; + float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors; + float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors; + if (with_add_box_pred) { + const float *p_arm_loc = arm_loc_data + n*4*_num_loc_classes*_num_priors + c*4; + decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); + } else { + decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + } + } + } + } + + if (with_add_box_pred) { + for (int n = 0; n < N; ++n) { + for (int p = 0; p < _num_priors; ++p) { + if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) { + for (int c = 0; c < _num_classes; ++c) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 
1.0f : 0.0f; + } + } else { + for (int c = 0; c < _num_classes; ++c) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; + } + } + } + } + } else { + for (int n = 0; n < N; ++n) { + for (int c = 0; c < _num_classes; ++c) { + for (int p = 0; p < _num_priors; ++p) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; + } + } + } + } + + memset(detections_data, 0, N*_num_classes*sizeof(int)); + + for (int n = 0; n < N; ++n) { + int detections_total = 0; + + if (!_decrease_label_id) { + // Caffe style + parallel_for(_num_classes, [&](int c) { + if (c != _background_label_id) { // Ignore background class + int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; + int *pbuffer = buffer_data + c*_num_priors; + int *pdetections = detections_data + n*_num_classes + c; + + const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; + const float *pboxes; + const float *psizes; + if (_share_location) { + pboxes = decoded_bboxes_data + n*4*_num_priors; + psizes = bbox_sizes_data + n*_num_priors; + } else { + pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + c*4*_num_priors; + psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors; + } + + nms_cf(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]); + } + }); + } else { + // MXNet style + int *pindices = indices_data + n*_num_classes*_num_priors; + int *pbuffer = buffer_data; + int *pdetections = detections_data + n*_num_classes; + + const float *pconf = reordered_conf_data + n*_num_classes*_num_priors; + const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors; + const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors; + + nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors); + } + + for (int c = 0; c < 
_num_classes; ++c) { + detections_total += detections_data[n*_num_classes + c]; + } + + if (_keep_top_k > -1 && detections_total > _keep_top_k) { + std::vector>> conf_index_class_map; + + for (int c = 0; c < _num_classes; ++c) { + int detections = detections_data[n*_num_classes + c]; + int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; + + float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; + + for (int i = 0; i < detections; ++i) { + int idx = pindices[i]; + conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx))); + } + } + + std::sort(conf_index_class_map.begin(), conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(_keep_top_k); + + // Store the new indices. + memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); + + for (size_t j = 0; j < conf_index_class_map.size(); ++j) { + int label = conf_index_class_map[j].second.first; + int idx = conf_index_class_map[j].second.second; + int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; + pindices[detections_data[n*_num_classes + label]] = idx; + detections_data[n*_num_classes + label]++; + } + } + } + + const int num_results = getChildEdgesAtPort(0)[0]->getDims()[2]; + const int DETECTION_SIZE = getChildEdgesAtPort(0)[0]->getDims()[3]; + if (DETECTION_SIZE != 7) { + IE_THROW() << NOT_IMPLEMENTED; + } + + int dst_data_size = 0; + if (_keep_top_k > 0) + dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float); + else if (_top_k > 0) + dst_data_size = N * _top_k * _num_classes * DETECTION_SIZE * sizeof(float); + else + dst_data_size = N * _num_classes * _num_priors * DETECTION_SIZE * sizeof(float); + + if (dst_data_size > getChildEdgesAtPort(0)[0]->getBlob()->byteSize()) { + IE_THROW() << OUT_OF_BOUNDS; + } + memset(dst_data, 0, dst_data_size); + + int count = 0; + for (int n = 0; n < N; ++n) { + const float *pconf = reordered_conf_data + n * _num_priors 
* _num_classes; + const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes; + const int *pindices = indices_data + n*_num_classes*_num_priors; + + for (int c = 0; c < _num_classes; ++c) { + for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { + int idx = pindices[c*_num_priors + i]; + + dst_data[count * DETECTION_SIZE + 0] = static_cast(n); + dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); + dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; + + float xmin = _share_location ? pboxes[idx*4 + 0] : + pboxes[c*4*_num_priors + idx*4 + 0]; + float ymin = _share_location ? pboxes[idx*4 + 1] : + pboxes[c*4*_num_priors + idx*4 + 1]; + float xmax = _share_location ? pboxes[idx*4 + 2] : + pboxes[c*4*_num_priors + idx*4 + 2]; + float ymax = _share_location ? pboxes[idx*4 + 3] : + pboxes[c*4*_num_priors + idx*4 + 3]; + + if (_clip_after_nms) { + xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); + ymin = (std::max)(0.0f, (std::min)(1.0f, ymin)); + xmax = (std::max)(0.0f, (std::min)(1.0f, xmax)); + ymax = (std::max)(0.0f, (std::min)(1.0f, ymax)); + } + + dst_data[count * DETECTION_SIZE + 3] = xmin; + dst_data[count * DETECTION_SIZE + 4] = ymin; + dst_data[count * DETECTION_SIZE + 5] = xmax; + dst_data[count * DETECTION_SIZE + 6] = ymax; + + ++count; + } + } + } + + if (count < num_results) { + // marker at end of boxes list + dst_data[count * DETECTION_SIZE + 0] = -1; + } +} + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2) { + float xmin1 = decoded_bbox[idx1*4 + 0]; + float ymin1 = 
decoded_bbox[idx1*4 + 1]; + float xmax1 = decoded_bbox[idx1*4 + 2]; + float ymax1 = decoded_bbox[idx1*4 + 3]; + + float xmin2 = decoded_bbox[idx2*4 + 0]; + float ymin2 = decoded_bbox[idx2*4 + 1]; + float xmax2 = decoded_bbox[idx2*4 + 2]; + float ymax2 = decoded_bbox[idx2*4 + 3]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; + } + + float intersect_xmin = (std::max)(xmin1, xmin2); + float intersect_ymin = (std::max)(ymin1, ymin2); + float intersect_xmax = (std::min)(xmax1, xmax2); + float intersect_ymax = (std::min)(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin; + float intersect_height = intersect_ymax - intersect_ymin; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + +void MKLDNNDetectionOutputNode::decodeBBoxes(const float *prior_data, + const float *loc_data, + const float *variance_data, + float *decoded_bboxes, + float *decoded_bbox_sizes, + int* num_priors_actual, + int n, + const int& offs, + const int& pr_size, + bool decodeType) { + num_priors_actual[n] = _num_priors; + if (!_normalized && decodeType) { + int num = 0; + for (; num < _num_priors; ++num) { + float batch_id = prior_data[num * pr_size + 0]; + if (batch_id == -1.f) { + num_priors_actual[n] = num; + break; + } + } + } + parallel_for(num_priors_actual[n], [&](int p) { + float new_xmin = 0.0f; + float new_ymin = 0.0f; + float new_xmax = 0.0f; + float new_ymax = 0.0f; + + float prior_xmin = prior_data[p*pr_size + 0 + offs]; + float prior_ymin = prior_data[p*pr_size + 1 + offs]; + float prior_xmax = prior_data[p*pr_size + 2 + offs]; + float prior_ymax = prior_data[p*pr_size + 3 + offs]; + + float loc_xmin = loc_data[4*p*_num_loc_classes + 0]; + float loc_ymin = 
loc_data[4*p*_num_loc_classes + 1]; + float loc_xmax = loc_data[4*p*_num_loc_classes + 2]; + float loc_ymax = loc_data[4*p*_num_loc_classes + 3]; + + if (!_normalized) { + prior_xmin /= _image_width; + prior_ymin /= _image_height; + prior_xmax /= _image_width; + prior_ymax /= _image_height; + } + + if (_code_type == CodeType::CORNER) { + if (_variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the offset predictions. + new_xmin = prior_xmin + loc_xmin; + new_ymin = prior_ymin + loc_ymin; + new_xmax = prior_xmax + loc_xmax; + new_ymax = prior_ymax + loc_ymax; + } else { + new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin; + new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin; + new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax; + new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax; + } + } else if (_code_type == CodeType::CENTER_SIZE) { + float prior_width = prior_xmax - prior_xmin; + float prior_height = prior_ymax - prior_ymin; + float prior_center_x = (prior_xmin + prior_xmax) / 2.0f; + float prior_center_y = (prior_ymin + prior_ymax) / 2.0f; + + float decode_bbox_center_x, decode_bbox_center_y; + float decode_bbox_width, decode_bbox_height; + + if (_variance_encoded_in_target) { + // variance is encoded in target, we simply need to restore the offset predictions. + decode_bbox_center_x = loc_xmin * prior_width + prior_center_x; + decode_bbox_center_y = loc_ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(loc_xmax) * prior_width; + decode_bbox_height = std::exp(loc_ymax) * prior_height; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x; + decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width; + decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height; + } + + new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; + new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; + new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; + new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; + } + + if (_clip_before_nms) { + new_xmin = (std::max)(0.0f, (std::min)(1.0f, new_xmin)); + new_ymin = (std::max)(0.0f, (std::min)(1.0f, new_ymin)); + new_xmax = (std::max)(0.0f, (std::min)(1.0f, new_xmax)); + new_ymax = (std::max)(0.0f, (std::min)(1.0f, new_ymax)); + } + + decoded_bboxes[p*4 + 0] = new_xmin; + decoded_bboxes[p*4 + 1] = new_ymin; + decoded_bboxes[p*4 + 2] = new_xmax; + decoded_bboxes[p*4 + 3] = new_ymax; + + decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin); + }); +} + +void MKLDNNDetectionOutputNode::nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + int num_priors_actual) { + int count = 0; + for (int i = 0; i < num_priors_actual; ++i) { + if (conf_data[i] > _confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > _nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } +} + +void MKLDNNDetectionOutputNode::nms_mx(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int* detections, + int num_priors_actual) { + int count = 0; + for (int i = 0; i < num_priors_actual; ++i) { + float conf = -1; + int id = 0; + for (int c = 1; c < _num_classes; ++c) { + float temp = conf_data[c*_num_priors + i]; + if (temp > conf) { + conf = temp; + id = c; + } + } + + if (id > 0 && conf >= _confidence_threshold) { + indices[count++] = id*_num_priors + i; + } + } + + int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + const int cls = idx/_num_priors; + const int prior = idx%_num_priors; + + int &ndetection = detections[cls]; + int *pindices = indices + cls*_num_priors; + + bool keep = true; + for (int k = 0; k < ndetection; ++k) { + const int kept_idx = pindices[k]; + float overlap = 0.0f; + if (_share_location) { + overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx); + } else { + overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx); + } + if (overlap > _nms_threshold) { + keep = false; + break; + } + } + if (keep) { + pindices[ndetection++] = prior; + } + } +} + +bool MKLDNNDetectionOutputNode::created() const { + return getType() == DetectionOutput; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNDetectionOutputNode, DetectionOutput) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h new file mode 100644 index 00000000000000..dbf9bde760907c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNDetectionOutputNode : public MKLDNNNode { +public: + MKLDNNDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + 
+private: + const int idx_location = 0; + const int idx_confidence = 1; + const int idx_priors = 2; + const int idx_arm_confidence = 3; + const int idx_arm_location = 4; + + int _num_classes = 0; + int _background_label_id = 0; + int _top_k = 0; + int _variance_encoded_in_target = 0; + int _keep_top_k = 0; + int _code_type = 0; + + bool _share_location = false; + bool _clip_before_nms = false; // clip bounding boxes before nms step + bool _clip_after_nms = false; // clip bounding boxes after nms step + bool _decrease_label_id = false; + + bool with_add_box_pred = false; + + int _image_width = 0; + int _image_height = 0; + int _prior_size = 4; + bool _normalized = true; + int _offset = 0; + + float _nms_threshold = 0.0f; + float _confidence_threshold = 0.0f; + float _objectness_score = 0.0f; + + int _num = 0; + int _num_loc_classes = 0; + int _num_priors = 0; + bool _priors_batches = false; + + enum CodeType { + CORNER = 1, + CENTER_SIZE = 2, + }; + + void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data, + float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, + bool decodeType = true); // after ARM = false + + void nms_cf(const float *conf_data, const float *bboxes, const float *sizes, + int *buffer, int *indices, int &detections, int num_priors_actual); + + void nms_mx(const float *conf_data, const float *bboxes, const float *sizes, + int *buffer, int *indices, int *detections, int num_priors_actual); + + std::vector _decoded_bboxes; + std::vector _buffer; + std::vector _indices; + std::vector _detections_count; + std::vector _reordered_conf; + std::vector _bbox_sizes; + std::vector _num_priors_actual; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 6b565370917db7..34e95d45ae06e8 100644 --- 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -124,4 +124,3 @@ class MKLDNNEltwiseNode : public MKLDNNNode { }; } // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp index c8810e4444b2a5..f59b69b023d99c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingBagOffsetSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagOffsetSumOp = ngraph::as_type_ptr(op); + const auto embBagOffsetSumOp = ngraph::as_type_ptr(op); if (!embBagOffsetSumOp) { errorMessage = "Node is not an instance of the EmbeddingBagOffsetsSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp index 4d1b808b502fb5..3318e1089faeed 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingBagPackedSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagPackedSumOp = ngraph::as_type_ptr(op); + const auto embBagPackedSumOp = ngraph::as_type_ptr(op); if (!embBagPackedSumOp) { errorMessage = "Node is not an instance of the EmbeddingBagPackedSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp index 798feecf7bd062..82eae04dcc2193 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingSegmentsSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagSegSumOp = ngraph::as_type_ptr(op); + const auto embBagSegSumOp = ngraph::as_type_ptr(op); if (!embBagSegSumOp) { errorMessage = "Node is not an instance of the EmbeddingSegmentsSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp new file mode 100644 index 00000000000000..fe2362003f377a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp @@ -0,0 +1,369 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_experimental_detectron_detection_output_node.h" + + +struct Indexer { + const std::vector dims_; + int total_{1}; + + explicit Indexer(const std::vector& dims) : dims_(dims) { + total_ = 1; + for (size_t i = 0; i < dims_.size(); ++i) { + total_ *= dims_[i]; + } + } + + int operator()(const std::vector& idx) const { + int flat_idx = 0; + assert(idx.size() == dims_.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + assert(0 <= idx[i] && idx[i] < dims_[i]); + flat_idx = flat_idx * dims_[i] + idx[i]; + } + assert(flat_idx < total_); + return flat_idx; + } +}; + + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +static +void refine_boxes(const float* boxes, const 
float* deltas, const float* weights, const float* scores, + float* refined_boxes, float* refined_boxes_areas, float* refined_scores, + const int rois_num, const int classes_num, + const float img_H, const float img_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer box_idx({rois_num, 4}); + Indexer delta_idx({rois_num, classes_num, 4}); + Indexer score_idx({rois_num, classes_num}); + + Indexer refined_box_idx({classes_num, rois_num, 4}); + Indexer refined_score_idx({classes_num, rois_num}); + + for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { + float x0 = boxes[box_idx({roi_idx, 0})]; + float y0 = boxes[box_idx({roi_idx, 1})]; + float x1 = boxes[box_idx({roi_idx, 2})]; + float y1 = boxes[box_idx({roi_idx, 3})]; + + if (x1 - x0 <= 0 || y1 - y0 <= 0) { + continue; + } + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + for (int class_idx = 1; class_idx < classes_num; ++class_idx) { + const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; + const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; + const float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; + const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp((std::min)(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp((std::min)(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + float x0_new = pred_ctr_x - 0.5f * pred_w; + float y0_new = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + float x1_new = pred_ctr_x + 0.5f * pred_w - 
coordinates_offset; + float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0_new = std::max(0.0f, x0_new); + y0_new = std::max(0.0f, y0_new); + x1_new = std::max(0.0f, x1_new); + y1_new = std::max(0.0f, y1_new); + + // recompute new width & height + const float box_w = x1_new - x0_new + coordinates_offset; + const float box_h = y1_new - y0_new + coordinates_offset; + + refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; + + refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; + + refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; + } + } +} + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2, + const float coordinates_offset = 1) { + float xmin1 = decoded_bbox[idx1 * 4 + 0]; + float ymin1 = decoded_bbox[idx1 * 4 + 1]; + float xmax1 = decoded_bbox[idx1 * 4 + 2]; + float ymax1 = decoded_bbox[idx1 * 4 + 3]; + + float xmin2 = decoded_bbox[idx2 * 4 + 0]; + float ymin2 = decoded_bbox[idx2 * 4 + 1]; + float ymax2 = decoded_bbox[idx2 * 4 + 3]; + float xmax2 = decoded_bbox[idx2 * 4 + 2]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; 
+ } + + float intersect_xmin = (std::max)(xmin1, xmin2); + float intersect_ymin = (std::max)(ymin1, ymin2); + float intersect_xmax = (std::min)(xmax1, xmax2); + float intersect_ymax = (std::min)(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; + float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + + +static void nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + const int boxes_num, + const int pre_nms_topn, + const int post_nms_topn, + const float confidence_threshold, + const float nms_threshold) { + int count = 0; + for (int i = 0; i < boxes_num; ++i) { + if (conf_data[i] > confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (pre_nms_topn == -1 ? count : (std::min)(pre_nms_topn, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + detections = 0; + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } + + detections = (post_nms_topn == -1 ? 
detections : (std::min)(post_nms_topn, detections)); +} + +bool MKLDNNExperimentalDetectronDetectionOutputNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto doOp = ngraph::as_type_ptr(op); + if (!doOp) { + errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronDetectionOutputNode::MKLDNNExperimentalDetectronDetectionOutputNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + auto doOp = ngraph::as_type_ptr(op); + auto attributes = doOp->get_attrs(); + + score_threshold_ = attributes.score_threshold; + nms_threshold_ = attributes.nms_threshold; + max_delta_log_wh_ = attributes.max_delta_log_wh; + classes_num_ = attributes.num_classes; + max_detections_per_class_ = attributes.post_nms_count; + max_detections_per_image_ = attributes.max_detections_per_image; + class_agnostic_box_regression_ = attributes.class_agnostic_box_regression; + deltas_weights_ = attributes.deltas_weights; +} + +void MKLDNNExperimentalDetectronDetectionOutputNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronDetectionOutputNode::execute(mkldnn::stream strm) { + const int 
rois_num = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + assert(classes_num_ == static_cast(getParentEdgeAt(INPUT_SCORES)->getDims()[1])); + assert(4 * classes_num_ == static_cast(getParentEdgeAt(INPUT_DELTAS)->getDims()[1])); + + const auto* boxes = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + const auto* deltas = reinterpret_cast(getParentEdgeAt(INPUT_DELTAS)->getMemoryPtr()->GetPtr()); + const auto* scores = reinterpret_cast(getParentEdgeAt(INPUT_SCORES)->getMemoryPtr()->GetPtr()); + const auto* im_info = reinterpret_cast(getParentEdgeAt(INPUT_IM_INFO)->getMemoryPtr()->GetPtr()); + + auto* output_boxes = reinterpret_cast(getChildEdgesAtPort(OUTPUT_BOXES)[0]->getMemoryPtr()->GetPtr()); + auto* output_scores = reinterpret_cast(getChildEdgesAtPort(OUTPUT_SCORES)[0]->getMemoryPtr()->GetPtr()); + auto* output_classes = reinterpret_cast(getChildEdgesAtPort(OUTPUT_CLASSES)[0]->getMemoryPtr()->GetPtr()); + + const float img_H = im_info[0]; + const float img_W = im_info[1]; + + // Apply deltas. + std::vector refined_boxes(classes_num_ * rois_num * 4, 0); + std::vector refined_scores(classes_num_ * rois_num, 0); + std::vector refined_boxes_areas(classes_num_ * rois_num, 0); + Indexer refined_box_idx({classes_num_, rois_num, 4}); + Indexer refined_score_idx({classes_num_, rois_num}); + + refine_boxes(boxes, deltas, &deltas_weights_[0], scores, + &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], + rois_num, classes_num_, + img_H, img_W, + max_delta_log_wh_, + 1.0f); + + // Apply NMS class-wise. 
+ std::vector buffer(rois_num, 0); + std::vector indices(classes_num_ * rois_num, 0); + std::vector detections_per_class(classes_num_, 0); + int total_detections_num = 0; + + for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { + nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], + &refined_boxes[refined_box_idx({class_idx, 0, 0})], + &refined_boxes_areas[refined_score_idx({class_idx, 0})], + &buffer[0], + &indices[total_detections_num], + detections_per_class[class_idx], + rois_num, + -1, + max_detections_per_class_, + score_threshold_, + nms_threshold_); + total_detections_num += detections_per_class[class_idx]; + } + + // Leave only max_detections_per_image_ detections. + // confidence, + std::vector>> conf_index_class_map; + + int indices_offset = 0; + for (int c = 0; c < classes_num_; ++c) { + int n = detections_per_class[c]; + for (int i = 0; i < n; ++i) { + int idx = indices[indices_offset + i]; + float score = refined_scores[refined_score_idx({c, idx})]; + conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); + } + indices_offset += n; + } + + assert(max_detections_per_image_ > 0); + if (total_detections_num > max_detections_per_image_) { + std::partial_sort(conf_index_class_map.begin(), + conf_index_class_map.begin() + max_detections_per_image_, + conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(max_detections_per_image_); + total_detections_num = max_detections_per_image_; + } + + // Fill outputs. 
+ memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(output_boxes[0])); + memset(output_scores, 0, max_detections_per_image_ * sizeof(output_scores[0])); + memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); + + int i = 0; + for (const auto & detection : conf_index_class_map) { + float score = detection.first; + int cls = detection.second.first; + int idx = detection.second.second; + output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; + output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; + output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; + output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; + output_scores[i] = score; + output_classes[i] = cls; + ++i; + } +} + +bool MKLDNNExperimentalDetectronDetectionOutputNode::created() const { + return getType() == ExperimentalDetectronDetectionOutput; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronDetectionOutputNode, ExperimentalDetectronDetectionOutput) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h new file mode 100644 index 00000000000000..2df28ce5c4983b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronDetectionOutputNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) 
override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int INPUT_ROIS {0}; + const int INPUT_DELTAS {1}; + const int INPUT_SCORES {2}; + const int INPUT_IM_INFO {3}; + + const int OUTPUT_BOXES {0}; + const int OUTPUT_CLASSES {1}; + const int OUTPUT_SCORES {2}; + + float score_threshold_; + float nms_threshold_; + float max_delta_log_wh_; + int classes_num_; + int max_detections_per_class_; + int max_detections_per_image_; + bool class_agnostic_box_regression_; + std::vector deltas_weights_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp new file mode 100644 index 00000000000000..255f8443765660 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp @@ -0,0 +1,429 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_AVX2) +#include +#endif + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_generate_proposals_single_image_node.h" + +namespace { +struct Indexer4d { + int dim3_; + int dim23_; + int dim123_; + + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): + dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + (void)dim0; + } + + int operator()(int i, int j, int k, int n) const { + return i * dim123_ + j * dim23_ + k * dim3_ + n; + } +}; +} // namespace + + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +static +void refine_anchors(const float* deltas, const float* scores, 
const float* anchors, + float* proposals, const int anchors_num, const int bottom_H, + const int bottom_W, const float img_H, const float img_W, + const float min_box_H, const float min_box_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); + Indexer4d score_idx(anchors_num, 1, bottom_H, bottom_W); + Indexer4d proposal_idx(bottom_H, bottom_W, anchors_num, 5); + Indexer4d anchor_idx(bottom_H, bottom_W, anchors_num, 4); + + parallel_for2d(bottom_H, bottom_W, [&](int h, int w) { + for (int anchor = 0; anchor < anchors_num; ++anchor) { + int a_idx = anchor_idx(h, w, anchor, 0); + float x0 = anchors[a_idx + 0]; + float y0 = anchors[a_idx + 1]; + float x1 = anchors[a_idx + 2]; + float y1 = anchors[a_idx + 3]; + + const float dx = deltas[delta_idx(anchor, 0, h, w)]; + const float dy = deltas[delta_idx(anchor, 1, h, w)]; + const float d_log_w = deltas[delta_idx(anchor, 2, h, w)]; + const float d_log_h = deltas[delta_idx(anchor, 3, h, w)]; + + const float score = scores[score_idx(anchor, 0, h, w)]; + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + x0 = pred_ctr_x - 0.5f * pred_w; + y0 = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + x1 = pred_ctr_x + 0.5f * pred_w - coordinates_offset; + y1 = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0 
= std::max(0.0f, std::min(x0, img_W - coordinates_offset)); + y0 = std::max(0.0f, std::min(y0, img_H - coordinates_offset)); + x1 = std::max(0.0f, std::min(x1, img_W - coordinates_offset)); + y1 = std::max(0.0f, std::min(y1, img_H - coordinates_offset)); + + // recompute new width & height + const float box_w = x1 - x0 + coordinates_offset; + const float box_h = y1 - y0 + coordinates_offset; + + int p_idx = proposal_idx(h, w, anchor, 0); + proposals[p_idx + 0] = x0; + proposals[p_idx + 1] = y0; + proposals[p_idx + 2] = x1; + proposals[p_idx + 3] = y1; + proposals[p_idx + 4] = (min_box_W <= box_w) * (min_box_H <= box_h) * score; + } + }); +} + +static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) { + parallel_for(pre_nms_topn, [&](size_t i) { + unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0]; + unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1]; + unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2]; + unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3]; + unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4]; + }); +} + +static +void nms_cpu(const int num_boxes, int is_dead[], + const float* boxes, int index_out[], int* const num_out, + const int base_index, const float nms_thresh, const int max_num_out, + float coordinates_offset) { + const int num_proposals = num_boxes; + int count = 0; + + const float* x0 = boxes + 0 * num_proposals; + const float* y0 = boxes + 1 * num_proposals; + const float* x1 = boxes + 2 * num_proposals; + const float* y1 = boxes + 3 * num_proposals; + + std::memset(is_dead, 0, num_boxes * sizeof(int)); + +#if defined(HAVE_AVX2) + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256i vc_ione = _mm256_set1_epi32(1); + __m256 vc_zero = _mm256_set1_ps(0.0f); + + __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); +#endif + + for (int box = 0; box < num_boxes; ++box) { + if (is_dead[box]) + continue; + + index_out[count++] = base_index + box; + if (count == 
max_num_out) + break; + + int tail = box + 1; + +#if defined(HAVE_AVX2) + __m256 vx0i = _mm256_set1_ps(x0[box]); + __m256 vy0i = _mm256_set1_ps(y0[box]); + __m256 vx1i = _mm256_set1_ps(x1[box]); + __m256 vy1i = _mm256_set1_ps(y1[box]); + + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + + for (; tail <= num_boxes - 8; tail += 8) { + __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); + + __m256 vx0j = _mm256_loadu_ps(x0 + tail); + __m256 vy0j = _mm256_loadu_ps(y0 + tail); + __m256 vx1j = _mm256_loadu_ps(x1 + tail); + __m256 vy1j = _mm256_loadu_ps(y1 + tail); + + __m256 vx0 = _mm256_max_ps(vx0i, vx0j); + __m256 vy0 = _mm256_max_ps(vy0i, vy0j); + __m256 vx1 = _mm256_min_ps(vx1i, vx1j); + __m256 vy1 = _mm256_min_ps(vy1i, vy1j); + + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); + __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); + + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + + __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); + __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); + + __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS); + __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS); + __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS); + __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS); + __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS); + + vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1); + vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2); + + 
_mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4))); + } +#endif + + for (; tail < num_boxes; ++tail) { + float res = 0.0f; + + const float x0i = x0[box]; + const float y0i = y0[box]; + const float x1i = x1[box]; + const float y1i = y1[box]; + + const float x0j = x0[tail]; + const float y0j = y0[tail]; + const float x1j = x1[tail]; + const float y1j = y1[tail]; + + if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) { + // overlapped region (= box) + const float x0 = std::max(x0i, x0j); + const float y0 = std::max(y0i, y0j); + const float x1 = std::min(x1i, x1j); + const float y1 = std::min(y1i, y1j); + + // intersection area + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; + + // area of A, B + const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); + const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset); + + // IoU + res = area / (A_area + B_area - area); + } + + if (nms_thresh < res) + is_dead[tail] = 1; + } + } + + *num_out = count; +} + + +static +void fill_output_blobs(const float* proposals, const int* roi_indices, + float* rois, float* scores, + const int num_proposals, const int num_rois, const int post_nms_topn) { + const float *src_x0 = proposals + 0 * num_proposals; + const float *src_y0 = proposals + 1 * num_proposals; + const float *src_x1 = proposals + 2 * num_proposals; + const float *src_y1 = proposals + 3 * num_proposals; + const float *src_score = proposals + 4 * num_proposals; + + parallel_for(num_rois, [&](size_t i) { + int index = roi_indices[i]; + rois[i * 4 + 0] = src_x0[index]; + rois[i * 4 + 1] = src_y0[index]; + rois[i * 4 + 2] = src_x1[index]; + rois[i * 4 + 3] = src_y1[index]; + scores[i] = src_score[index]; + }); + + if (num_rois < post_nms_topn) { + for (int i = 4 * num_rois; i < 4 * post_nms_topn; 
i++) { + rois[i] = 0.f; + } + for (int i = num_rois; i < post_nms_topn; i++) { + scores[i] = 0.f; + } + } +} + +bool MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::isSupportedOperation + (const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto proposalOp = ngraph::as_type_ptr(op); + if (!proposalOp) { + errorMessage = "Node is not an instance of the Proposal from the operations set v0."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + auto proposalOp = ngraph::as_type_ptr(op); + auto proposalAttrs = proposalOp->get_attrs(); + + min_size_ = proposalAttrs.min_size; + nms_thresh_ = proposalAttrs.nms_threshold; + pre_nms_topn_ = proposalAttrs.pre_nms_count; + post_nms_topn_ = proposalAttrs.post_nms_count; + + coordinates_offset = 0.0f; + + roi_indices_.resize(post_nms_topn_); +} + +void MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::execute(mkldnn::stream strm) { + try { + if (inDims.size() != 4 || outDims.size() != 2) { + IE_THROW() << "Incorrect number of input or output edges!"; + } + + size_t 
anchor_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_ANCHORS)->getDims().ToSizeVector().size(); i++) { + anchor_dims_size *= getParentEdgeAt(INPUT_ANCHORS)->getDims().ToSizeVector()[i]; + } + + size_t deltas_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_DELTAS)->getDims().ToSizeVector().size(); i++) { + deltas_dims_size *= getParentEdgeAt(INPUT_DELTAS)->getDims().ToSizeVector()[i]; + } + if (anchor_dims_size != deltas_dims_size) + IE_THROW() << "'Anchors' blob size for ONNXProposal is incompatible with 'deltas' blob size!"; + + size_t score_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_SCORES)->getDims().ToSizeVector().size(); i++) { + score_dims_size *= getParentEdgeAt(INPUT_SCORES)->getDims().ToSizeVector()[i]; + } + if (deltas_dims_size != (4 * score_dims_size)) + IE_THROW() << "'Deltas' blob size for ONNXProposal is incompatible with 'scores' blob size!"; + + // Prepare memory + const float *p_deltas_item = reinterpret_cast(getParentEdgeAt(INPUT_DELTAS)->getMemoryPtr()->GetPtr()); + const float *p_scores_item = reinterpret_cast(getParentEdgeAt(INPUT_SCORES)->getMemoryPtr()->GetPtr()); + const float *p_anchors_item = reinterpret_cast(getParentEdgeAt(INPUT_ANCHORS)->getMemoryPtr()->GetPtr()); + const float *p_img_info_cpu = reinterpret_cast(getParentEdgeAt(INPUT_IM_INFO)->getMemoryPtr()->GetPtr()); + + float *p_roi_item = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + float *p_roi_score_item = reinterpret_cast(getChildEdgesAtPort(OUTPUT_SCORES)[0]->getMemoryPtr()->GetPtr()); + + const int anchors_num = getParentEdgeAt(INPUT_SCORES)->getDims()[0]; + + // bottom shape: (num_anchors) x H x W + const int bottom_H = getParentEdgeAt(INPUT_DELTAS)->getDims()[1]; + const int bottom_W = getParentEdgeAt(INPUT_DELTAS)->getDims()[2]; + + // input image height & width + const float img_H = p_img_info_cpu[0]; + const float img_W = p_img_info_cpu[1]; + + // scale factor for height & width + + // 
minimum box width & height + const float min_box_H = min_size_; + const float min_box_W = min_size_; + + // number of all proposals = num_anchors * H * W + const int num_proposals = anchors_num * bottom_H * bottom_W; + + // number of top-n proposals before NMS + const int pre_nms_topn = std::min(num_proposals, pre_nms_topn_); + + // number of final RoIs + int num_rois = 0; + + // enumerate all proposals + // num_proposals = num_anchors * H * W + // (x1, y1, x2, y2, score) for each proposal + // NOTE: for bottom, only foreground scores are passed + struct ProposalBox { + float x0; + float y0; + float x1; + float y1; + float score; + }; + std::vector proposals_(num_proposals); + std::vector unpacked_boxes(5 * pre_nms_topn); + std::vector is_dead(pre_nms_topn); + + // Execute + int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0]; + for (int n = 0; n < batch_size; ++n) { + refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, + reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, + bottom_W, img_H, img_W, + min_box_H, min_box_W, + static_cast(log(1000. 
/ 16.)), + 1.0f); + std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), + [](const ProposalBox &struct1, const ProposalBox &struct2) { + return (struct1.score > struct2.score); + }); + + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); + nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, + nms_thresh_, post_nms_topn_, coordinates_offset); + fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item, + pre_nms_topn, num_rois, post_nms_topn_); + } + } catch (const std::exception &e) { + std::string errorMsg = e.what(); + IE_THROW() << errorMsg; + } +} + +bool MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::created() const { + return getType() == ExperimentalDetectronGenerateProposalsSingleImage; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode, ExperimentalDetectronGenerateProposalsSingleImage) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h new file mode 100644 index 00000000000000..b2f5f0bcd89fe1 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode(const std::shared_ptr& op, + const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void 
execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_IM_INFO {0}; + const int INPUT_ANCHORS {1}; + const int INPUT_DELTAS {2}; + const int INPUT_SCORES {3}; + const int OUTPUT_ROIS {0}; + const int OUTPUT_SCORES {1}; + + float min_size_; + int pre_nms_topn_; + int post_nms_topn_; + float nms_thresh_; + float coordinates_offset; + + std::vector roi_indices_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp new file mode 100644 index 00000000000000..b5d073a0b3552e --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_experimental_detectron_priorgridgenerator_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNExperimentalDetectronPriorGridGeneratorNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto priorGridGen = std::dynamic_pointer_cast(op); + if (!priorGridGen) { + errorMessage = "Only opset6 ExperimentalDetectronPriorGridGenerator operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNExperimentalDetectronPriorGridGeneratorNode::MKLDNNExperimentalDetectronPriorGridGeneratorNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ExperimentalDetectronPriorGridGenerator layer with name '" + op->get_friendly_name() + "'"; + const auto priorGridGen = std::dynamic_pointer_cast(op); + if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + if (op->get_input_shape(INPUT_PRIORS).size() != 2 || + op->get_input_shape(INPUT_FEATUREMAP).size() != 4 || + op->get_input_shape(INPUT_IMAGE).size() != 4) + IE_THROW() << errorPrefix << " has unsupported input shape"; + + const auto &attr = priorGridGen->get_attrs(); + grid_w_ = attr.w; + grid_h_ = attr.h; + stride_h_ = attr.stride_y; + stride_w_ = attr.stride_x; +} + +void MKLDNNExperimentalDetectronPriorGridGeneratorNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronPriorGridGeneratorNode::execute(mkldnn::stream strm) { + const int num_priors_ = getParentEdgeAt(INPUT_PRIORS)->getDims()[0]; + assert(getParentEdgeAt(INPUT_PRIORS)->getDims()[1] == 4); + + // Execute + const int layer_width = grid_w_ ? grid_w_ : getParentEdgeAt(INPUT_FEATUREMAP)->getDims()[3]; + const int layer_height = grid_h_ ? grid_h_ : getParentEdgeAt(INPUT_FEATUREMAP)->getDims()[2]; + const float step_w = stride_w_ ? 
stride_w_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getDims()[3]) / layer_width; + const float step_h = stride_h_ ? stride_h_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getDims()[2]) / layer_height; + + const auto *bottom_data_0 = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + auto *top_data_0 = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + + for (int h = 0; h < layer_height; ++h) { + for (int w = 0; w < layer_width; ++w) { + for (int s = 0; s < num_priors_; ++s) { + top_data_0[0] = bottom_data_0[4 * s + 0] + step_w * (w + 0.5f); + top_data_0[1] = bottom_data_0[4 * s + 1] + step_h * (h + 0.5f); + top_data_0[2] = bottom_data_0[4 * s + 2] + step_w * (w + 0.5f); + top_data_0[3] = bottom_data_0[4 * s + 3] + step_h * (h + 0.5f); + top_data_0 += 4; + } + } + } +} + +bool MKLDNNExperimentalDetectronPriorGridGeneratorNode::created() const { + return getType() == ExperimentalDetectronPriorGridGenerator; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronPriorGridGeneratorNode, ExperimentalDetectronPriorGridGenerator) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h new file mode 100644 index 00000000000000..9ef117f44e65f7 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronPriorGridGeneratorNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronPriorGridGeneratorNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() 
override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // priors, shape [n, 4] + // [feature_map], shape [b, c, h, w] + // [im_data], shape [b, 3, im_h, im_w] + // Outputs: + // priors_grid, shape [m, 4] + + const int INPUT_PRIORS {0}; + const int INPUT_FEATUREMAP {1}; + const int INPUT_IMAGE {2}; + + const int OUTPUT_ROIS {0}; + + int grid_w_; + int grid_h_; + float stride_w_; + float stride_h_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp new file mode 100644 index 00000000000000..94e7f033a95548 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp @@ -0,0 +1,413 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_roifeatureextractor_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for 
(int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc.at(pre_calc_index) = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high = 0; + int x_high = 0; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = static_cast(1) - ly, hx = static_cast(1) - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + const bool aligned, + T* top_data) { + int roi_cols = 4; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an 
element in the pooled output + parallel_for(n_rois, [&](size_t n) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = static_cast(offset_bottom_rois[0]); + offset_bottom_rois++; + } + + T offset = aligned ? (T)0.5 : (T)0.0; + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[1] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[3] * spatial_scale - offset; + + // Force malformed ROIs to be 1x1 + T roi_width = (std::max)(roi_end_w - roi_start_w, (T)1.); + T roi_height = (std::max)(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. 
= 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + }); +} + + +void redistribute_rois(const float* rois, int* level_ids, + const int num_rois, const int levels_num) { + const float canonical_scale = 224.0f; + const int canonical_level = 2; + + for (int i = 0; i < num_rois; ++i) { + const float x0 = rois[4 * i + 0]; + const float y0 = rois[4 * i + 1]; + const float x1 = rois[4 * i + 2]; + const float y1 = rois[4 * i + 3]; + + int target_level = levels_num; + float area = (x1 - x0) * (y1 - y0); + if (area > 0) { + area = std::sqrt(area) / canonical_scale; + area = std::log2(area + 1e-6f); + target_level = static_cast(std::floor(area + canonical_level)); + target_level = (std::max)(0, (std::min)(levels_num - 1, target_level)); + } + + level_ids[i] = 
target_level; + } +} + + +void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, + int* dst_mapping) { + std::iota(dst_mapping, dst_mapping + n, 0); + std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];}); + for (int i = 0; i < n; ++i) { + const int j = dst_mapping[i]; + assert(0 <= j && j < n); + cpu_memcpy(dst_data + i * step, src_data + j * step, sizeof(float) * step); + } +} + +void split_points(const std::vector& ids, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (size_t i = 0; i < ids.size(); ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); +} + + +void reorder_rois(const float *rois, const int* ids, int* mapping, const int rois_num, + float * reordered_rois, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (int i = 0; i < rois_num; ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); + + std::vector level_counter = rois_per_level; + + for (int i = 0; i < rois_num; ++i) { + const int level = ids[i]; + assert(level < levels_num); + const int j = level_counter[level]; + assert(0 <= j && j < rois_num); + reordered_rois[j * 4 + 0] = rois[i * 4 + 0]; + reordered_rois[j * 4 + 1] = rois[i * 4 + 1]; + reordered_rois[j * 4 + 2] = rois[i * 4 + 2]; + reordered_rois[j * 4 + 3] = rois[i * 4 + 3]; + level_counter[level]++; + } +} + +bool MKLDNNExperimentalDetectronROIFeatureExtractorNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + 
const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + if (!roiFeatureExtractor) { + errorMessage = "Only opset6 ExperimentalDetectronROIFeatureExtractor operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronROIFeatureExtractorNode::MKLDNNExperimentalDetectronROIFeatureExtractorNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + const auto &attr = roiFeatureExtractor->get_attrs(); + output_dim_ = attr.output_size; + pyramid_scales_ = attr.pyramid_scales; + sampling_ratio_ = attr.sampling_ratio; + aligned_ = attr.aligned; + pooled_height_ = output_dim_; + pooled_width_ = output_dim_; +} + +void MKLDNNExperimentalDetectronROIFeatureExtractorNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronROIFeatureExtractorNode::execute(mkldnn::stream strm) { + const int levels_num = inDims.size() - INPUT_FEATURES_START; + const int num_rois = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + const int channels_num = getParentEdgeAt(INPUT_FEATURES_START)->getDims()[1]; + const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num; + + auto *input_rois = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + auto *output_rois_features = 
reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROI_FEATURES)[0]->getMemoryPtr()->GetPtr()); + float *output_rois = nullptr; + if (OUTPUT_ROIS < outDims.size()) { + output_rois = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + } + + std::vector level_ids(num_rois, 0); + redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); + + std::vector reordered_rois(4 * num_rois, 0); + std::vector original_rois_mapping(num_rois, 0); + reord(input_rois, &level_ids[0], num_rois, 4, &reordered_rois[0], &original_rois_mapping[0]); + + std::vector rois_per_level; + split_points(level_ids, rois_per_level, levels_num + 1); + + std::vector output_rois_features_temp(feaxels_per_roi * num_rois, 0); + for (int i = 0; i < levels_num; ++i) { + const int level_rois_offset = rois_per_level[i]; + const int level_rois_num = rois_per_level[i + 1] - level_rois_offset; + if (level_rois_num > 0) { + auto *featuremap = reinterpret_cast(getParentEdgeAt(INPUT_FEATURES_START + i)->getMemoryPtr()->GetPtr()); + const int featuremap_height = getParentEdgeAt(INPUT_FEATURES_START + i)->getDims()[2]; + const int featuremap_width = getParentEdgeAt(INPUT_FEATURES_START + i)->getDims()[3]; + ROIAlignForward_cpu_kernel(feaxels_per_roi * level_rois_num, + featuremap, + 1.0f / pyramid_scales_[i], + channels_num, + featuremap_height, + featuremap_width, + pooled_height_, + pooled_width_, + sampling_ratio_, + &reordered_rois[4 * level_rois_offset], + aligned_, + &output_rois_features_temp[feaxels_per_roi * level_rois_offset]); + } + } + + std::vector dummy_mapping(num_rois, 0); + reord(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi, + output_rois_features, &dummy_mapping[0]); + if (output_rois != nullptr) { + cpu_memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float)); + } +} + +bool MKLDNNExperimentalDetectronROIFeatureExtractorNode::created() const { + return getType() == 
ExperimentalDetectronROIFeatureExtractor; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronROIFeatureExtractorNode, ExperimentalDetectronROIFeatureExtractor) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h new file mode 100644 index 00000000000000..bfcb9061f26fbe --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronROIFeatureExtractorNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronROIFeatureExtractorNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int INPUT_ROIS {0}; + const int INPUT_FEATURES_START {1}; + + const int OUTPUT_ROI_FEATURES {0}; + const int OUTPUT_ROIS {1}; + + int output_dim_ = 0; + int pooled_height_ = 0; + int pooled_width_ = 0; + std::vector pyramid_scales_; + int sampling_ratio_ = 0; + bool aligned_ = false; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp new file mode 100644 index 00000000000000..d543658f78e724 --- /dev/null +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_topkrois_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNExperimentalDetectronTopKROIsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto topKROI = std::dynamic_pointer_cast(op); + if (!topKROI) { + errorMessage = "Only opset6 ExperimentalDetectronTopKROIs operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronTopKROIsNode::MKLDNNExperimentalDetectronTopKROIsNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ExperimentalDetectronTopKROIs layer with name '" + op->get_friendly_name() + "'"; + const auto topKROI = std::dynamic_pointer_cast(op); + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + if (op->get_input_shape(INPUT_ROIS).size() != 2 || op->get_input_shape(INPUT_PROBS).size() != 1) + IE_THROW() << errorPrefix << " has nsupported input shape"; + + max_rois_num_ = topKROI->get_max_rois(); +} + +void MKLDNNExperimentalDetectronTopKROIsNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + 
impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronTopKROIsNode::execute(mkldnn::stream strm) { + const int input_rois_num = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + const int top_rois_num = (std::min)(max_rois_num_, input_rois_num); + + auto *input_rois = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + auto *input_probs = reinterpret_cast(getParentEdgeAt(INPUT_PROBS)->getMemoryPtr()->GetPtr()); + auto *output_rois = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + + std::vector idx(input_rois_num); + iota(idx.begin(), idx.end(), 0); + // FIXME. partial_sort is enough here. + sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) {return input_probs[i1] > input_probs[i2];}); + + for (int i = 0; i < top_rois_num; ++i) { + cpu_memcpy(output_rois + 4 * i, input_rois + 4 * idx[i], 4 * sizeof(float)); + } +} + +bool MKLDNNExperimentalDetectronTopKROIsNode::created() const { + return getType() == ExperimentalDetectronTopKROIs; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronTopKROIsNode, ExperimentalDetectronTopKROIs) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h new file mode 100644 index 00000000000000..76171de71e473c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronTopKROIsNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronTopKROIsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; 
+ void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_ROIS {0}; + const int INPUT_PROBS {1}; + + const int OUTPUT_ROIS {0}; + int max_rois_num_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp similarity index 66% rename from inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp rename to inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp index b0f0aa5d327ed8..d4c5d3037962b0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp @@ -1,22 +1,22 @@ -// Copyright (C) 2020-2021 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "extract_image_patches.hpp" -#include "caseless.hpp" -#include "ie_parallel.hpp" -#include "list.hpp" -#include +#include "base.hpp" + #include #include #include + #include +#include "ie_parallel.hpp" +#include "mkldnn_extract_image_patches_node.h" +#include "list.hpp" +#include +#include "caseless.hpp" using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { +using namespace InferenceEngine; using details::CaselessEq; @@ -266,11 +266,11 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k align(64); L(gather_index_table); for (int32_t i = 0; i < vlen / sizeof(int32_t); i++) - dd(i * jpp.SW * jpp.dtype_size); + dd(i * jpp.SW * jpp.dtype_size); } }; -bool ExtractImagePatchesImpl::isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept { +bool MKLDNNExtractImagePatchesNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { const auto extImgPatcher = std::dynamic_pointer_cast(op); if (!extImgPatcher) { @@ -292,140 +292,141 @@ bool ExtractImagePatchesImpl::isSupportedOperation(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } +MKLDNNExtractImagePatchesNode::MKLDNNExtractImagePatchesNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } - errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' "; - const auto extImgPatcher = std::dynamic_pointer_cast(op); + errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' "; + const auto extImgPatcher = std::dynamic_pointer_cast(op); - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << "has incorrect number of input or output edges!" - << " Input: " << op->get_input_size() << "; Output: " << op->get_output_size(); - - if (op->get_input_shape(0).size() != 4) - IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size(); - - if (op->get_output_shape(0).size() != 4) - IE_THROW() << errorPrefix << "must have 4D output tensor. 
Actual: " << op->get_output_shape(0).size(); - - const auto precision = details::convertPrecision(op->get_input_element_type(0)); - if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) - IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name(); - - auto ksizes = extImgPatcher->get_sizes(); - auto strides = extImgPatcher->get_strides(); - auto rates = extImgPatcher->get_rates(); - if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) { - _auto_pad = ExtImgPatcherPadType::VALID; - } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) { - _auto_pad = ExtImgPatcherPadType::SAME_LOWER; - } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) { - _auto_pad = ExtImgPatcherPadType::SAME_UPPER; - } else { - IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad(); - } + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << "has incorrect number of input or output edges!" + << " Input: " << getOriginalInputsNumber() << "; Output: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(0).size() != 4) + IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size(); - if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2) - IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates."; - _ksizes.clear(); - _strides.clear(); - _rates.clear(); - for (const auto& x : ksizes) { - if (x < 0) - IE_THROW() << "Kernel sizes must be non-negative, got '" << x << "'."; - _ksizes.push_back(static_cast(x)); + if (op->get_output_shape(0).size() != 4) + IE_THROW() << errorPrefix << "must have 4D output tensor. 
Actual: " << op->get_output_shape(0).size(); + + auto ksizes = extImgPatcher->get_sizes(); + auto strides = extImgPatcher->get_strides(); + auto rates = extImgPatcher->get_rates(); + if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) { + _auto_pad = ExtImgPatcherPadType::VALID; + } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) { + _auto_pad = ExtImgPatcherPadType::SAME_LOWER; + } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) { + _auto_pad = ExtImgPatcherPadType::SAME_UPPER; + } else { + IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad(); + } + + if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2) + IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates."; + _ksizes.clear(); + _strides.clear(); + _rates.clear(); + for (const auto& x : ksizes) { + if (x < 0) + IE_THROW() << "Kernel sizes must be non-negative, got '" << x << "'."; + _ksizes.push_back(static_cast(x)); + } + for (const auto& x : strides) { + if (x < 0) + IE_THROW() << "Strides must be non-negative, got '" << x << "'."; + _strides.push_back(static_cast(x)); + } + for (const auto& x : rates) { + if (x < 0) + IE_THROW() << "Rates must be non-negative, got '" << x << "'."; + _rates.push_back(static_cast(x)); + } + + SizeVector in_dims = op->get_input_shape(0); + _pad_left = 0; + _pad_top = 0; + jit_extract_image_patches_params jpp; + jpp.need_padding = false; + if (_auto_pad != ExtImgPatcherPadType::VALID) { + const size_t iheight = in_dims[2]; + const size_t iwidth = in_dims[3]; + const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1); + const int64_t iwStep = _ksizes[1] + (_rates[1] - 1) * (_ksizes[1] - 1); + + int64_t PW = (std::ceil(1.f * iwidth/_strides[1]) - 1) * _strides[1] + iwStep - iwidth; + int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight; + + int64_t increment_sign = 
0; + if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) { + increment_sign = 1; + } else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) { + increment_sign = -1; } - for (const auto& x : strides) { - if (x < 0) - IE_THROW() << "Strides must be non-negative, got '" << x << "'."; - _strides.push_back(static_cast(x)); + + if ((PW > 0) && (PW < iwStep)) { + _pad_left = static_cast((PW + increment_sign * (PW % 2)) / 2); + jpp.need_padding = true; } - for (const auto& x : rates) { - if (x < 0) - IE_THROW() << "Rates must be non-negative, got '" << x << "'."; - _rates.push_back(static_cast(x)); + if ((PH > 0) && (PH < ihStep)) { + _pad_top = static_cast((PH + increment_sign * (PH % 2)) / 2); + jpp.need_padding = true; } + } - SizeVector in_dims = op->get_input_shape(0); - _pad_left = 0; - _pad_top = 0; - jit_extract_image_patches_params jpp; - jpp.need_padding = false; - if (_auto_pad != ExtImgPatcherPadType::VALID) { - const size_t iheight = in_dims[2]; - const size_t iwidth = in_dims[3]; - const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1); - const int64_t iwStep = _ksizes[1] + (_rates[1] - 1) * (_ksizes[1] - 1); - - int64_t PW = (std::ceil(1.f * iwidth/_strides[1]) - 1) * _strides[1] + iwStep - iwidth; - int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight; - - int64_t increment_sign = 0; - if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) { - increment_sign = 1; - } else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) { - increment_sign = -1; - } + jpp.IW = in_dims[3]; + SizeVector out_dims = op->get_output_shape(0); + jpp.OH = out_dims[2]; + jpp.OW = out_dims[3]; + jpp.KH = _ksizes[0]; + jpp.KW = _ksizes[1]; + jpp.SH = _strides[0]; + jpp.SW = _strides[1]; + jpp.dtype_size = getOriginalInputPrecisionAtPort(0).size(); + jpp.block_size = 1; + + if (mayiuse(x64::avx512_common)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new 
jit_extract_image_patches_kernel(jpp)); + } else if (mayiuse(x64::avx2)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } else if (mayiuse(x64::sse41)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } - if ((PW > 0) && (PW < iwStep)) { - _pad_left = static_cast((PW + increment_sign * (PW % 2)) / 2); - jpp.need_padding = true; - } - if ((PH > 0) && (PH < ihStep)) { - _pad_top = static_cast((PH + increment_sign * (PH % 2)) / 2); - jpp.need_padding = true; - } - } + if (extract_image_patches_kernel) + extract_image_patches_kernel->create_ker(); +} - jpp.IW = in_dims[3]; - SizeVector out_dims = op->get_output_shape(0); - jpp.OH = out_dims[2]; - jpp.OW = out_dims[3]; - jpp.KH = _ksizes[0]; - jpp.KW = _ksizes[1]; - jpp.SH = _strides[0]; - jpp.SW = _strides[1]; - jpp.dtype_size = precision.size(); - jpp.block_size = 1; - - if (mayiuse(x64::avx512_common)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } else if (mayiuse(x64::avx2)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } else if (mayiuse(x64::sse41)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } +void MKLDNNExtractImagePatchesNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; - if (extract_image_patches_kernel) - extract_image_patches_kernel->create_ker(); + precision = getOriginalInputPrecisionAtPort(0); + if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) + IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name(); - 
addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}}, + {{TensorDescCreatorTypes::ncsp, precision}}, + impl_desc_type::ref_any); } -StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept { - const char *src_data = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - char *dst_data = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const size_t dtype_size = inputs[0]->getTensorDesc().getPrecision().size(); +void MKLDNNExtractImagePatchesNode::execute(mkldnn::stream strm) { + const char *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + char *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + const size_t dtype_size = getOriginalInputPrecisionAtPort(0).size(); - const auto& inDims = inputs[0]->getTensorDesc().getDims(); + const auto& inDims = getParentEdgeAt(0)->getDims().ToSizeVector(); const size_t IC = inDims[1]; const size_t IH = inDims[2]; const size_t IW = inDims[3]; - const auto& outDims = outputs[0]->getTensorDesc().getDims(); + const auto& outDims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); const size_t OB = outDims[0]; const size_t OH = outDims[2]; const size_t OW = outDims[3]; @@ -435,8 +436,8 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const size_t RH = _rates[0], RW = _rates[1]; const size_t PT = _pad_top, PL = _pad_left; - const std::vector istrides = inputs[0]->getTensorDesc().getBlockingDesc().getStrides(); - const std::vector ostrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); + const std::vector istrides = getParentEdgeAt(0)->getDesc().getBlockingDesc().getStrides(); + const std::vector 
ostrides = getChildEdgesAtPort(0)[0]->getDesc().getBlockingDesc().getStrides(); const std::vector ostrides_partial = {ostrides[0], KW * IC * ostrides[1], IC * ostrides[1], ostrides[1]}; if (extract_image_patches_kernel) { @@ -471,7 +472,7 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const size_t iw_hpad = std::ceil((IW - 1.f * iw_start) / SW) > OW ? OW : std::ceil((IW - 1.f * iw_start) / SW); char *my_dst_ptr = dst_data + - (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * dtype_size; + (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * dtype_size; const char *my_src_ptr = src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * dtype_size; size_t num_bytes_to_set = ih_lpad * OW * dtype_size; @@ -480,14 +481,14 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const char* src_ptr_h_stop = my_src_ptr + ih_hpad * SH * IW * dtype_size; for (const char *src_h_ptr = my_src_ptr + ih_lpad * SH * IW * dtype_size; - src_h_ptr < src_ptr_h_stop; src_h_ptr += SH * IW * dtype_size) { + src_h_ptr < src_ptr_h_stop; src_h_ptr += SH * IW * dtype_size) { num_bytes_to_set = iw_lpad * dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; const char* src_ptr_w_stop = src_h_ptr + iw_hpad * SW * dtype_size; for (const char* src_w_ptr = src_h_ptr + iw_lpad * SW * dtype_size; - src_w_ptr < src_ptr_w_stop; src_w_ptr += SW * dtype_size) { + src_w_ptr < src_ptr_w_stop; src_w_ptr += SW * dtype_size) { num_bytes_to_set = dtype_size; memcpy(my_dst_ptr, src_w_ptr, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; @@ -500,11 +501,12 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: memset(my_dst_ptr, 0, num_bytes_to_set); }); } - return OK; } -const std::set ExtractImagePatchesImpl::_supported_precisions_sizes = {1, 2, 4}; +const std::set 
MKLDNNExtractImagePatchesNode::_supported_precisions_sizes = {1, 2, 4}; + +bool MKLDNNExtractImagePatchesNode::created() const { + return getType() == ExtractImagePatches; +} -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine +REG_MKLDNN_PRIM_FOR(MKLDNNExtractImagePatchesNode, ExtractImagePatches) diff --git a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h similarity index 64% rename from inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp rename to inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h index 8ed62fbca89b0d..2990b12d08f2e3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h @@ -1,16 +1,16 @@ -// Copyright (C) 2021 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // + #pragma once -#include "base.hpp" +#include +#include +#include +#include #include -#include -#include -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { +namespace MKLDNNPlugin { struct jit_extract_image_patches_params { size_t IW; @@ -40,12 +40,17 @@ struct jit_uni_extract_image_patches_kernel { virtual ~jit_uni_extract_image_patches_kernel() {} }; - -class ExtractImagePatchesImpl : public ExtLayerBase { +class MKLDNNExtractImagePatchesNode : public MKLDNNNode { public: - explicit ExtractImagePatchesImpl(const std::shared_ptr& op); - StatusCode execute(std::vector&, std::vector&, ResponseDesc*) noexcept override; - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + MKLDNNExtractImagePatchesNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void 
createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: enum class ExtImgPatcherPadType { @@ -63,12 +68,9 @@ class ExtractImagePatchesImpl : public ExtLayerBase { static const std::set _supported_precisions_sizes; ExtImgPatcherPadType _auto_pad; + InferenceEngine::Precision precision; std::string errorPrefix; }; -REG_FACTORY_FOR(ExtractImagePatchesImpl, ExtractImagePatches); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp index eabd4f52aac8b2..e3e14e356912db 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp @@ -18,7 +18,7 @@ using namespace InferenceEngine; bool MKLDNNGatherElementsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherElementsOp = ngraph::as_type_ptr(op); + const auto gatherElementsOp = ngraph::as_type_ptr(op); if (!gatherElementsOp) { errorMessage = "Node is not an instance of the GatherElements operation from operation set v6."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp index 3e858dd309d8ca..ee7623f9b4810b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp @@ -18,7 +18,7 @@ using namespace InferenceEngine; bool MKLDNNGatherNDNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherElementsOp = ngraph::as_type_ptr(op); + const 
auto gatherElementsOp = ngraph::as_type_ptr(op); if (!gatherElementsOp) { errorMessage = "Node is not an instance of the GatherND operation from operation set v5."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp index 3bd50aadf3357e..ade92f6a4a0060 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp @@ -15,13 +15,13 @@ using namespace InferenceEngine; bool MKLDNNGatherNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherOp = ngraph::as_type_ptr(op); + const auto gatherOp = ngraph::as_type_ptr(op); if (!gatherOp) { errorMessage = "Only opset7 Gather operation is supported"; return false; } - auto axesOp = gatherOp->get_input_node_shared_ptr(GATHER_AXIS); + const auto axesOp = gatherOp->get_input_node_shared_ptr(GATHER_AXIS); if (!ngraph::as_type_ptr(axesOp)) { errorMessage = "Only Constant operation on 'axis' input is supported"; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp new file mode 100644 index 00000000000000..ce396446df2418 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_gather_tree_node.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNGatherTreeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto gatherElementsOp = ngraph::as_type_ptr(op); + if (!gatherElementsOp) { + errorMessage = "Node is not an instance of the 
GatherTree operation from operation set v1."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNGatherTreeNode::MKLDNNGatherTreeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'"; + if (op->get_input_size() != 4) + IE_THROW() << errorPrefix << " has incorrect number of input edges."; + if (op->get_output_size() != 1) + IE_THROW() << errorPrefix << " has incorrect number of output edges."; + + if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3) + IE_THROW() << errorPrefix << " step_idx vector should be 3 dimension"; + if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3) + IE_THROW() << errorPrefix << " parent_idx vector should be 3 dimension"; + if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1) + IE_THROW() << errorPrefix << " max_seq_len vector should be 1 dimension"; + if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0) + IE_THROW() << errorPrefix << " end_token should be 1 dimension"; +} + +void MKLDNNGatherTreeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + precision = getOriginalInputPrecisionAtPort(GATHER_TREE_STEP_IDX); + if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32)) + precision = Precision::FP32; + + if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || + getOriginalInputPrecisionAtPort(GATHER_TREE_MAX_SEQ_LEN) != precision || + getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || + getOriginalOutputPrecisionAtPort(0) != precision) { + IE_THROW() << errorPrefix << " has incorrect input/output data precision. 
Must be the same."; + } + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}}, + {{TensorDescCreatorTypes::ncsp, precision}}, + impl_desc_type::ref_any); +} + +void MKLDNNGatherTreeNode::execute(mkldnn::stream strm) { + if (precision == Precision::FP32) + return gatherTreeKernel(); + else + return gatherTreeKernel(); +} + +template +void MKLDNNGatherTreeNode::gatherTreeKernel() noexcept { + const auto *step_idx = reinterpret_cast(getParentEdgeAt(GATHER_TREE_STEP_IDX)->getMemoryPtr()->GetPtr()); + const auto * const parent_idx = reinterpret_cast(getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getMemoryPtr()->GetPtr()); + const size_t parent_idx_size = getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDims().size() + - getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDesc().getBlockingDesc().getOffsetPadding(); + const auto *max_seq_len = reinterpret_cast(getParentEdgeAt(GATHER_TREE_MAX_SEQ_LEN)->getMemoryPtr()->GetPtr()); + auto end_token = (reinterpret_cast(getParentEdgeAt(GATHER_TREE_END_TOKEN)->getMemoryPtr()->GetPtr()))[0]; + auto * final_idx = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + SizeVector step_idx_dims = getParentEdgeAt(GATHER_TREE_STEP_IDX)->getDims().ToSizeVector(); + SizeVector parent_idx_dims = getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDims().ToSizeVector(); + SizeVector max_seq_len_dims = getParentEdgeAt(GATHER_TREE_MAX_SEQ_LEN)->getDims().ToSizeVector(); + SizeVector final_idx_dims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); + int32_t max_time = step_idx_dims[0]; + const size_t batch_size = step_idx_dims[1]; + const size_t beam_width = step_idx_dims[2]; + const size_t bb_size = batch_size * beam_width; + + if (max_time != static_cast(parent_idx_dims[0]) || max_time != static_cast(final_idx_dims[0]) || + batch_size != parent_idx_dims[1] || batch_size != final_idx_dims[1] || 
batch_size != max_seq_len_dims[0] || + beam_width != parent_idx_dims[2] || beam_width != final_idx_dims[2]) { + std::string errorMsg = "Input/Output tensors dimensions mismatch"; + IE_THROW() << errorMsg; + } + + bool incorrect_result = false; + parallel_for2d(batch_size, beam_width, [&](size_t batch, size_t beam) { + int32_t max_sequence_in_beam = std::min(max_time, static_cast(max_seq_len[batch])); + if (max_sequence_in_beam > 0) { + int32_t time, idx = (max_time - 1) * bb_size + batch * beam_width; + for (time = (max_time - 1); time >= max_sequence_in_beam; time--, idx -= bb_size) + final_idx[idx + beam] = end_token; + + for (int32_t parent = static_cast(beam); time >= 0; time--, idx -= bb_size) { + if (parent < 0 || parent >= static_cast(beam_width) || idx + parent >= parent_idx_size) { + incorrect_result = true; + break; + } + final_idx[idx + beam] = step_idx[idx + parent]; + parent = static_cast(parent_idx[idx + parent]); + } + + bool finished = false; + auto *final = &final_idx[batch * beam_width + beam]; + for (time = 0; time < max_sequence_in_beam; time++, final += bb_size) { + if (finished) + (*final) = end_token; + else if ((*final) == end_token) + finished = true; + } + } + }); + + if (incorrect_result) { + std::string errorMsg = "Wrong parent index, result is incorrect"; + IE_THROW() << errorMsg; + } +} + +bool MKLDNNGatherTreeNode::created() const { + return getType() == GatherTree; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNGatherTreeNode, GatherTree) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h new file mode 100644 index 00000000000000..63f34fe6d6e685 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNGatherTreeNode : public MKLDNNNode { 
+public: + MKLDNNGatherTreeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + template + void gatherTreeKernel() noexcept; + + private: + static const size_t GATHER_TREE_STEP_IDX = 0; + static const size_t GATHER_TREE_PARENT_IDX = 1; + static const size_t GATHER_TREE_MAX_SEQ_LEN = 2; + static const size_t GATHER_TREE_END_TOKEN = 3; + + InferenceEngine::Precision precision; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp new file mode 100644 index 00000000000000..0dbe8dee59ea51 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_grn_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNGRNNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto grn = std::dynamic_pointer_cast(op); + if (!grn) { + errorMessage = "Only opset1 GRN operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNGRNNode::MKLDNNGRNNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'"; + const auto grn = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + bias = grn->get_bias(); +} + +void MKLDNNGRNNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, + impl_desc_type::ref_any); +} + +void MKLDNNGRNNode::execute(mkldnn::stream strm) { + const float* src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + float* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + SizeVector dims = getParentEdgeAt(0)->getDims().ToSizeVector(); + + int N = static_cast((dims.size() > 0) ? dims[0] : 1); + int C = static_cast((dims.size() > 1) ? dims[1] : 1); + int H = static_cast((dims.size() > 2) ? dims[2] : 1); + int W = static_cast((dims.size() > 3) ? 
dims[3] : 1); + + parallel_for3d(N, H, W, [&](int b, int h, int w) { + double variance = 0; + for (int c = 0; c < C; c++) { + variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); + } + variance = std::pow(variance + bias, 0.5f); + for (int c = 0; c < C; c++) { + dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); + } + }); +} + +bool MKLDNNGRNNode::created() const { + return getType() == GRN; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNGRNNode, GRN) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h new file mode 100644 index 00000000000000..8fe8d9d75b04e7 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNGRNNode : public MKLDNNNode { +public: + MKLDNNGRNNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + float bias = 1.0f; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp new file mode 100644 index 00000000000000..5750f8517b0096 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp @@ -0,0 +1,116 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include 
"mkldnn_log_softmax_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNLogSoftmaxNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto logSoftMax = std::dynamic_pointer_cast(op); + if (!logSoftMax) { + errorMessage = "Only opset5 LogSoftmax operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNLogSoftmaxNode::MKLDNNLogSoftmaxNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'"; + const auto logSoftMax = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + SizeVector dims = op->get_input_shape(0); + if (!dims.size()) + dims = SizeVector(1, 1); + int axis = logSoftMax->get_axis(); + if (axis < 0) + axis += dims.size(); + + if (dims.size() < static_cast((size_t)(1) + axis)) + IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!"; + + int j; + for (j = dims.size() - 1; j >= 0; j--) { + if (dims[j] != 1) break; + } + if (j == axis) isLastDim = true; + + for (int i = 0; i < axis; i++) + axisStep *= dims[i]; + reducedAxisSize = dims[axis]; + for (size_t i = (axis + 1); i < dims.size(); i++) + reducedAxisStride *= dims[i]; +} + +void MKLDNNLogSoftmaxNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNLogSoftmaxNode::execute(mkldnn::stream strm) { + const float 
*srcData = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + float* dstData = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + if (isLastDim) { + parallel_for(axisStep, [&](size_t i) { + const float *srcDataPtr = &srcData[i * reducedAxisSize]; + float *dstDataPtr = &dstData[i * reducedAxisSize]; + + float reduceProd = 0.0f; + const float max = *std::max_element(srcDataPtr, srcDataPtr + reducedAxisSize); + for (size_t j = 0; j < reducedAxisSize; ++j) + reduceProd += expf(srcDataPtr[j] - max); + + reduceProd = logf(reduceProd); + for (size_t j = 0; j < reducedAxisSize; ++j) + dstDataPtr[j] = srcDataPtr[j] - max - reduceProd; + }); + } else { + parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { + const float *srcDataPtr = &srcData[k * reducedAxisStride * reducedAxisSize + i]; + float *dstDataPtr = &dstData[k * reducedAxisStride * reducedAxisSize + i]; + + float reduceProd = 0.0f; + float max = std::numeric_limits::min(); + for (size_t j = 0; j < reducedAxisSize; ++j) { + if (srcDataPtr[j * reducedAxisStride] > max) + max = srcDataPtr[j * reducedAxisStride]; + } + + for (size_t j = 0; j < reducedAxisSize; ++j) + reduceProd += expf(srcDataPtr[j * reducedAxisStride] - max); + + reduceProd = logf(reduceProd); + for (size_t j = 0; j < reducedAxisSize; ++j) + dstDataPtr[j * reducedAxisStride] = srcDataPtr[j * reducedAxisStride] - max - reduceProd; + }); + } +} + +bool MKLDNNLogSoftmaxNode::created() const { + return getType() == LogSoftmax; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNLogSoftmaxNode, LogSoftmax) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h new file mode 100644 index 00000000000000..456d7321efcdc4 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + 
+#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNLogSoftmaxNode : public MKLDNNNode { +public: + MKLDNNLogSoftmaxNode(const std::shared_ptr& op, + const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + size_t reducedAxisSize; + size_t reducedAxisStride = 1; + size_t axisStep = 1; + bool isLastDim = false; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp index ecfa4fbbd32468..908686bf6df1eb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp @@ -43,17 +43,17 @@ MKLDNNMathNode::MKLDNNMathNode(const std::shared_ptr& op, const mk } initializers[op->get_type_info()](op, *this); - - size_t sizeVector = op->get_input_size(); - inDataConf.reserve(sizeVector); - for (int i = 0; i < sizeVector; ++i) - inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); } void MKLDNNMathNode::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + addSupportedPrimDesc(inDataConf, {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, impl_desc_type::ref_any); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h index a91cb3ae373d9c..28260dc476ec54 100644 --- 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h @@ -28,7 +28,6 @@ class MKLDNNMathNode : public MKLDNNNode { float beta = 0.0f; float gamma = 0.0f; - std::vector inDataConf; std::string errorPrefix; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp new file mode 100644 index 00000000000000..093127eada5f9a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp @@ -0,0 +1,406 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include + +#include "mkldnn_non_max_suppression_node.h" +#include "ie_parallel.hpp" +#include +#include "utils/general_utils.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNNonMaxSuppressionNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto nms = std::dynamic_pointer_cast(op); + if (!nms) { + errorMessage = "Only internal NonMaxSuppression operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNNonMaxSuppressionNode::MKLDNNNonMaxSuppressionNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "NMS layer with name '" + op->get_friendly_name() + "' "; + const auto nms = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 6) + IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber(); + + if (getOriginalOutputsNumber() < 1 || getOriginalOutputsNumber() > 3) + IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber(); + + boxEncodingType = nms->m_center_point_box ? boxEncoding::CENTER : boxEncoding::CORNER; + + sort_result_descending = nms->m_sort_result_descending; + + const SizeVector &boxes_dims = op->get_input_shape(NMS_BOXES); + num_batches = boxes_dims[0]; + num_boxes = boxes_dims[1]; + if (boxes_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); + if (boxes_dims[2] != 4) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + + const SizeVector &scores_dims = op->get_input_shape(NMS_SCORES); + num_classes = scores_dims[1]; + if (scores_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); + + if (num_batches != scores_dims[0]) + IE_THROW() << errorPrefix << " num_batches is different in 'boxes' and 'scores' inputs"; + if (num_boxes != scores_dims[2]) + IE_THROW() << errorPrefix << " num_boxes is different in 'boxes' and 'scores' inputs"; + + numFiltBox.resize(num_batches); + for (auto & i : numFiltBox) + i.resize(num_classes); + + inputShape_MAXOUTPUTBOXESPERCLASS = 
op->get_input_shape(NMS_MAXOUTPUTBOXESPERCLASS); + inputShape_IOUTHRESHOLD = op->get_input_shape(NMS_IOUTHRESHOLD); + inputShape_SCORETHRESHOLD = op->get_input_shape(NMS_SCORETHRESHOLD); + if (getOriginalInputsNumber() > NMS_SOFTNMSSIGMA) { + inputShape_SOFTNMSSIGMA = op->get_input_shape(NMS_SOFTNMSSIGMA); + } + + outputShape_SELECTEDINDICES = op->get_output_shape(NMS_SELECTEDINDICES); + outputShape_SELECTEDSCORES = op->get_output_shape(NMS_SELECTEDSCORES); + + const SizeVector &valid_outputs_dims = op->get_input_shape(NMS_VALIDOUTPUTS); + if (valid_outputs_dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_dims.size(); + if (valid_outputs_dims[0] != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_dims[1]; +} + +void MKLDNNNonMaxSuppressionNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + const std::vector supportedFloatPrecision = {Precision::FP32, Precision::BF16}; + const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType); + checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType); + checkPrecision(getOriginalInputPrecisionAtPort(NMS_VALIDOUTPUTS), supportedIntOutputPrecision, "valid_outputs", outType); + + const std::vector supportedPrecision = {Precision::I16, Precision::U8, Precision::I8, Precision::U16, Precision::I32, + Precision::U32, Precision::I64, Precision::U64}; + + check1DInput(inputShape_MAXOUTPUTBOXESPERCLASS, supportedPrecision, "max_output_boxes_per_class", NMS_MAXOUTPUTBOXESPERCLASS); + check1DInput(inputShape_IOUTHRESHOLD, supportedFloatPrecision, "iou_threshold", NMS_IOUTHRESHOLD); + check1DInput(inputShape_SCORETHRESHOLD, supportedFloatPrecision, "score_threshold", NMS_SCORETHRESHOLD); + + 
if (getOriginalInputsNumber() > NMS_SOFTNMSSIGMA) { + check1DInput(inputShape_SOFTNMSSIGMA, supportedFloatPrecision, "soft_nms_sigma", NMS_SOFTNMSSIGMA); + } + + checkOutput(outputShape_SELECTEDINDICES, supportedIntOutputPrecision, "selected_indices", NMS_SELECTEDINDICES); + checkOutput(outputShape_SELECTEDSCORES, supportedFloatPrecision, "selected_scores", NMS_SELECTEDSCORES); + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) { + Precision inPrecision = i == NMS_MAXOUTPUTBOXESPERCLASS ? Precision::I32 : Precision::FP32; + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, inPrecision); + } + + std::vector outDataConf; + outDataConf.reserve(getOriginalOutputsNumber()); + for (int i = 0; i < getOriginalOutputsNumber(); ++i) { + Precision outPrecision = i == NMS_SELECTEDSCORES ? Precision::FP32 : Precision::I32; + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, outPrecision); + } + + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); +} + +void MKLDNNNonMaxSuppressionNode::execute(mkldnn::stream strm) { + const float *boxes = reinterpret_cast(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->GetPtr()); + const float *scores = reinterpret_cast(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->GetPtr()); + + max_output_boxes_per_class = outDims.size() > NMS_SELECTEDSCORES ? 0 : num_boxes; + if (inDims.size() > NMS_MAXOUTPUTBOXESPERCLASS) { + max_output_boxes_per_class = reinterpret_cast(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->GetPtr())[0]; + } + + if (max_output_boxes_per_class == 0) + return; + + iou_threshold = outDims.size() > NMS_SELECTEDSCORES ? 
0.0f : 1.0f; + if (inDims.size() > NMS_IOUTHRESHOLD) + iou_threshold = reinterpret_cast(getParentEdgeAt(NMS_IOUTHRESHOLD)->getMemoryPtr()->GetPtr())[0]; + + score_threshold = 0.0f; + if (inDims.size() > NMS_SCORETHRESHOLD) + score_threshold = reinterpret_cast(getParentEdgeAt(NMS_SCORETHRESHOLD)->getMemoryPtr()->GetPtr())[0]; + + soft_nms_sigma = 0.0f; + if (inDims.size() > NMS_SOFTNMSSIGMA) + soft_nms_sigma = reinterpret_cast(getParentEdgeAt(NMS_SOFTNMSSIGMA)->getMemoryPtr()->GetPtr())[0]; + scale = 0.0f; + if (soft_nms_sigma > 0.0) { + scale = -0.5 / soft_nms_sigma; + } + + int *selected_indices = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr()->GetPtr()); + + float *selected_scores = nullptr; + if (outDims.size() > NMS_SELECTEDSCORES) + selected_scores = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDSCORES)[0]->getMemoryPtr()->GetPtr()); + + int *valid_outputs = nullptr; + if (outDims.size() > NMS_VALIDOUTPUTS) + valid_outputs = reinterpret_cast(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->GetPtr()); + + auto boxesStrides = getParentEdgeAt(NMS_BOXES)->getDesc().getBlockingDesc().getStrides(); + auto scoresStrides = getParentEdgeAt(NMS_SCORES)->getDesc().getBlockingDesc().getStrides(); + + std::vector filtBoxes(max_output_boxes_per_class * num_batches * num_classes); + + if (soft_nms_sigma == 0.0f) { + nmsWithoutSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } else { + nmsWithSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } + + size_t startOffset = numFiltBox[0][0]; + for (size_t b = 0; b < numFiltBox.size(); b++) { + size_t batchOffset = b*num_classes*max_output_boxes_per_class; + for (size_t c = (b == 0 ? 
1 : 0); c < numFiltBox[b].size(); c++) { + size_t offset = batchOffset + c*max_output_boxes_per_class; + for (size_t i = 0; i < numFiltBox[b][c]; i++) { + filtBoxes[startOffset + i] = filtBoxes[offset + i]; + } + startOffset += numFiltBox[b][c]; + } + } + filtBoxes.resize(startOffset); + + // need more particular comparator to get deterministic behaviour + // escape situation when filtred boxes with same score have different position from launch to launch + if (sort_result_descending) { + parallel_sort(filtBoxes.begin(), filtBoxes.end(), + [](const filteredBoxes& l, const filteredBoxes& r) { + return (l.score > r.score) || + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); + }); + } + + const size_t selectedBoxesNum = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getDims()[0]; + const size_t validOutputs = std::min(filtBoxes.size(), selectedBoxesNum); + + int selectedIndicesStride = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getDesc().getBlockingDesc().getStrides()[0]; + int *selectedIndicesPtr = selected_indices; + float *selectedScoresPtr = selected_scores; + + size_t idx = 0lu; + for (; idx < validOutputs; idx++) { + selectedIndicesPtr[0] = filtBoxes[idx].batch_index; + selectedIndicesPtr[1] = filtBoxes[idx].class_index; + selectedIndicesPtr[2] = filtBoxes[idx].box_index; + selectedIndicesPtr += selectedIndicesStride; + if (outDims.size() > NMS_SELECTEDSCORES) { + selectedScoresPtr[0] = static_cast(filtBoxes[idx].batch_index); + selectedScoresPtr[1] = static_cast(filtBoxes[idx].class_index); + selectedScoresPtr[2] = static_cast(filtBoxes[idx].score); + selectedScoresPtr += selectedIndicesStride; + } + } + std::fill(selectedIndicesPtr, selectedIndicesPtr + (selectedBoxesNum - idx) * selectedIndicesStride, -1); + if (outDims.size() > 
NMS_SELECTEDSCORES) { + std::fill(selectedScoresPtr, selectedScoresPtr + (selectedBoxesNum - idx) * selectedIndicesStride, -1.f); + } + if (outDims.size() > NMS_VALIDOUTPUTS) + *valid_outputs = static_cast(validOutputs); +} + +bool MKLDNNNonMaxSuppressionNode::created() const { + return getType() == NonMaxSuppression; +} + +float MKLDNNNonMaxSuppressionNode::intersectionOverUnion(const float *boxesI, const float *boxesJ) { + float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ; + if (boxEncodingType == boxEncoding::CENTER) { + // box format: x_center, y_center, width, height + yminI = boxesI[1] - boxesI[3] / 2.f; + xminI = boxesI[0] - boxesI[2] / 2.f; + ymaxI = boxesI[1] + boxesI[3] / 2.f; + xmaxI = boxesI[0] + boxesI[2] / 2.f; + yminJ = boxesJ[1] - boxesJ[3] / 2.f; + xminJ = boxesJ[0] - boxesJ[2] / 2.f; + ymaxJ = boxesJ[1] + boxesJ[3] / 2.f; + xmaxJ = boxesJ[0] + boxesJ[2] / 2.f; + } else { + // box format: y1, x1, y2, x2 + yminI = (std::min)(boxesI[0], boxesI[2]); + xminI = (std::min)(boxesI[1], boxesI[3]); + ymaxI = (std::max)(boxesI[0], boxesI[2]); + xmaxI = (std::max)(boxesI[1], boxesI[3]); + yminJ = (std::min)(boxesJ[0], boxesJ[2]); + xminJ = (std::min)(boxesJ[1], boxesJ[3]); + ymaxJ = (std::max)(boxesJ[0], boxesJ[2]); + xmaxJ = (std::max)(boxesJ[1], boxesJ[3]); + } + + float areaI = (ymaxI - yminI) * (xmaxI - xminI); + float areaJ = (ymaxJ - yminJ) * (xmaxJ - xminJ); + if (areaI <= 0.f || areaJ <= 0.f) + return 0.f; + + float intersection_area = + (std::max)((std::min)(ymaxI, ymaxJ) - (std::max)(yminI, yminJ), 0.f) * + (std::max)((std::min)(xmaxI, xmaxJ) - (std::max)(xminI, xminJ), 0.f); + return intersection_area / (areaI + areaJ - intersection_area); +} + +void MKLDNNNonMaxSuppressionNode::nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes) { + auto less = [](const boxInfo& l, const boxInfo& r) { + return l.score < r.score || ((l.score == r.score) && 
(l.idx > r.idx)); + }; + + auto coeff = [&](float iou) { + const float weight = std::exp(scale * iou * iou); + return iou <= iou_threshold ? weight : 0.0f; + }; + + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + std::vector fb; + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::priority_queue, decltype(less)> sorted_boxes(less); + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); + } + + fb.reserve(sorted_boxes.size()); + if (sorted_boxes.size() > 0) { + while (fb.size() < max_output_boxes_per_class && !sorted_boxes.empty()) { + boxInfo currBox = sorted_boxes.top(); + float origScore = currBox.score; + sorted_boxes.pop(); + + bool box_is_selected = true; + for (int idx = static_cast(fb.size()) - 1; idx >= currBox.suppress_begin_index; idx--) { + float iou = intersectionOverUnion(&boxesPtr[currBox.idx * 4], &boxesPtr[fb[idx].box_index * 4]); + currBox.score *= coeff(iou); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + if (currBox.score <= score_threshold) + break; + } + + currBox.suppress_begin_index = fb.size(); + if (box_is_selected) { + if (currBox.score == origScore) { + fb.push_back({ currBox.score, batch_idx, class_idx, currBox.idx }); + continue; + } + if (currBox.score > score_threshold) { + sorted_boxes.push(currBox); + } + } + } + } + numFiltBox[batch_idx][class_idx] = fb.size(); + size_t offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + for (size_t i = 0; i < fb.size(); i++) { + filtBoxes[offset + i] = fb[i]; + } + }); +} + +void MKLDNNNonMaxSuppressionNode::nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes) { + int 
max_out_box = static_cast(max_output_boxes_per_class); + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::vector> sorted_boxes; + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); + } + + int io_selection_size = 0; + if (sorted_boxes.size() > 0) { + parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), + [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + int offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + filtBoxes[offset + 0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); + io_selection_size++; + for (size_t box_idx = 1; (box_idx < sorted_boxes.size()) && (io_selection_size < max_out_box); box_idx++) { + bool box_is_selected = true; + for (int idx = io_selection_size - 1; idx >= 0; idx--) { + float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[box_idx].second * 4], &boxesPtr[filtBoxes[offset + idx].box_index * 4]); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + } + + if (box_is_selected) { + filtBoxes[offset + io_selection_size] = filteredBoxes(sorted_boxes[box_idx].first, batch_idx, class_idx, sorted_boxes[box_idx].second); + io_selection_size++; + } + } + } + numFiltBox[batch_idx][class_idx] = io_selection_size; + }); +} + +void MKLDNNNonMaxSuppressionNode::checkPrecision(const Precision prec, const std::vector precList, + const std::string name, const std::string type) { + if (std::find(precList.begin(), precList.end(), prec) == precList.end()) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec; 
+} + +void MKLDNNNonMaxSuppressionNode::check1DInput(const SizeVector& dims, const std::vector precList, + const std::string name, const size_t port) { + checkPrecision(getOriginalInputPrecisionAtPort(port), precList, name, inType); + + if (dims.size() != 0 && dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' input rank: " << dims.size(); + if (dims.size() == 1) + if (dims[0] != 1) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' input 1st dimension size: " << dims[0]; +} + +void MKLDNNNonMaxSuppressionNode::checkOutput(const SizeVector& dims, const std::vector precList, + const std::string name, const size_t port) { + checkPrecision(getOriginalOutputPrecisionAtPort(port), precList, name, outType); + + if (dims.size() != 2) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' output rank: " << dims.size(); + if (dims[1] != 3) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' output 2nd dimension size: " << dims[1]; +} + + +REG_MKLDNN_PRIM_FOR(MKLDNNNonMaxSuppressionNode, NonMaxSuppression) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h new file mode 100644 index 00000000000000..4651da1f2e795c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h @@ -0,0 +1,102 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +using namespace InferenceEngine; + +namespace MKLDNNPlugin { + +class MKLDNNNonMaxSuppressionNode : public MKLDNNNode { +public: + MKLDNNNonMaxSuppressionNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream 
strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + struct filteredBoxes { + float score; + int batch_index; + int class_index; + int box_index; + filteredBoxes() = default; + filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) : + score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {} + }; + + struct boxInfo { + float score; + int idx; + int suppress_begin_index; + }; + + float intersectionOverUnion(const float *boxesI, const float *boxesJ); + + void nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes); + + void nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes); + +private: + // input + const size_t NMS_BOXES = 0; + const size_t NMS_SCORES = 1; + const size_t NMS_MAXOUTPUTBOXESPERCLASS = 2; + const size_t NMS_IOUTHRESHOLD = 3; + const size_t NMS_SCORETHRESHOLD = 4; + const size_t NMS_SOFTNMSSIGMA = 5; + + // output + const size_t NMS_SELECTEDINDICES = 0; + const size_t NMS_SELECTEDSCORES = 1; + const size_t NMS_VALIDOUTPUTS = 2; + + enum class boxEncoding { + CORNER, + CENTER + }; + boxEncoding boxEncodingType = boxEncoding::CORNER; + bool sort_result_descending = true; + + size_t num_batches; + size_t num_boxes; + size_t num_classes; + + size_t max_output_boxes_per_class = 0lu; + float iou_threshold = 0.0f; + float score_threshold = 0.0f; + float soft_nms_sigma = 0.0f; + float scale = 1.f; + + SizeVector inputShape_MAXOUTPUTBOXESPERCLASS; + SizeVector inputShape_IOUTHRESHOLD; + SizeVector inputShape_SCORETHRESHOLD; + SizeVector inputShape_SOFTNMSSIGMA; + + SizeVector outputShape_SELECTEDINDICES; + SizeVector outputShape_SELECTEDSCORES; + + std::string errorPrefix; + + std::vector> numFiltBox; + const 
std::string inType = "input", outType = "output"; + + void checkPrecision(const Precision prec, const std::vector precList, const std::string name, const std::string type); + void check1DInput(const SizeVector& dims, const std::vector precList, const std::string name, const size_t port); + void checkOutput(const SizeVector& dims, const std::vector precList, const std::string name, const size_t port); +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp new file mode 100644 index 00000000000000..584960373aeb2e --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_proposal_node.h" + +static std::vector generate_anchors(proposal_conf &conf) { + auto base_size = conf.base_size_; + auto coordinates_offset = conf.coordinates_offset; + auto round_ratios = conf.round_ratios; + + auto num_ratios = conf.ratios.size(); + auto ratios = conf.ratios.data(); + + auto num_scales = conf.scales.size(); + auto scales = conf.scales.data(); + + std::vector anchors(num_scales * num_ratios * 4); + auto anchors_ptr = anchors.data(); + + // base box's width & height & center location + const float base_area = static_cast(base_size * base_size); + const float half_base_size = base_size * 0.5f; + const float center = 0.5f * (base_size - coordinates_offset); + + // enumerate all transformed boxes + for (int ratio = 0; ratio < num_ratios; ++ratio) { + // transformed width & height for given ratio factors + float ratio_w; + float ratio_h; + if (round_ratios) { + ratio_w = std::roundf(std::sqrt(base_area / ratios[ratio])); + ratio_h = std::roundf(ratio_w * ratios[ratio]); + } else { + ratio_w = std::sqrt(base_area / ratios[ratio]); 
+ ratio_h = ratio_w * ratios[ratio]; + } + + float * const p_anchors_wm = anchors_ptr + 0 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_hm = anchors_ptr + 1 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_wp = anchors_ptr + 2 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_hp = anchors_ptr + 3 * num_ratios * num_scales + ratio * num_scales; + + for (int scale = 0; scale < num_scales; ++scale) { + // transformed width & height for given scale factors + const float scale_w = 0.5f * (ratio_w * scales[scale] - coordinates_offset); + const float scale_h = 0.5f * (ratio_h * scales[scale] - coordinates_offset); + + // (x1, y1, x2, y2) for transformed box + p_anchors_wm[scale] = center - scale_w; + p_anchors_hm[scale] = center - scale_h; + p_anchors_wp[scale] = center + scale_w; + p_anchors_hp[scale] = center + scale_h; + + if (conf.shift_anchors) { + p_anchors_wm[scale] -= half_base_size; + p_anchors_hm[scale] -= half_base_size; + p_anchors_wp[scale] -= half_base_size; + p_anchors_hp[scale] -= half_base_size; + } + } + } + return anchors; +} + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNProposalNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto proposal0Op = ngraph::as_type_ptr(op); + const auto proposal4Op = ngraph::as_type_ptr(op); + if (!proposal0Op && !proposal4Op) { + errorMessage = "Node is not an instance of the Proposal from the operations set v0 or v4."; + return false; + } + auto proposalOp = std::dynamic_pointer_cast(op); + if (proposalOp->get_attrs().framework != "tensorflow" && !proposalOp->get_attrs().framework.empty()) { + errorMessage = "Unsupported framework attribute: " + proposalOp->get_attrs().framework; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNProposalNode::MKLDNNProposalNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + auto proposalOp = std::dynamic_pointer_cast(op); + auto proposalAttrs = proposalOp->get_attrs(); + + conf.feat_stride_ = proposalAttrs.feat_stride; + conf.base_size_ = proposalAttrs.base_size; + conf.min_size_ = proposalAttrs.min_size; + conf.pre_nms_topn_ = proposalAttrs.pre_nms_topn; + conf.post_nms_topn_ = proposalAttrs.post_nms_topn; + conf.nms_thresh_ = proposalAttrs.nms_thresh; + conf.box_coordinate_scale_ = proposalAttrs.box_coordinate_scale; + conf.box_size_scale_ = proposalAttrs.box_size_scale; + conf.scales = proposalAttrs.scale; + conf.ratios = proposalAttrs.ratio; + conf.normalize_ = proposalAttrs.normalize; + conf.clip_before_nms = proposalAttrs.clip_before_nms; + conf.clip_after_nms = proposalAttrs.clip_after_nms; + conf.anchors_shape_0 = conf.ratios.size() * conf.scales.size(); + + if (proposalAttrs.framework == "tensorflow") { + conf.coordinates_offset = 0.0f; + conf.initial_clip = true; + conf.shift_anchors = true; + conf.round_ratios = false; + conf.swap_xy = true; + } else { + conf.coordinates_offset = 1.0f; + conf.initial_clip = false; + conf.shift_anchors = false; + conf.round_ratios = true; + conf.swap_xy = false; + } + + anchors = generate_anchors(conf); + roi_indices.resize(conf.post_nms_topn_); + + store_prob = op->get_output_size() == 2; +} + +void MKLDNNProposalNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + if (store_prob) { + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + 
{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); + } else { + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); + } +} + +void MKLDNNProposalNode::execute(mkldnn::stream strm) { + try { + const float* probabilitiesData = reinterpret_cast(getParentEdgeAt(PROBABILITIES_IN_IDX)->getMemoryPtr()->GetPtr()); + const float* anchorsData = reinterpret_cast(getParentEdgeAt(ANCHORS_IN_IDX)->getMemoryPtr()->GetPtr()); + const float* imgInfoData = reinterpret_cast(getParentEdgeAt(IMG_INFO_IN_IDX)->getMemoryPtr()->GetPtr()); + float* outRoiData = reinterpret_cast (getChildEdgesAtPort(ROI_OUT_IDX)[0]->getMemoryPtr()->GetPtr()); + float* outProbData = nullptr; + if (store_prob) + outProbData = reinterpret_cast (getChildEdgesAtPort(PROBABILITIES_OUT_IDX)[0]->getMemoryPtr()->GetPtr()); + + auto inProbDims = getParentEdgeAt(0)->getDims().ToSizeVector(); + const size_t imgInfoSize = getParentEdgeAt(2)->getDims()[0]; + + // input image height & width + const float imgHeight = imgInfoData[0]; + const float imgWidth = imgInfoData[1]; + if (!std::isnormal(imgHeight) || !std::isnormal(imgWidth) || (imgHeight < 0.f) || (imgWidth < 0.f)) { + IE_THROW() << "Proposal operation image info input must have positive image height and width."; + } + + // scale factor for height & width + const float scaleHeight = imgInfoData[2]; + const float scaleWidth = imgInfoSize == 4 ? 
imgInfoData[3] : scaleHeight; + if (!std::isfinite(scaleHeight) || !std::isfinite(scaleWidth) || (scaleHeight < 0.f) || (scaleWidth < 0.f)) { + IE_THROW() << "Proposal operation image info input must have non negative scales."; + } + + InferenceEngine::Extensions::Cpu::XARCH::proposal_exec(probabilitiesData, anchorsData, inProbDims, + {imgHeight, imgWidth, scaleHeight, scaleWidth}, anchors.data(), roi_indices.data(), outRoiData, outProbData, conf); + } catch (const InferenceEngine::Exception& e) { + std::string errorMsg = e.what(); + IE_THROW() << errorMsg; + } +} + +bool MKLDNNProposalNode::created() const { + return getType() == Proposal; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNProposalNode, Proposal) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h new file mode 100644 index 00000000000000..4fdb333b25921b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "proposal_imp.hpp" + +using proposal_conf = InferenceEngine::Extensions::Cpu::proposal_conf; + +namespace MKLDNNPlugin { + +class MKLDNNProposalNode : public MKLDNNNode { +public: + MKLDNNProposalNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t PROBABILITIES_IN_IDX = 0lu; + const size_t ANCHORS_IN_IDX = 1lu; + const size_t IMG_INFO_IN_IDX = 2lu; + const size_t ROI_OUT_IDX = 0lu; + const size_t PROBABILITIES_OUT_IDX = 1lu; + + proposal_conf conf; + std::vector 
anchors; + std::vector roi_indices; + bool store_prob; // store blob with proposal probabilities + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp new file mode 100644 index 00000000000000..33e625fce6f88a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_range_node.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNRangeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + if (!MKLDNNPlugin::one_of(op->get_type_info(), ngraph::op::v0::Range::type_info, ngraph::op::v4::Range::type_info)) { + errorMessage = "Only opset1 and opset4 Range operation is supported"; + return false; + } + if (std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_START)) == nullptr || + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_LIMIT)) == nullptr || + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_DELTA)) == nullptr) { + errorMessage = "Only const inputs for Range operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNRangeNode::MKLDNNRangeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "Range layer with name '" + op->get_friendly_name() + "'"; + + if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + SizeVector start_dims = op->get_input_shape(RANGE_START); + if (ngraph::shape_size(start_dims) != 1) + IE_THROW() << errorPrefix << " has start scalar with more than 1 value"; + + SizeVector limit_dims = op->get_input_shape(RANGE_LIMIT); + if (ngraph::shape_size(limit_dims) != 1) + IE_THROW() << errorPrefix << " has limit scalar with more than 1 value"; + + SizeVector delta_dims = op->get_input_shape(RANGE_DELTA); + if (ngraph::shape_size(delta_dims) != 1) + IE_THROW() << errorPrefix << " has delta scalar with more than 1 value"; + + SizeVector dst_dims = op->get_output_shape(0); + if (dst_dims.size() > 1) + IE_THROW() << errorPrefix << " has unsupported rank for output: " << dst_dims.size(); +} + +void MKLDNNRangeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + std::vector outDataConf; + + if (!(getOriginalInputPrecisionAtPort(RANGE_START) == Precision::I32 && + getOriginalInputPrecisionAtPort(RANGE_LIMIT) == Precision::I32 && + getOriginalInputPrecisionAtPort(RANGE_DELTA) == Precision::I32 && + getOriginalOutputPrecisionAtPort(0) == Precision::I32) && + !(getOriginalInputPrecisionAtPort(RANGE_START) == Precision::FP32 && + getOriginalInputPrecisionAtPort(RANGE_LIMIT) == Precision::FP32 && + getOriginalInputPrecisionAtPort(RANGE_DELTA) == Precision::FP32 && + getOriginalOutputPrecisionAtPort(0) == Precision::FP32)) { + 
inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + outDataConf.reserve(1); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); + } else { + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp); + outDataConf.reserve(1); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp); + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); + } +} + +void MKLDNNRangeNode::execute(mkldnn::stream strm) { + StatusCode retcode = OK; + switch (getParentEdgeAt(0)->getDesc().getPrecision()) { + case Precision::FP32: + retcode = rangeKernel(); + break; + case Precision::I32: + retcode = rangeKernel(); + break; + default: + IE_THROW() << "Incorrect output precision. 
Only FP32 and I32 are supported!"; + } + if (retcode == PARAMETER_MISMATCH) { + std::string errorMsg = "Range indexes exceeds data tensor dimension"; + IE_THROW() << errorMsg; + } +} + +template +InferenceEngine::StatusCode MKLDNNRangeNode::rangeKernel() noexcept { + size_t dst_size = (getChildEdgesAtPort(0)[0]->getDims())[0]; + data_t* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + data_t start = reinterpret_cast(getParentEdgeAt(RANGE_START)->getMemoryPtr()->GetPtr())[0]; + data_t limit = reinterpret_cast(getParentEdgeAt(RANGE_LIMIT)->getMemoryPtr()->GetPtr())[0]; + data_t delta = reinterpret_cast(getParentEdgeAt(RANGE_DELTA)->getMemoryPtr()->GetPtr())[0]; + size_t work_amount_dst = static_cast(std::floor(std::abs((limit - start) / delta))); + if (work_amount_dst != dst_size) + return PARAMETER_MISMATCH; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t iwork = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, iwork, end); + data_t dst_value = start + iwork * delta; + + for (; iwork < end; ++iwork, dst_value += delta) { + dst_data[iwork] = dst_value; + } + }); + return OK; +} + +bool MKLDNNRangeNode::created() const { + return getType() == Range; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNRangeNode, Range) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h new file mode 100644 index 00000000000000..b5584be6aa949c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNRangeNode : public MKLDNNNode { +public: + MKLDNNRangeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void 
createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + template + InferenceEngine::StatusCode rangeKernel() noexcept; +private: + static const size_t RANGE_START = 0; + static const size_t RANGE_LIMIT = 1; + static const size_t RANGE_DELTA = 2; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp new file mode 100644 index 00000000000000..3db7470e92fba9 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_reorg_yolo_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNReorgYoloNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto reorgYolo = std::dynamic_pointer_cast(op); + if (!reorgYolo) { + errorMessage = "Only opset2 ReorgYolo operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNReorgYoloNode::MKLDNNReorgYoloNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string(op->get_type_name()) + " node with name '" + op->get_friendly_name() + "'"; + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + const auto reorgYolo = std::dynamic_pointer_cast(op); + const auto strides = reorgYolo->get_strides(); + if (strides.empty()) + IE_THROW() << errorPrefix << " has empty strides"; + stride = strides[0]; +} + +void MKLDNNReorgYoloNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNReorgYoloNode::execute(mkldnn::stream strm) { + const auto *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + auto *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + int IW = (getParentEdgeAt(0)->getDesc().getDims().size() > 3) ? getParentEdgeAt(0)->getDims()[3] : 1; + int IH = (getParentEdgeAt(0)->getDesc().getDims().size() > 2) ? getParentEdgeAt(0)->getDims()[2] : 1; + int IC = (getParentEdgeAt(0)->getDesc().getDims().size() > 1) ? getParentEdgeAt(0)->getDims()[1] : 1; + int B = (getParentEdgeAt(0)->getDesc().getDims().size() > 0) ? 
getParentEdgeAt(0)->getDims()[0] : 1; + + int ic_off = IC / (stride * stride); + int ih_off = IH * stride; + int iw_off = IW * stride; + for (int b = 0; b < B; b++) { + for (int ic = 0; ic < IC; ic++) { + for (int ih = 0; ih < IH; ih++) { + for (int iw = 0; iw < IW; iw++) { + int dstIndex = b * IC * IH * IW + ic * IH * IW + ih * IW + iw; + + int oc = ic % ic_off; + int offset = ic / ic_off; + + int ow = iw * stride + offset % stride; + int oh = ih * stride + offset / stride; + + int srcIndex = b * ic_off * ih_off * iw_off + oc * ih_off * iw_off + oh * iw_off + ow; + + dst_data[dstIndex] = src_data[srcIndex]; + } + } + } + } +} + +bool MKLDNNReorgYoloNode::created() const { + return getType() == ReorgYolo; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReorgYoloNode, ReorgYolo) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h new file mode 100644 index 00000000000000..b88f19010e0491 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNReorgYoloNode : public MKLDNNNode { +public: + MKLDNNReorgYoloNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + int stride; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp new file mode 
100644 index 00000000000000..5f6e6083e90c4a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_reverse_sequence_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNReverseSequenceNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto revSeq = std::dynamic_pointer_cast(op); + if (!revSeq) { + errorMessage = "Only opset1 ReverseSequence operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNReverseSequenceNode::MKLDNNReverseSequenceNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ReverseSequence layer with name '" + op->get_friendly_name() + "'"; + const auto revSeq = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + src_dims = op->get_input_shape(REVERSESEQUENCE_DATA); + + SizeVector seq_lengths_dims = op->get_input_shape(REVERSESEQUENCE_LENGTHS); + if (seq_lengths_dims.size() != 1) + IE_THROW() << errorPrefix << " has incorrect 2nd input rank: " << seq_lengths_dims.size(); + + SizeVector dst_dims = op->get_output_shape(0); + if (src_dims.size() != dst_dims.size()) + IE_THROW() << errorPrefix << " has incorrect number of input/output sizes!"; + + for (size_t i = 0; i < dst_dims.size(); i++) { + if (src_dims[i] != dst_dims[i]) + IE_THROW() << errorPrefix << " has incorrect number of input/output dimension!"; + } + + 
seq_axis = revSeq->get_sequence_axis(); + + if (seq_axis < 0 || seq_axis >= static_cast(src_dims.size())) + IE_THROW() << errorPrefix << " has incorrect 'seq_axis' parameters dimensions and axis number!"; + + batch_axis = revSeq->get_batch_axis(); + + if (batch_axis < 0 || batch_axis >= static_cast(src_dims.size())) + IE_THROW() << errorPrefix << " has incorrect 'batch_axis' parameters dimensions and axis number!"; + + if (seq_lengths_dims[0] != dst_dims[batch_axis]) + IE_THROW() << errorPrefix << " has incorrect 'seq_lengths_dims' parameters dimension!"; + + srcStrides.resize(src_dims.size()); + srcStrides[srcStrides.size() - 1] = 1; + for (int i = srcStrides.size() - 2; i >= 0; i--) { + srcStrides[i] = srcStrides[i + 1] * src_dims[i + 1]; + } + + work_amount_dst = srcStrides[0] * src_dims[0]; +} + +void MKLDNNReverseSequenceNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + lengthsPrecision = getOriginalInputPrecisionAtPort(REVERSESEQUENCE_LENGTHS); + if (lengthsPrecision != Precision::I32 && lengthsPrecision != Precision::FP32) + lengthsPrecision = Precision::I32; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, lengthsPrecision}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNReverseSequenceNode::execute(mkldnn::stream strm) { + size_t i; + const float *src_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_DATA)->getMemoryPtr()->GetPtr()); + float* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + switch (getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getDesc().getPrecision()) { + case Precision::FP32: { + float *seq_lengths_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getMemoryPtr()->GetPtr()); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (static_cast(seq_lengths_data[i]) > static_cast(src_dims[seq_axis])) { + std::string errorMsg = 
"Incorrect input 'seq_lengths' values!"; + IE_THROW() << errorMsg; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < static_cast(seq_lengths_data[counters[batch_axis]])) { + idx = static_cast(seq_lengths_data[counters[batch_axis]]) - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + case Precision::I32: { + int32_t *seq_lengths_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getMemoryPtr()->GetPtr()); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (seq_lengths_data[i] > static_cast(src_dims[seq_axis])) { + std::string errorMsg = "Incorrect input 'seq_lengths' values!"; + IE_THROW() << errorMsg; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < seq_lengths_data[counters[batch_axis]]) { + idx = seq_lengths_data[counters[batch_axis]] - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + 
for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + default: + IE_THROW() << "ReverseSequence layer does not support " + << getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getDesc().getPrecision() << " precision"; + } +} + +bool MKLDNNReverseSequenceNode::created() const { + return getType() == ReverseSequence; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReverseSequenceNode, ReverseSequence) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h new file mode 100644 index 00000000000000..4b3cf056c63afa --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNReverseSequenceNode : public MKLDNNNode { +public: + MKLDNNReverseSequenceNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t REVERSESEQUENCE_DATA = 0; + const size_t REVERSESEQUENCE_LENGTHS = 1; + + int seq_axis; + int batch_axis; + InferenceEngine::SizeVector src_dims; + InferenceEngine::SizeVector srcStrides; + size_t work_amount_dst; + + InferenceEngine::Precision lengthsPrecision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp index 
6d4c9a27dc4d8b..53dda785e69115 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp @@ -14,7 +14,7 @@ using namespace InferenceEngine; MKLDNNSoftMaxNode::MKLDNNSoftMaxNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { - auto softmaxOp = ngraph::as_type_ptr(op); + const auto softmaxOp = ngraph::as_type_ptr(op); if (softmaxOp) { axis = softmaxOp->get_axis(); } else { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp new file mode 100644 index 00000000000000..1c78c44b48df5a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp @@ -0,0 +1,478 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_topk_node.h" +#include "utils/general_utils.h" + +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) +#include +#endif + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNTopKNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto topKOp = ngraph::as_type_ptr(op); + if (!topKOp) { + errorMessage = "Node is not an instance of the TopK from the operations set v1 or v3"; + return false; + } + if (topKOp->get_mode() != ngraph::op::TopKMode::MAX && + topKOp->get_mode() != ngraph::op::TopKMode::MIN) { + errorMessage = "Unsupported mode."; + return false; + } + if (!MKLDNNPlugin::one_of(topKOp->get_sort_type(), ngraph::op::TopKSortType::NONE, + ngraph::op::TopKSortType::SORT_VALUES, + ngraph::op::TopKSortType::SORT_INDICES)) { + errorMessage = "Unsupported sort type."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNTopKNode::MKLDNNTopKNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + auto topK1Op = ngraph::as_type_ptr(op); + + SizeVector dstDims = topK1Op->get_output_shape(TOPK_VALUE); + src_dims = topK1Op->get_input_shape(TOPK_DATA); + + axis = topK1Op->get_axis(); + + if (topK1Op->get_mode() == ngraph::op::TopKMode::MAX) + mode_max = true; + else + mode_max = false; + + if (topK1Op->get_sort_type() == ngraph::op::TopKSortType::SORT_VALUES) + sort_value = true; + else + sort_value = false; + + int j; + for (j = src_dims.size() - 1; j >= 0; j--) { + if (src_dims[j] != 1) break; + } + if (static_cast(j) == axis) is_last_dim = true; + + for (size_t i = 0; i < axis; i++) { + axis_step *= src_dims[i]; + } + axis_dim = src_dims[axis]; + for (size_t i = (axis + 1); i < src_dims.size(); i++) { + axis_stride *= src_dims[i]; + } + dim = static_cast(src_dims[axis]); + before_num = count(src_dims, 0, axis); +} + +void MKLDNNTopKNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector outDataConf; + outDataConf.reserve(getOriginalOutputsNumber()); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalOutputsNumber(); ++i) + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}}, + outDataConf, + impl_desc_type::ref_any); +} + +void MKLDNNTopKNode::execute(mkldnn::stream strm) { + const float *src = reinterpret_cast(getParentEdgeAt(TOPK_DATA)->getMemoryPtr()->GetPtr()); + src_k = reinterpret_cast(getParentEdgeAt(TOPK_K)->getMemoryPtr()->GetPtr())[0]; + float* dst_data = nullptr; + int* 
dst_idx = nullptr; + + if (outDims.size() == 1) { + if (getOriginalOutputPrecisionAtPort(0) == Precision::FP32) { + dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + } else { + dst_idx = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + } + SizeVector dstDims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); + + if (dstDims[axis] != static_cast(src_k)) { + std::string errorMsg = "Output tensor dimension mismatch"; + IE_THROW() << errorMsg; + } + } else if (outDims.size() == 2) { + dst_data = reinterpret_cast(getChildEdgesAtPort(TOPK_VALUE)[0]->getMemoryPtr()->GetPtr()); + SizeVector dst_data_dims = getChildEdgesAtPort(TOPK_VALUE)[0]->getDims().ToSizeVector(); + + dst_idx = reinterpret_cast(getChildEdgesAtPort(TOPK_INDEX)[0]->getMemoryPtr()->GetPtr()); + SizeVector dst_idx_dims = getChildEdgesAtPort(TOPK_INDEX)[0]->getDims().ToSizeVector(); + + if (dst_idx_dims[axis] != static_cast(src_k) || dst_data_dims[axis] != static_cast(src_k)) { + std::string errorMsg = "Output tensors dimension mismatch"; + IE_THROW() << errorMsg; + } + } else { + std::string errorMsg = "Output tensors amount mismatch"; + IE_THROW() << errorMsg; + } + + if (src_dims[axis] < static_cast(src_k)) + src_k = src_dims[axis]; + + SizeVector in_dims = getParentEdgeAt(TOPK_DATA)->getDims().ToSizeVector(); + + if (src_k == 1) { + if (is_last_dim) { + if (mode_max) + top1(src, dst_data, dst_idx, in_dims); + else + top1(src, dst_data, dst_idx, in_dims); + } else { + if (mode_max) + top1_axis(src, dst_data, dst_idx, in_dims); + else + top1_axis(src, dst_data, dst_idx, in_dims); + } + } else { + if (is_last_dim) { + if (mode_max) + topk(src, dst_data, dst_idx, in_dims); + else + topk(src, dst_data, dst_idx, in_dims); + } else { + if (mode_max) + topk_axis(src, dst_data, dst_idx, in_dims); + else + topk_axis(src, dst_data, dst_idx, in_dims); + } + } +} + +bool MKLDNNTopKNode::created() const { + return getType() == TopK; +} + +template class 
Compare2> +void MKLDNNTopKNode::top1_axis(const float* src_data, float* dst_data, int* dst_idx, SizeVector in_dims) { + int after_num = count(in_dims, axis + 1, in_dims.size()); + int first_index = 0; + +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + parallel_for2d(before_num, after_num / block_size, [&](int i0, int ib1) { + int s_index = i0 * dim * after_num + ib1 * block_size; + vec_type_f vmax_val = _mm_uni_loadu_ps(src_data + s_index); + vec_type_i vindex_max_val = _mm_uni_setzero_si(); + for (int i2 = 1; i2 < dim; i2++) { + s_index += after_num; + vec_type_f vsrc = _mm_uni_loadu_ps(src_data + s_index); + vmask_type vmask = Compare1::cmp_ps(vsrc, vmax_val); + vmax_val = _mm_uni_blendv_ps(vmax_val, vsrc, vmask); + + vec_type_i vindex_cur_val = _mm_uni_set1_epi32(i2); +#if defined(HAVE_AVX512F) + vindex_max_val = _mm512_mask_blend_epi32(vmask, vindex_max_val, vindex_cur_val); +#else + vindex_max_val = _mm_uni_blendv_epi8(vindex_max_val, vindex_cur_val, _mm_uni_castps_si(vmask)); +#endif + } + if (dst_data) + _mm_uni_storeu_ps(dst_data + i0 * after_num + ib1 * block_size, vmax_val); + if (dst_idx) + _mm_uni_storeu_si(reinterpret_cast(dst_idx + i0 * after_num + ib1 * block_size), vindex_max_val); + }); + first_index = after_num / block_size * block_size; +#endif + int rest = after_num - first_index; + parallel_for2d(before_num, rest, [&](int i0, int i1) { + int index_max_val = 0; + int s_index = i0 * dim * after_num + first_index + i1; + float max_val = src_data[s_index]; + for (int i2 = 1; i2 < dim; i2++) { + s_index += after_num; + if (Compare2()(src_data[s_index], max_val)) { + max_val = src_data[s_index]; + index_max_val = i2; + } + } + if (dst_data) + dst_data[i0 * after_num + first_index + i1] = max_val; + if (dst_idx) + dst_idx[i0 * after_num + first_index + i1] = index_max_val; + }); +} + +template