From 18806ecd761b5d967f3f870e5e325b7c08b42295 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 30 Jul 2021 20:45:44 +0800 Subject: [PATCH] MulticlassNms/MatrixNms: transformations and CPU implementation (#6653) * init version, needs revision: opset7 * add convert testcase * multiclass_nms support spec * init version * matrixnms support spec * init support for matrix_nms * impl matrix_nms * implemented multiclass_nms reference. TODO: more test cases. * support dynamic shape in test * update to spec 0611 * update to spec 0611 * fixes. * fix: sort by class_id and score now works. * fix clang check error * more test cases verified. * fixes in ref impl. * attribute nms_eta works * test cross_batch and output_type i32. * enable multiclass-nms cpu plugin fallback ngraph * keep topk typo * enable matrix-nms cpu plugin fallback ngraph * support sort_result_across_batch * Add matrix_nms unit test * Add cross batch test cases * fix typo * move multiclass to opset8 * move matrixnms to opset8 * Reference implementations for MulticlassNms and MatrixNms ops * fix name conflict * remove unused var; sort_result_across_batch defaults to false * avoid float overflow * fix clang check error * info for mac fail * change testcase due to unstable sort * nms add 'normalized' attribute * multiclass cpu test support 'normalized' * nms add 'normalized' attribute * fixes: 1. normalized support. 2. sort by score before keep_top_k inside a batch. * fixes: 1. normalized support. 2. sort by score before keep_top_k inside a batch. * fix sort order in matrix_nms * fix review comments * add matrix_nms MKLDNN extension layer * parallel in matrix nms * separate filtered_box * separate class_nms result * parallel in class * parallel in batch * partial new nms * partially remove useless functions * debug & fix * debug in indexing * fix test cases * remove logging * fix code-style * fix typo * add matrix_nms extension * nms python api * remove unused testcases * refactor transformation * transform dynamic shape to static shape * Update inference-engine/src/transformations/include/ngraph_ops/nms_static_shape_ie.hpp Co-authored-by: Ilya Churaev * remove register_pass call * [MKLDNN] migrate matrix_nms to MKLDNNNode * bug fix in matrix_nms * padding on matrix_nms * remove logging * test case refine * merged transform_matrix_nms branch * refine matrixnms testcase * multiclass nms cpu plugin implementation for static shape, rebased on Reference implementations PR * rebase to new multi-class transform provided by lc * Name style aligned with matrix-nms * static shape padding style to batch inside, new unit test method, real classnum shape * fix format * fix ci error * multi-class NMS modification based on PR reviewer opinion: code format, copyright, delete unused includes and functions * explicit template instantiation due to mac ci fail * Yi3/fix review (#16) * fix coding style * use parallel_for2d * fix ci fail * unify 'copyright 2021' * mkldnn_multiclass_nms node update based on PR review (#17) * [MKLDNN] apply suggestion for matrix_nms (#18) * fix bug * apply review comments * apply review comments * apply review comments * apply review comments * skip only Nms test, not MatrixNms MulticlassNms test Co-authored-by: Zhang Yi3 Co-authored-by: jialipen Co-authored-by: mangguo Co-authored-by: Ilya Churaev Co-authored-by: liubo-intel --- .../cnn_network_ngraph_impl.cpp | 5 + .../src/mkldnn_plugin/cpu_types.h | 4 +- .../src/mkldnn_plugin/mkldnn_node.cpp | 4 +- .../src/mkldnn_plugin/mkldnn_node.h | 4 + .../src/mkldnn_plugin/mkldnn_plugin.cpp | 4
+ .../nodes/mkldnn_matrix_nms_node.cpp | 382 ++++++++++++++++ .../nodes/mkldnn_matrix_nms_node.h | 100 +++++ .../nodes/mkldnn_multiclass_nms.cpp | 414 ++++++++++++++++++ .../nodes/mkldnn_multiclass_nms.hpp | 93 ++++ .../ngraph_ops/nms_static_shape_ie.hpp | 114 +++++ .../convert_matrix_nms_to_matrix_nms_ie.hpp | 26 ++ ...rt_multiclass_nms_to_multiclass_nms_ie.hpp | 26 ++ .../src/ngraph_ops/nms_static_shape_ie.cpp | 19 + .../convert_matrix_nms_to_matrix_nms_ie.cpp | 66 +++ ...rt_multiclass_nms_to_multiclass_nms_ie.cpp | 67 +++ .../serialization/single_layer/matrix_nms.cpp | 60 +++ .../single_layer/multiclass_nms.cpp | 60 +++ .../inference_engine/skip_tests_config.cpp | 2 +- ...t_matrix_nms_to_matrix_nms_ie_internal.cpp | 58 +++ ...lass_nms_to_multiclass_nms_ie_internal.cpp | 58 +++ .../single_layer_tests/matrix_nms.cpp | 54 +++ .../single_layer_tests/multiclass_nms.cpp | 37 ++ .../skip_tests_config.cpp | 2 +- .../include/single_layer_tests/matrix_nms.hpp | 15 + .../single_layer_tests/multiclass_nms.hpp | 15 + .../single_layer/matrix_nms.hpp | 58 +++ .../single_layer/multiclass_nms.hpp | 59 +++ .../src/single_layer/matrix_nms.cpp | 250 +++++++++++ .../src/single_layer/multiclass_nms.cpp | 270 ++++++++++++ ngraph/core/src/op/matrix_nms.cpp | 3 +- ngraph/core/src/op/util/nms_base.cpp | 2 +- 31 files changed, 2325 insertions(+), 6 deletions(-) create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.h create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.cpp create mode 100644 inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.hpp create mode 100644 inference-engine/src/transformations/include/ngraph_ops/nms_static_shape_ie.hpp create mode 100644 inference-engine/src/transformations/include/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp create mode 100644 inference-engine/src/transformations/include/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp create mode 100644 inference-engine/src/transformations/src/ngraph_ops/nms_static_shape_ie.cpp create mode 100644 inference-engine/src/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp create mode 100644 inference-engine/src/transformations/src/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.cpp create mode 100644 inference-engine/tests/functional/inference_engine/serialization/single_layer/matrix_nms.cpp create mode 100644 inference-engine/tests/functional/inference_engine/serialization/single_layer/multiclass_nms.cpp create mode 100644 inference-engine/tests/functional/inference_engine/transformations/convert_matrix_nms_to_matrix_nms_ie_internal.cpp create mode 100644 inference-engine/tests/functional/inference_engine/transformations/convert_multiclass_nms_to_multiclass_nms_ie_internal.cpp create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp create mode 100644 inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/multiclass_nms.cpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/matrix_nms.hpp create mode 100644 inference-engine/tests/functional/plugin/shared/include/single_layer_tests/multiclass_nms.hpp create mode 100644 inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/matrix_nms.hpp 
create mode 100644 inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/multiclass_nms.hpp create mode 100644 inference-engine/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp create mode 100644 inference-engine/tests/functional/shared_test_classes/src/single_layer/multiclass_nms.cpp diff --git a/inference-engine/src/inference_engine/cnn_network_ngraph_impl.cpp b/inference-engine/src/inference_engine/cnn_network_ngraph_impl.cpp index c60c515edda59a..5343fd108c1a67 100644 --- a/inference-engine/src/inference_engine/cnn_network_ngraph_impl.cpp +++ b/inference-engine/src/inference_engine/cnn_network_ngraph_impl.cpp @@ -35,6 +35,9 @@ #include +#include +#include + #include "ie_ngraph_utils.hpp" #include "exec_graph_info.hpp" #include "ie_itt.hpp" @@ -389,6 +392,8 @@ CNNNetworkNGraphImpl::reshape(const std::map& ::ngraph::pass::Manager manager; // resolves dynamism by replacing dynamic operation with static version manager.register_pass<::ngraph::pass::ConvertNMS5ToLegacyMatcher>(false); + manager.register_pass<::ngraph::pass::ConvertMulticlassNmsToMulticlassNmsIE>(); + manager.register_pass<::ngraph::pass::ConvertMatrixNmsToMatrixNmsIE>(); manager.register_pass<::ngraph::pass::DisableConvertConstantFoldingOnConstPath>(); manager.register_pass<::ngraph::pass::ConstantFolding>(); // OneHotToLegacy changes output precision diff --git a/inference-engine/src/mkldnn_plugin/cpu_types.h b/inference-engine/src/mkldnn_plugin/cpu_types.h index eb54b431cb6658..7c820c4db50ccf 100644 --- a/inference-engine/src/mkldnn_plugin/cpu_types.h +++ b/inference-engine/src/mkldnn_plugin/cpu_types.h @@ -86,7 +86,9 @@ enum Type { ExperimentalDetectronPriorGridGenerator, ExperimentalDetectronGenerateProposalsSingleImage, ExtractImagePatches, - NonMaxSuppression + NonMaxSuppression, + MatrixNms, + MulticlassNms }; enum Algorithm { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index 49e8e7ca10b972..8f4b204ca8e727 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -225,7 +225,9 @@ static const InferenceEngine::details::caseless_unordered_map { "ExperimentalDetectronPriorGridGenerator", ExperimentalDetectronPriorGridGenerator}, { "ExperimentalDetectronGenerateProposalsSingleImage", ExperimentalDetectronGenerateProposalsSingleImage}, { "ExtractImagePatches", ExtractImagePatches}, - { "NonMaxSuppressionIEInternal", NonMaxSuppression} + { "NonMaxSuppressionIEInternal", NonMaxSuppression}, + { "MatrixNms", MatrixNms}, + { "MulticlassNms", MulticlassNms} }; Type TypeFromName(const std::string type) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index 436ecc3ac3998e..35993f96c79c99 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -194,6 +194,10 @@ static std::string NameFromType(Type type) { return "ExtractImagePatches"; case NonMaxSuppression: return "NonMaxSuppression"; + case MatrixNms: + return "MatrixNms"; + case MulticlassNms: + return "MulticlassNms"; default: return "Unknown"; } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp index 2d7299aed9201a..c7907aa55692b2 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp @@ -57,6 +57,8 @@ #include 
#include #include +#include +#include #include #include #include @@ -168,6 +170,8 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.cpp new file mode 100644 index 00000000000000..5bd27d079cef20 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.cpp @@ -0,0 +1,382 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_matrix_nms_node.h" + +#include +#include +#include +#include +#include + +#include "base.hpp" +#include "ie_parallel.hpp" +#include "ngraph/opsets/opset8.hpp" +#include "ngraph_ops/nms_static_shape_ie.hpp" +#include "utils/general_utils.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; +using MatrixNmsIEInternal = ngraph::op::internal::NmsStaticShapeIE; + +using ngNmsSortResultType = ngraph::op::util::NmsBase::SortResultType; +using ngNmsDecayFunction = ngraph::op::v8::MatrixNms::DecayFunction; + +bool MKLDNNMatrixNmsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto nms = std::dynamic_pointer_cast(op); + if (!nms) { + errorMessage = "Only internal MatrixNms operation is supported"; + return false; + } + const auto& attrs = nms->get_attrs(); + const auto& sortType = attrs.sort_result_type; + if (!one_of(sortType, ngNmsSortResultType::NONE, ngNmsSortResultType::SCORE, ngNmsSortResultType::CLASSID)) { + errorMessage = "Does not support SortResultType mode: " + ngraph::as_string(sortType); + return false; + } + const auto& decayType = attrs.decay_function; + if (!one_of(decayType, ngNmsDecayFunction::LINEAR, ngNmsDecayFunction::GAUSSIAN)) { + errorMessage = "Does not support DecayFunction " + ngraph::as_string(decayType); + return false; + } + } catch (...)
{ + return false; + } + return true; +} + +MKLDNNMatrixNmsNode::MKLDNNMatrixNmsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr& cache) + : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "MatrixNMS layer with name '" + getName() + "' "; + const auto matrix_nms = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2) + IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber(); + + if (getOriginalOutputsNumber() != 3) + IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber(); + + const SizeVector& boxes_dims = op->get_input_shape(NMS_BOXES); + if (boxes_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); + if (boxes_dims[2] != 4) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + const SizeVector& scores_dims = op->get_input_shape(NMS_SCORES); + if (scores_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); + // validate ranks before indexing into the input shapes + if (!(inDims[NMS_BOXES][0] == inDims[NMS_SCORES][0] && inDims[NMS_BOXES][1] == inDims[NMS_SCORES][2])) { + IE_THROW() << errorPrefix << "has incompatible 'boxes' and 'scores' input dimensions"; + } + m_numBatches = boxes_dims[0]; + m_numBoxes = boxes_dims[1]; + m_numClasses = scores_dims[1]; + + if (m_numBatches != scores_dims[0]) + IE_THROW() << errorPrefix << " num_batches is different in 'boxes' and 'scores' inputs"; + if (m_numBoxes != scores_dims[2]) + IE_THROW() << errorPrefix << " num_boxes is different in 'boxes' and 'scores' inputs"; + auto& attrs = matrix_nms->get_attrs(); + if (attrs.sort_result_type == ngraph::op::util::NmsBase::SortResultType::CLASSID) + m_sortResultType = MatrixNmsSortResultType::CLASSID; + else if (attrs.sort_result_type == ngraph::op::util::NmsBase::SortResultType::SCORE) + m_sortResultType = MatrixNmsSortResultType::SCORE; + else if (attrs.sort_result_type == ngraph::op::util::NmsBase::SortResultType::NONE) + m_sortResultType = MatrixNmsSortResultType::NONE; + + if (attrs.decay_function == ngraph::op::v8::MatrixNms::DecayFunction::GAUSSIAN) + m_decayFunction = GAUSSIAN; + else if (attrs.decay_function == ngraph::op::v8::MatrixNms::DecayFunction::LINEAR) + m_decayFunction = LINEAR; + + m_sortResultAcrossBatch = attrs.sort_result_across_batch; + m_scoreThreshold = attrs.score_threshold; + m_nmsTopk = attrs.nms_top_k; + m_keepTopk = attrs.keep_top_k; + m_backgroundClass = attrs.background_class; + + m_gaussianSigma = attrs.gaussian_sigma; + m_postThreshold = attrs.post_threshold; + m_normalized = attrs.normalized; + int64_t max_output_boxes_per_class = 0; + size_t real_num_classes = m_backgroundClass == -1 ? m_numClasses : m_numClasses - 1; + if (m_nmsTopk >= 0) + max_output_boxes_per_class = std::min(m_numBoxes, static_cast(m_nmsTopk)); + else + max_output_boxes_per_class = m_numBoxes; + + // worst case per batch: nms_top_k boxes for every non-background class, then capped by keep_top_k + m_maxBoxesPerBatch = max_output_boxes_per_class * real_num_classes; + if (m_keepTopk >= 0) + m_maxBoxesPerBatch = std::min(m_maxBoxesPerBatch, static_cast(m_keepTopk)); +} + +void MKLDNNMatrixNmsNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + m_realNumClasses = m_backgroundClass == -1 ? m_numClasses : m_numClasses - 1; + m_realNumBoxes = m_nmsTopk == -1 ?
m_numBoxes : std::min(m_nmsTopk, static_cast(m_numBoxes)); + m_numPerBatch.resize(m_numBatches); + m_filteredBoxes.resize(m_numBatches * m_realNumClasses * m_realNumBoxes); + m_numPerBatchClass.resize(m_numBatches, std::vector(m_numClasses, 0)); + m_classOffset.resize(m_numClasses, 0); + + for (size_t i = 0, count = 0; i < m_numClasses; i++) { + if (i == m_backgroundClass) + continue; + m_classOffset[i] = (count++) * m_realNumBoxes; + } + + if (m_decayFunction == MatrixNmsDecayFunction::LINEAR) { + m_decay_fn = [](float iou, float max_iou, float sigma) -> float { + return (1. - iou) / (1. - max_iou + 1e-10f); + }; + } else { + m_decay_fn = [](float iou, float max_iou, float sigma) -> float { + return std::exp((max_iou * max_iou - iou * iou) * sigma); + }; + } + + const std::vector supportedFloatPrecision = {Precision::FP32}; + const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType); + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType); + + checkPrecision(getOriginalOutputPrecisionAtPort(NMS_SELECTED_INDICES), supportedIntOutputPrecision, "selected_indices", outType); + checkPrecision(getOriginalOutputPrecisionAtPort(NMS_SELECTED_OUTPUTS), supportedFloatPrecision, "selected_outputs", outType); + checkPrecision(getOriginalOutputPrecisionAtPort(NMS_VALID_OUTPUTS), supportedIntOutputPrecision, "valid_outputs", outType); + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}}, + impl_desc_type::ref_any); +} + +bool MKLDNNMatrixNmsNode::created() const { + return getType() == MatrixNms; +} + +namespace { + +static inline float boxArea(const float* bbox, const bool normalized) { + if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) { + return static_cast(0.); + } else { + const float width = bbox[2] - bbox[0]; + const float height = bbox[3] - bbox[1]; + if (normalized) { + return width * height; + } else { + return (width + 1) * (height + 1); + } + } +} + +static inline float intersectionOverUnion(const float* bbox1, const float* bbox2, const bool normalized) { + if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1]) { + return static_cast(0.); + } else { + const float xMin = std::max(bbox1[0], bbox2[0]); + const float yMin = std::max(bbox1[1], bbox2[1]); + const float xMax = std::min(bbox1[2], bbox2[2]); + const float yMax = std::min(bbox1[3], bbox2[3]); + float norm = normalized ? static_cast(0.) 
: static_cast(1.); + float width = xMax - xMin + norm; + float height = yMax - yMin + norm; + const float interArea = width * height; + const float bbox1Area = boxArea(bbox1, normalized); + const float bbox2Area = boxArea(bbox2, normalized); + return interArea / (bbox1Area + bbox2Area - interArea); + } +} +} // namespace + +size_t MKLDNNMatrixNmsNode::nmsMatrix(const float* boxesData, const float* scoresData, BoxInfo* filterBoxes, const int64_t batchIdx, const int64_t classIdx) { + std::vector candidateIndex(m_numBoxes); + std::iota(candidateIndex.begin(), candidateIndex.end(), 0); + auto end = std::remove_if(candidateIndex.begin(), candidateIndex.end(), [&scoresData, this](int32_t idx) { + return scoresData[idx] <= m_scoreThreshold; + }); + int64_t numDet = 0; + int64_t originalSize = std::distance(candidateIndex.begin(), end); + if (originalSize <= 0) { + return 0; + } + if (m_nmsTopk > -1 && originalSize > m_nmsTopk) { + originalSize = m_nmsTopk; + } + + std::partial_sort(candidateIndex.begin(), candidateIndex.begin() + originalSize, end, [&scoresData](int32_t a, int32_t b) { + return scoresData[a] > scoresData[b]; + }); + + std::vector iouMatrix((originalSize * (originalSize - 1)) >> 1); + std::vector iouMax(originalSize); + + iouMax[0] = 0.; + InferenceEngine::parallel_for(originalSize - 1, [&](size_t i) { + float max_iou = 0.; + size_t actual_index = i + 1; + auto idx_a = candidateIndex[actual_index]; + for (int64_t j = 0; j < actual_index; j++) { + auto idx_b = candidateIndex[j]; + auto iou = intersectionOverUnion(boxesData + idx_a * 4, boxesData + idx_b * 4, m_normalized); + max_iou = std::max(max_iou, iou); + iouMatrix[actual_index * (actual_index - 1) / 2 + j] = iou; + } + iouMax[actual_index] = max_iou; + }); + + if (scoresData[candidateIndex[0]] > m_postThreshold) { + auto box_index = candidateIndex[0]; + auto box = boxesData + box_index * 4; + filterBoxes[0].box.x1 = box[0]; + filterBoxes[0].box.y1 = box[1]; + filterBoxes[0].box.x2 = box[2]; + filterBoxes[0].box.y2 = box[3]; + filterBoxes[0].index = batchIdx * m_numBoxes + box_index; + filterBoxes[0].score = scoresData[candidateIndex[0]]; + filterBoxes[0].batchIndex = batchIdx; + filterBoxes[0].classIndex = classIdx; + numDet++; + } + + for (int64_t i = 1; i < originalSize; i++) { + float minDecay = 1.; + for (int64_t j = 0; j < i; j++) { + auto maxIou = iouMax[j]; + auto iou = iouMatrix[i * (i - 1) / 2 + j]; + auto decay = m_decay_fn(iou, maxIou, m_gaussianSigma); + minDecay = std::min(minDecay, decay); + } + auto ds = minDecay * scoresData[candidateIndex[i]]; + if (ds <= m_postThreshold) + continue; + auto boxIndex = candidateIndex[i]; + auto box = boxesData + boxIndex * 4; + filterBoxes[numDet].box.x1 = box[0]; + filterBoxes[numDet].box.y1 = box[1]; + filterBoxes[numDet].box.x2 = box[2]; + filterBoxes[numDet].box.y2 = box[3]; + filterBoxes[numDet].index = batchIdx * m_numBoxes + boxIndex; + filterBoxes[numDet].score = ds; + filterBoxes[numDet].batchIndex = batchIdx; + filterBoxes[numDet].classIndex = classIdx; + numDet++; + } + return numDet; +} + +void MKLDNNMatrixNmsNode::execute(mkldnn::stream strm) { + const float* boxes = reinterpret_cast(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->GetPtr()); + const float* scores = reinterpret_cast(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->GetPtr()); + + InferenceEngine::parallel_for2d(m_numBatches, m_numClasses, [&](size_t batchIdx, size_t classIdx) { + if (classIdx == m_backgroundClass) { + m_numPerBatchClass[batchIdx][classIdx] = 0; + return; + } + const float* boxesPtr = boxes 
+ batchIdx * m_numBoxes * 4; + const float* scoresPtr = scores + batchIdx * (m_numClasses * m_numBoxes) + classIdx * m_numBoxes; + size_t classNumDet = 0; + size_t batchOffset = batchIdx * m_realNumClasses * m_realNumBoxes; + classNumDet = nmsMatrix(boxesPtr, scoresPtr, m_filteredBoxes.data() + batchOffset + m_classOffset[classIdx], batchIdx, classIdx); + m_numPerBatchClass[batchIdx][classIdx] = classNumDet; + }); + + InferenceEngine::parallel_for(m_numBatches, [&](size_t batchIdx) { + size_t batchOffset = batchIdx * m_realNumClasses * m_realNumBoxes; + BoxInfo* batchFilteredBox = m_filteredBoxes.data() + batchOffset; + auto& numPerClass = m_numPerBatchClass[batchIdx]; + auto numDet = std::accumulate(numPerClass.begin(), numPerClass.end(), 0); + auto start_offset = numPerClass[0]; + + for (size_t i = 1; i < numPerClass.size(); i++) { + auto offset_class = m_classOffset[i]; + for (size_t j = 0; j < numPerClass[i]; j++) { + batchFilteredBox[start_offset + j] = batchFilteredBox[offset_class + j]; + } + start_offset += numPerClass[i]; + } + auto keepNum = numDet; + if (m_keepTopk > -1) { + auto k = static_cast(m_keepTopk); + if (keepNum > k) + keepNum = k; + } + + std::partial_sort(batchFilteredBox, batchFilteredBox + keepNum, batchFilteredBox + numDet, [](const BoxInfo& lhs, const BoxInfo& rhs) { + return lhs.score > rhs.score || (lhs.score == rhs.score && lhs.classIndex < rhs.classIndex) || + (lhs.score == rhs.score && lhs.classIndex == rhs.classIndex && lhs.index < rhs.index); + }); + m_numPerBatch[batchIdx] = keepNum; + }); + + auto startOffset = m_numPerBatch[0]; + for (size_t i = 1; i < m_numPerBatch.size(); i++) { + auto offset_batch = i * m_realNumClasses * m_realNumBoxes; + for (size_t j = 0; j < m_numPerBatch[i]; j++) { + m_filteredBoxes[startOffset + j] = m_filteredBoxes[offset_batch + j]; + } + startOffset += m_numPerBatch[i]; + } + + if (m_sortResultAcrossBatch) { /* sort across batch */ + if (m_sortResultType == MatrixNmsSortResultType::SCORE) { + parallel_sort(m_filteredBoxes.begin(), m_filteredBoxes.begin() + startOffset, [](const BoxInfo& l, const BoxInfo& r) { + return (l.score > r.score) || (l.score == r.score && l.batchIndex < r.batchIndex) || + (l.score == r.score && l.batchIndex == r.batchIndex && l.classIndex < r.classIndex) || + (l.score == r.score && l.batchIndex == r.batchIndex && l.classIndex == r.classIndex && l.index < r.index); + }); + } else if (m_sortResultType == MatrixNmsSortResultType::CLASSID) { + parallel_sort(m_filteredBoxes.begin(), m_filteredBoxes.begin() + startOffset, [](const BoxInfo& l, const BoxInfo& r) { + return (l.classIndex < r.classIndex) || (l.classIndex == r.classIndex && l.batchIndex < r.batchIndex) || + (l.classIndex == r.classIndex && l.batchIndex == r.batchIndex && l.score > r.score) || + (l.classIndex == r.classIndex && l.batchIndex == r.batchIndex && l.score == r.score && l.index < r.index); + }); + } + } + + float* selectedOutputs = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTED_OUTPUTS)[0]->getMemoryPtr()->GetPtr()); + int* selectedIndices = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTED_INDICES)[0]->getMemoryPtr()->GetPtr()); + int* validOutputs = reinterpret_cast(getChildEdgesAtPort(NMS_VALID_OUTPUTS)[0]->getMemoryPtr()->GetPtr()); + std::copy(m_numPerBatch.begin(), m_numPerBatch.end(), validOutputs); + + int64_t outputOffset = 0; + int64_t originalOffset = 0; + for (size_t i = 0; i < m_numBatches; i++) { + auto real_boxes = m_numPerBatch[i]; + for (size_t j = 0; j < real_boxes; j++) { + auto originalIndex = originalOffset + j;
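+ // each 'selected_outputs' row packs 6 floats per kept box: + // [class_id, box_score, xmin, ymin, xmax, ymax] (the layout documented for NmsStaticShapeIE), + // while 'selected_indices' stores the flattened box index (batchIdx * m_numBoxes + boxIndex) + // that nmsMatrix() recorded in BoxInfo::index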
+ selectedIndices[j + outputOffset] = static_cast(m_filteredBoxes[originalIndex].index); + auto selectedBase = selectedOutputs + (outputOffset + j) * 6; + selectedBase[0] = m_filteredBoxes[originalIndex].classIndex; + selectedBase[1] = m_filteredBoxes[originalIndex].score; + selectedBase[2] = m_filteredBoxes[originalIndex].box.x1; + selectedBase[3] = m_filteredBoxes[originalIndex].box.y1; + selectedBase[4] = m_filteredBoxes[originalIndex].box.x2; + selectedBase[5] = m_filteredBoxes[originalIndex].box.y2; + } + std::fill_n(selectedOutputs + (outputOffset + real_boxes) * 6, (m_maxBoxesPerBatch - real_boxes) * 6, -1); + std::fill_n(selectedIndices + (outputOffset + real_boxes), m_maxBoxesPerBatch - real_boxes, -1); + outputOffset += m_maxBoxesPerBatch; + originalOffset += real_boxes; + } +} + +void MKLDNNMatrixNmsNode::checkPrecision(const Precision prec, const std::vector precList, const std::string name, const std::string type) { + if (std::find(precList.begin(), precList.end(), prec) == precList.end()) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNMatrixNmsNode, MatrixNms); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.h new file mode 100644 index 00000000000000..5d85a3669529d3 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_matrix_nms_node.h @@ -0,0 +1,100 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include +#include +#include + +namespace MKLDNNPlugin { + +enum MatrixNmsSortResultType { + CLASSID, // sort selected boxes by class id (ascending) in each batch element + SCORE, // sort selected boxes by score (descending) in each batch element + NONE // do not guarantee the order in each batch element +}; + +enum MatrixNmsDecayFunction { GAUSSIAN, LINEAR }; + +class MKLDNNMatrixNmsNode : public MKLDNNNode { +public: + MKLDNNMatrixNmsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr& cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // input + static const size_t NMS_BOXES = 0; + static const size_t NMS_SCORES = 1; + + // output + static const size_t NMS_SELECTED_OUTPUTS = 0; + static const size_t NMS_SELECTED_INDICES = 1; + static const size_t NMS_VALID_OUTPUTS = 2; + + size_t m_numBatches; + size_t m_numBoxes; + size_t m_numClasses; + size_t m_maxBoxesPerBatch; + + MatrixNmsSortResultType m_sortResultType; + bool m_sortResultAcrossBatch; + float m_scoreThreshold; + int m_nmsTopk; + int m_keepTopk; + int m_backgroundClass; + MatrixNmsDecayFunction m_decayFunction; + float m_gaussianSigma; + float m_postThreshold; + bool m_normalized; + + struct Rectangle { + Rectangle(float x_left, float y_left, float x_right, float y_right) : x1 {x_left}, y1 {y_left}, x2 {x_right}, y2 {y_right} {} + + Rectangle() = default; + + float x1 = 0.0f; + float y1 = 0.0f; + float x2 = 0.0f; + float y2 = 0.0f; + }; + + struct BoxInfo { + BoxInfo(const Rectangle& r, int64_t idx, float sc, int64_t batch_idx, int64_t class_idx) + : box {r}, index {idx}, batchIndex {batch_idx}, classIndex {class_idx}, score {sc} 
{} + + BoxInfo() = default; + + Rectangle box; + int64_t index = -1; + int64_t batchIndex = -1; + int64_t classIndex = -1; + float score = 0.0f; + }; + std::string errorPrefix; + const std::string inType = "input", outType = "output"; + std::vector m_numPerBatch; + std::vector> m_numPerBatchClass; + std::vector m_filteredBoxes; + std::vector m_classOffset; + size_t m_realNumClasses; + size_t m_realNumBoxes; + float (*m_decay_fn)(float, float, float); + void checkPrecision(const InferenceEngine::Precision prec, const std::vector precList, const std::string name, + const std::string type); + + size_t nmsMatrix(const float* boxesData, const float* scoresData, BoxInfo* filterBoxes, const int64_t batchIdx, const int64_t classIdx); +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.cpp new file mode 100644 index 00000000000000..1ea109f5fdb1e5 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.cpp @@ -0,0 +1,414 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_multiclass_nms.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "base.hpp" +#include "ie_parallel.hpp" +#include "utils/general_utils.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +using ngNmsSortResultType = ngraph::op::util::NmsBase::SortResultType; +using MulticlassNmsIEInternal = ngraph::op::internal::NmsStaticShapeIE; + +bool MKLDNNMultiClassNmsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto nms = std::dynamic_pointer_cast(op); + if (!nms) { + errorMessage = "Only internal MultiClassNonMaxSuppression operation is supported"; + return false; + } + const auto& attrs = nms->get_attrs(); + const auto& sortType = attrs.sort_result_type; + if (!one_of(sortType, ngNmsSortResultType::NONE, ngNmsSortResultType::SCORE, ngNmsSortResultType::CLASSID)) { + errorMessage = "Does not support SortResultType mode: " + ngraph::as_string(sortType); + return false; + } + } catch (...)
{ + return false; + } + return true; +} + +MKLDNNMultiClassNmsNode::MKLDNNMultiClassNmsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr& cache) + : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + errorPrefix = "MultiClassNms layer with name '" + getName() + "' "; + const auto nms = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2) + IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber(); + + if (getOriginalOutputsNumber() != 3) + IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber(); + + auto& attrs = nms->get_attrs(); + sort_result_across_batch = attrs.sort_result_across_batch; + max_output_boxes_per_class = attrs.nms_top_k; + iou_threshold = attrs.iou_threshold; + score_threshold = attrs.score_threshold; + background_class = attrs.background_class; + keep_top_k = attrs.keep_top_k; + if (attrs.sort_result_type == ngNmsSortResultType::CLASSID) + sort_result_type = MulticlassNmsSortResultType::CLASSID; + else if (attrs.sort_result_type == ngNmsSortResultType::SCORE) + sort_result_type = MulticlassNmsSortResultType::SCORE; + else if (attrs.sort_result_type == ngNmsSortResultType::NONE) + sort_result_type = MulticlassNmsSortResultType::NONE; + nms_eta = attrs.nms_eta; + normalized = attrs.normalized; + + const SizeVector& boxes_dims = inDims[NMS_BOXES].ToSizeVector(); + if (boxes_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); + if (boxes_dims[2] != 4) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + + const SizeVector& scores_dims = inDims[NMS_SCORES].ToSizeVector(); + if (scores_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); + + if (boxes_dims[0] != scores_dims[0]) + IE_THROW() << errorPrefix << " num_batches is different in 'boxes' and 'scores' inputs"; + if (boxes_dims[1] != scores_dims[2]) + IE_THROW() << errorPrefix << " num_boxes is different in 'boxes' and 'scores' inputs"; + + const SizeVector& valid_outputs_dims = outDims[NMS_SELECTEDNUM].ToSizeVector(); + if (valid_outputs_dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_dims.size(); + if (valid_outputs_dims[0] != boxes_dims[0]) // valid_outputs_dims[0] != num_batches + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_dims[0]; +} + +void MKLDNNMultiClassNmsNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + const SizeVector& boxes_dims = inDims[NMS_BOXES].ToSizeVector(); + num_batches = boxes_dims[0]; + num_boxes = boxes_dims[1]; + const SizeVector& scores_dims = inDims[NMS_SCORES].ToSizeVector(); + num_classes = scores_dims[1]; + numFiltBox.resize(num_batches, std::vector(num_classes)); // batches + numBoxOffset.resize(num_batches); + + if (max_output_boxes_per_class) { + max_output_boxes_per_class = (max_output_boxes_per_class == -1) ?
num_boxes : max_output_boxes_per_class; + filtBoxes.resize(max_output_boxes_per_class * num_batches * num_classes); + } + + const std::vector supportedFloatPrecision = {Precision::FP32, Precision::BF16}; + const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType); + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType); + + checkPrecision(getOriginalOutputPrecisionAtPort(NMS_SELECTEDINDICES), supportedIntOutputPrecision, "selected_indices", outType); + checkPrecision(getOriginalOutputPrecisionAtPort(NMS_SELECTEDOUTPUTS), supportedFloatPrecision, "selected_outputs", outType); + checkPrecision(getOriginalOutputPrecisionAtPort(NMS_SELECTEDNUM), supportedIntOutputPrecision, "selected_num", outType); + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}}, + impl_desc_type::ref_any); +} + +void MKLDNNMultiClassNmsNode::execute(mkldnn::stream strm) { + const float* boxes = reinterpret_cast(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->GetPtr()); + const float* scores = reinterpret_cast(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->GetPtr()); + + auto dims_boxes = getParentEdgeAt(NMS_BOXES)->getDesc().getDims(); + + if (max_output_boxes_per_class == 0) + return; + + int* selected_indices = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr()->GetPtr()); + + float* selected_outputs = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDOUTPUTS)[0]->getMemoryPtr()->GetPtr()); + + int* selected_num = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDNUM)[0]->getMemoryPtr()->GetPtr()); + + auto boxesStrides = getParentEdgeAt(NMS_BOXES)->getDesc().getBlockingDesc().getStrides(); + auto scoresStrides = getParentEdgeAt(NMS_SCORES)->getDesc().getBlockingDesc().getStrides(); + + if ((nms_eta >= 0) && (nms_eta < 1)) { + nmsWithEta(boxes, scores, boxesStrides, scoresStrides); + } else { + nmsWithoutEta(boxes, scores, boxesStrides, scoresStrides); + } + + size_t startOffset = numFiltBox[0][0]; + numBoxOffset[0] = 0; + for (size_t b = 0; b < numFiltBox.size(); b++) { + size_t batchOffsetNew = 0; + size_t batchOffset = b * num_classes * max_output_boxes_per_class; + for (size_t c = (b == 0 ?
1 : 0); c < numFiltBox[b].size(); c++) { + size_t offset = batchOffset + c * max_output_boxes_per_class; + for (size_t i = 0; i < numFiltBox[b][c]; i++) { + filtBoxes[startOffset + i] = filtBoxes[offset + i]; + } + startOffset += numFiltBox[b][c]; + batchOffsetNew += numFiltBox[b][c]; + } + numBoxOffset[b] = batchOffsetNew; + if (b == 0) + numBoxOffset[b] += numFiltBox[0][0]; + } + // sort elements before applying keep_top_k + parallel_sort(filtBoxes.begin(), filtBoxes.begin() + startOffset, [](const filteredBoxes& l, const filteredBoxes& r) { + return ((l.batch_index < r.batch_index) || + ((l.batch_index == r.batch_index) && ((l.score > r.score) || ((std::fabs(l.score - r.score) < 1e-6) && l.class_index < r.class_index) || + ((std::fabs(l.score - r.score) < 1e-6) && l.class_index == r.class_index && l.box_index < r.box_index)))); + }); + + if (keep_top_k > -1) { + startOffset = 0; + size_t offset = 0; + for (size_t b = 0; b < numFiltBox.size(); b++) { + if (numBoxOffset[b] > keep_top_k) { + if (startOffset == offset) { + startOffset += keep_top_k; + offset += numBoxOffset[b]; + } else { + for (size_t i = 0; i < keep_top_k; i++) { + filtBoxes[startOffset + i] = filtBoxes[offset + i]; + } + startOffset += keep_top_k; + offset += numBoxOffset[b]; + } + } else { + if (startOffset == offset) { + startOffset += numBoxOffset[b]; + offset += numBoxOffset[b]; + } else { + for (size_t i = 0; i < numBoxOffset[b]; i++) { + filtBoxes[startOffset + i] = filtBoxes[offset + i]; + } + startOffset += numBoxOffset[b]; + offset += numBoxOffset[b]; + } + } + } + } + + if (sort_result_across_batch) { + if (sort_result_type == SCORE) { + parallel_sort(filtBoxes.begin(), filtBoxes.begin() + startOffset, [](const filteredBoxes& l, const filteredBoxes& r) { + return (l.score > r.score) || (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); + }); + } else if (sort_result_type == CLASSID) { + parallel_sort(filtBoxes.begin(), filtBoxes.begin() + startOffset, [](const filteredBoxes& l, const filteredBoxes& r) { + return (l.class_index < r.class_index) || (l.class_index == r.class_index && l.batch_index < r.batch_index) || + (l.class_index == r.class_index && l.batch_index == r.batch_index && l.score > r.score) || + (l.class_index == r.class_index && l.batch_index == r.batch_index && l.score == r.score && l.box_index < r.box_index); + }); + } + } else if (sort_result_type == CLASSID) { + parallel_sort(filtBoxes.begin(), filtBoxes.begin() + startOffset, [](const filteredBoxes& l, const filteredBoxes& r) { + return ((l.batch_index < r.batch_index) || + ((l.batch_index == r.batch_index) && + ((l.class_index < r.class_index) || ((l.class_index == r.class_index) && l.score > r.score) || + ((std::fabs(l.score - r.score) <= 1e-6) && l.class_index == r.class_index && l.box_index < r.box_index)))); + }); + } + + const size_t selectedBoxesNum = getChildEdgeAt(NMS_SELECTEDINDICES)->getDesc().getDims()[0]; + const size_t validOutputs = std::min(startOffset, selectedBoxesNum); + + std::vector m_selected_num; + m_selected_num.resize(dims_boxes[0]); + + const size_t selectedBoxesNum_perBatch = selectedBoxesNum / dims_boxes[0]; + + for (size_t idx = 0lu; idx < validOutputs; idx++) { + m_selected_num[filtBoxes[idx].batch_index]++; + } + + int64_t output_offset = 0; + int64_t original_offset = 0; + for (size_t i = 0; i <
dims_boxes[0]; i++) { + auto real_boxes = m_selected_num[i]; + selected_num[i] = static_cast(real_boxes); + + for (size_t j = 0; j < real_boxes; j++) { + auto original_index = original_offset + j; + selected_indices[j + output_offset] = filtBoxes[original_index].batch_index * dims_boxes[1] + filtBoxes[original_index].box_index; + auto selected_base = selected_outputs + (output_offset + j) * 6; + selected_base[0] = filtBoxes[original_index].class_index; + selected_base[1] = filtBoxes[original_index].score; + selected_base[2] = boxes[selected_indices[j + output_offset] * 4]; + selected_base[3] = boxes[selected_indices[j + output_offset] * 4 + 1]; + selected_base[4] = boxes[selected_indices[j + output_offset] * 4 + 2]; + selected_base[5] = boxes[selected_indices[j + output_offset] * 4 + 3]; + } + std::fill_n(selected_outputs + (output_offset + real_boxes) * 6, (selectedBoxesNum_perBatch - real_boxes) * 6, -1); + std::fill_n(selected_indices + (output_offset + real_boxes), selectedBoxesNum_perBatch - real_boxes, -1); + output_offset += selectedBoxesNum_perBatch; + original_offset += real_boxes; + } +} + +bool MKLDNNMultiClassNmsNode::created() const { + return getType() == MulticlassNms; +} + +float MKLDNNMultiClassNmsNode::intersectionOverUnion(const float* boxesI, const float* boxesJ, const bool normalized) { + float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ; + const float norm = static_cast(normalized == false); + + // to align with reference + yminI = boxesI[0]; + xminI = boxesI[1]; + ymaxI = boxesI[2]; + xmaxI = boxesI[3]; + yminJ = boxesJ[0]; + xminJ = boxesJ[1]; + ymaxJ = boxesJ[2]; + xmaxJ = boxesJ[3]; + + float areaI = (ymaxI - yminI + norm) * (xmaxI - xminI + norm); + float areaJ = (ymaxJ - yminJ + norm) * (xmaxJ - xminJ + norm); + if (areaI <= 0.f || areaJ <= 0.f) + return 0.f; + + float intersection_area = (std::max)((std::min)(ymaxI, ymaxJ) - (std::max)(yminI, yminJ) + norm, 0.f) * + (std::max)((std::min)(xmaxI, xmaxJ) - (std::max)(xminI, xminJ) + norm, 0.f); + return intersection_area / (areaI + areaJ - intersection_area); +} + +void MKLDNNMultiClassNmsNode::nmsWithEta(const float* boxes, const float* scores, const SizeVector& boxesStrides, const SizeVector& scoresStrides) { + auto less = [](const boxInfo& l, const boxInfo& r) { + return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); + }; + + auto func = [](float iou, float adaptive_threshold) { + return iou <= adaptive_threshold ? 1.0f : 0.0f; + }; + + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + if (class_idx != background_class) { + std::vector fb; + const float* boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float* scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::priority_queue, decltype(less)> sorted_boxes(less); + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] >= score_threshold) // align with ref + sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); + } + fb.reserve(sorted_boxes.size()); + if (sorted_boxes.size() > 0) { + auto adaptive_threshold = iou_threshold; + int max_out_box = (max_output_boxes_per_class > sorted_boxes.size()) ?
sorted_boxes.size() : max_output_boxes_per_class; + while (max_out_box && !sorted_boxes.empty()) { + boxInfo currBox = sorted_boxes.top(); + float origScore = currBox.score; + sorted_boxes.pop(); + max_out_box--; + + bool box_is_selected = true; + for (int idx = static_cast(fb.size()) - 1; idx >= currBox.suppress_begin_index; idx--) { + float iou = intersectionOverUnion(&boxesPtr[currBox.idx * 4], &boxesPtr[fb[idx].box_index * 4], normalized); + currBox.score *= func(iou, adaptive_threshold); + if (iou >= adaptive_threshold) { + box_is_selected = false; + break; + } + if (currBox.score <= score_threshold) + break; + } + + currBox.suppress_begin_index = fb.size(); + if (box_is_selected) { + if (nms_eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= nms_eta; + } + if (currBox.score == origScore) { + fb.push_back({currBox.score, batch_idx, class_idx, currBox.idx}); + continue; + } + if (currBox.score > score_threshold) { + sorted_boxes.push(currBox); + } + } + } + } + numFiltBox[batch_idx][class_idx] = fb.size(); + size_t offset = batch_idx * num_classes * max_output_boxes_per_class + class_idx * max_output_boxes_per_class; + for (size_t i = 0; i < fb.size(); i++) { + filtBoxes[offset + i] = fb[i]; + } + } + }); +} + +void MKLDNNMultiClassNmsNode::nmsWithoutEta(const float* boxes, const float* scores, const SizeVector& boxesStrides, const SizeVector& scoresStrides) { + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + if (class_idx != background_class) { + const float* boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float* scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::vector> sorted_boxes; + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] >= score_threshold) // align with ref + sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); + } + + int io_selection_size = 0; + if (sorted_boxes.size() > 0) { + parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + int offset = batch_idx * num_classes * max_output_boxes_per_class + class_idx * max_output_boxes_per_class; + filtBoxes[offset + 0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); + io_selection_size++; + int max_out_box = (max_output_boxes_per_class > sorted_boxes.size()) ?
sorted_boxes.size() : max_output_boxes_per_class; + for (size_t box_idx = 1; box_idx < max_out_box; box_idx++) { + bool box_is_selected = true; + for (int idx = io_selection_size - 1; idx >= 0; idx--) { + float iou = + intersectionOverUnion(&boxesPtr[sorted_boxes[box_idx].second * 4], &boxesPtr[filtBoxes[offset + idx].box_index * 4], normalized); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + } + + if (box_is_selected) { + filtBoxes[offset + io_selection_size] = filteredBoxes(sorted_boxes[box_idx].first, batch_idx, class_idx, sorted_boxes[box_idx].second); + io_selection_size++; + } + } + } + numFiltBox[batch_idx][class_idx] = io_selection_size; + } + }); +} + +void MKLDNNMultiClassNmsNode::checkPrecision(const Precision prec, const std::vector precList, const std::string name, const std::string type) { + if (std::find(precList.begin(), precList.end(), prec) == precList.end()) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNMultiClassNmsNode, MulticlassNms) \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.hpp new file mode 100644 index 00000000000000..0627f72cea0df8 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_multiclass_nms.hpp @@ -0,0 +1,93 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include + +namespace MKLDNNPlugin { + +enum MulticlassNmsSortResultType { + CLASSID, // sort selected boxes by class id (ascending) in each batch element + SCORE, // sort selected boxes by score (descending) in each batch element + NONE // do not guarantee the order in each batch element +}; + +class MKLDNNMultiClassNmsNode : public MKLDNNNode { +public: + MKLDNNMultiClassNmsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr& cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // input (port Num) + const size_t NMS_BOXES = 0; + const size_t NMS_SCORES = 1; + + // output (port Num) + const size_t NMS_SELECTEDOUTPUTS = 0; + const size_t NMS_SELECTEDINDICES = 1; + const size_t NMS_SELECTEDNUM = 2; + + bool sort_result_across_batch = false; + MulticlassNmsSortResultType sort_result_type = NONE; + + size_t num_batches; + size_t num_boxes; + size_t num_classes; + + int max_output_boxes_per_class = 0; + float iou_threshold = 0.0f; + float score_threshold = 0.0f; + + int32_t background_class = 0; + int32_t keep_top_k = 0; + float nms_eta = 0.0f; + bool normalized = true; + + std::string errorPrefix; + + std::vector> numFiltBox; + std::vector numBoxOffset; + const std::string inType = "input", outType = "output"; + + struct filteredBoxes { + float score; + int batch_index; + int class_index; + int box_index; + filteredBoxes() = default; + filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) + : score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {} + }; + + struct boxInfo { + float score; + int idx; + int suppress_begin_index; + }; + + std::vector filtBoxes; + + void checkPrecision(const 
InferenceEngine::Precision prec, const std::vector precList, const std::string name, + const std::string type); + + float intersectionOverUnion(const float* boxesI, const float* boxesJ, const bool normalized); + + void nmsWithEta(const float* boxes, const float* scores, const InferenceEngine::SizeVector& boxesStrides, const InferenceEngine::SizeVector& scoresStrides); + + void nmsWithoutEta(const float* boxes, const float* scores, const InferenceEngine::SizeVector& boxesStrides, + const InferenceEngine::SizeVector& scoresStrides); +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/transformations/include/ngraph_ops/nms_static_shape_ie.hpp b/inference-engine/src/transformations/include/ngraph_ops/nms_static_shape_ie.hpp new file mode 100644 index 00000000000000..3bed4a37e6adb7 --- /dev/null +++ b/inference-engine/src/transformations/include/ngraph_ops/nms_static_shape_ie.hpp @@ -0,0 +1,114 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "ngraph/op/op.hpp" + +namespace ngraph { +namespace op { +namespace internal { + +template +class NmsStaticShapeIE : public BaseNmsOp { +public: + NGRAPH_RTTI_DECLARATION; + + using Attributes = typename BaseNmsOp::Attributes; + + /// \brief Constructs a NmsStaticShapeIE operation + /// + /// \param boxes Node producing the box coordinates + /// \param scores Node producing the box scores + /// \param attrs Attributes of the operation + NmsStaticShapeIE(const Output& boxes, + const Output& scores, + const Attributes& attrs) : BaseNmsOp(boxes, scores, attrs) { + this->constructor_validate_and_infer_types(); + } + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override { + return std::make_shared(new_args.at(0), new_args.at(1), this->m_attrs); + } +}; + +template +void NmsStaticShapeIE::validate_and_infer_types() { + const auto boxes_ps = this->get_input_partial_shape(0); + const auto scores_ps = this->get_input_partial_shape(1); + + auto first_dim_shape = Dimension::dynamic(); + + if (boxes_ps.rank().is_static() && scores_ps.rank().is_static()) { + const auto num_boxes_boxes = boxes_ps[1]; + if (num_boxes_boxes.is_static() && scores_ps[0].is_static() && scores_ps[1].is_static()) { + const auto num_boxes = num_boxes_boxes.get_length(); + auto num_classes = scores_ps[1].get_length(); + if (this->m_attrs.background_class >= 0 && this->m_attrs.background_class <= num_classes) { + num_classes = num_classes - 1; + } + int64_t max_output_boxes_per_class = 0; + if (this->m_attrs.nms_top_k >= 0) + max_output_boxes_per_class = std::min(num_boxes, static_cast(this->m_attrs.nms_top_k)); + else + max_output_boxes_per_class = num_boxes; + + auto max_output_boxes_per_batch = max_output_boxes_per_class * num_classes; + if (this->m_keep_top_k >= 0) + max_output_boxes_per_batch = + std::min(max_output_boxes_per_batch, static_cast(this->m_attrs.keep_top_k)); + + first_dim_shape = max_output_boxes_per_batch * scores_ps[0].get_length(); + } + } + + // 'selected_outputs' has the following format: + // [number of selected boxes, [class_id, box_score, xmin, ymin, xmax, ymax]] + this->set_output_type(0, element::f32, {first_dim_shape, 6}); + // 'selected_indices' has the following format: + // [number of selected boxes, 1] + this->set_output_type(1, this->m_attrs.output_type, {first_dim_shape, 1}); + // 'selected_num' has the following format: + // [num_batches, ] + if
(boxes_ps.rank().is_static() && boxes_ps.rank().get_length() > 0) { + this->set_output_type(2, this->m_attrs.output_type, {boxes_ps[0]}); + } else { + this->set_output_type(2, this->m_attrs.output_type, {Dimension::dynamic()}); + } +} + +template +const ::ngraph::Node::type_info_t& NmsStaticShapeIE::get_type_info() const { return get_type_info_static(); } + +template +const ::ngraph::Node::type_info_t& NmsStaticShapeIE::get_type_info_static() { + auto BaseNmsOpTypeInfoPtr = &BaseNmsOp::get_type_info_static(); + + // TODO: it should be static const std::string name = std::string("NmsStaticShapeIE_") + BaseNmsOpTypeInfoPtr->name; + // but currently it will not pass conversion to Legacy Opset correctly + static const std::string name = BaseNmsOpTypeInfoPtr->name; + + static const ::ngraph::Node::type_info_t type_info_static{ + name.c_str(), BaseNmsOpTypeInfoPtr->version, BaseNmsOpTypeInfoPtr}; + return type_info_static; +} + +template +const ::ngraph::Node::type_info_t NmsStaticShapeIE::type_info = NmsStaticShapeIE::get_type_info_static(); + +#ifdef __clang__ +extern template class TRANSFORMATIONS_API op::internal::NmsStaticShapeIE; +extern template class TRANSFORMATIONS_API op::internal::NmsStaticShapeIE; +#endif // __clang__ + +} // namespace internal +} // namespace op +} // namespace ngraph diff --git a/inference-engine/src/transformations/include/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp b/inference-engine/src/transformations/include/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp new file mode 100644 index 00000000000000..080a08683222d9 --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include +#include + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API ConvertMatrixNmsToMatrixNmsIE; + +} // namespace pass +} // namespace ngraph + +class ngraph::pass::ConvertMatrixNmsToMatrixNmsIE: public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatrixNmsToMatrixNmsIE(); +}; diff --git a/inference-engine/src/transformations/include/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp b/inference-engine/src/transformations/include/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp new file mode 100644 index 00000000000000..b639364b24e978 --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp @@ -0,0 +1,26 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include +#include + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API ConvertMulticlassNmsToMulticlassNmsIE; + +} // namespace pass +} // namespace ngraph + +class ngraph::pass::ConvertMulticlassNmsToMulticlassNmsIE: public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMulticlassNmsToMulticlassNmsIE(); +}; diff --git a/inference-engine/src/transformations/src/ngraph_ops/nms_static_shape_ie.cpp b/inference-engine/src/transformations/src/ngraph_ops/nms_static_shape_ie.cpp new file mode 100644 index 00000000000000..8f173eafcae271 --- /dev/null +++ b/inference-engine/src/transformations/src/ngraph_ops/nms_static_shape_ie.cpp @@ -0,0 +1,19 @@ +//
Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "ngraph/ops.hpp" +#include "ngraph_ops/nms_static_shape_ie.hpp" + +namespace ngraph { +namespace op { +namespace internal { + +template class TRANSFORMATIONS_API op::internal::NmsStaticShapeIE; +template class TRANSFORMATIONS_API op::internal::NmsStaticShapeIE; + +} // namespace internal +} // namespace op +} // namespace ngraph diff --git a/inference-engine/src/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp b/inference-engine/src/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp new file mode 100644 index 00000000000000..34163fc48601d7 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp @@ -0,0 +1,66 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "itt.hpp" +#include +#include + +#include +#include +#include + +#include +#include + +#include "ngraph_ops/nms_static_shape_ie.hpp" +#include "transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp" + +NGRAPH_RTTI_DEFINITION(ngraph::pass::ConvertMatrixNmsToMatrixNmsIE, "ConvertMatrixNmsToMatrixNmsIE", 0); + +ngraph::pass::ConvertMatrixNmsToMatrixNmsIE::ConvertMatrixNmsToMatrixNmsIE() { + MATCHER_SCOPE(ConvertMatrixNmsToMatrixNmsIE); + auto nms = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto nms = std::dynamic_pointer_cast(m.get_match_root()); + if (!nms) { + return false; + } + + const auto new_args = nms->input_values(); + // vector of new nGraph operations + NodeVector new_ops; + auto attrs = nms->get_attrs(); + attrs.output_type = element::i32; + auto nms_new = std::make_shared>( + new_args.at(0), + new_args.at(1), + attrs); + new_ops.emplace_back(nms_new); + + Output output_0 = nms_new->output(0); + Output output_1 = nms_new->output(1); + Output output_2 = nms_new->output(2); + + if (nms->output(1).get_element_type() != output_1.get_element_type()) { + output_1 = std::make_shared(output_1, nms->output(1).get_element_type()); + output_1.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.1"); + new_ops.emplace_back(output_1.get_node_shared_ptr()); + } + + if (nms->output(2).get_element_type() != output_2.get_element_type()) { + output_2 = std::make_shared(output_2, nms->output(2).get_element_type()); + output_2.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.2"); + new_ops.emplace_back(output_2.get_node_shared_ptr()); + } + + nms_new->set_friendly_name(nms->get_friendly_name()); + ngraph::copy_runtime_info(nms, new_ops); + ngraph::replace_node(nms, {output_0, output_1, output_2}); + return true; + }; + + auto m = std::make_shared(nms, matcher_name); + this->register_matcher(m, callback); +} diff --git a/inference-engine/src/transformations/src/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.cpp b/inference-engine/src/transformations/src/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.cpp new file mode 100644 index 00000000000000..1f236610e53ed7 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "itt.hpp" +#include +#include + +#include 
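+// Like the MatrixNms pass above, this matcher replaces opset8::MulticlassNms
+// with the internal NmsStaticShapeIE wrapper so output shapes become static;
+// indices are produced as i32 and converted back when the original op used i64.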
+#include +#include + +#include +#include + +#include "ngraph_ops/nms_static_shape_ie.hpp" +#include "transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp" + +NGRAPH_RTTI_DEFINITION(ngraph::pass::ConvertMulticlassNmsToMulticlassNmsIE, "ConvertMulticlassNmsToMulticlassNmsIE", 0); + +ngraph::pass::ConvertMulticlassNmsToMulticlassNmsIE::ConvertMulticlassNmsToMulticlassNmsIE() { + MATCHER_SCOPE(ConvertMulticlassNmsToMulticlassNmsIE); + auto nms = ngraph::pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher &m) { + auto nms = std::dynamic_pointer_cast(m.get_match_root()); + if (!nms) { + return false; + } + + const auto new_args = nms->input_values(); + // vector of new nGraph operations + NodeVector new_ops; + auto attrs = nms->get_attrs(); + attrs.output_type = element::i32; + + auto nms_new = std::make_shared>( + new_args.at(0), + new_args.at(1), + attrs); + new_ops.emplace_back(nms_new); + + Output output_0 = nms_new->output(0); + Output output_1 = nms_new->output(1); + Output output_2 = nms_new->output(2); + + if (nms->output(1).get_element_type() != output_1.get_element_type()) { + output_1 = std::make_shared(output_1, nms->output(1).get_element_type()); + output_1.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.1"); + new_ops.emplace_back(output_1.get_node_shared_ptr()); + } + + if (nms->output(2).get_element_type() != output_2.get_element_type()) { + output_2 = std::make_shared(output_2, nms->output(2).get_element_type()); + output_2.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.2"); + new_ops.emplace_back(output_2.get_node_shared_ptr()); + } + + nms_new->set_friendly_name(nms->get_friendly_name()); + ngraph::copy_runtime_info(nms, new_ops); + ngraph::replace_node(nms, {output_0, output_1, output_2}); + return true; + }; + + auto m = std::make_shared(nms, matcher_name); + this->register_matcher(m, callback); +} diff --git a/inference-engine/tests/functional/inference_engine/serialization/single_layer/matrix_nms.cpp b/inference-engine/tests/functional/inference_engine/serialization/single_layer/matrix_nms.cpp new file mode 100644 index 00000000000000..750b483bd29414 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/serialization/single_layer/matrix_nms.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "shared_test_classes/single_layer/matrix_nms.hpp" + +using namespace ngraph; +using namespace LayerTestsDefinitions; + +namespace { + TEST_P(MatrixNmsLayerTest, Serialize) { + Serialize(); + } + + const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 + }; + + const std::vector inShapeParams = { + InputShapeParams{3, 100, 5}, + InputShapeParams{1, 10, 50}, + InputShapeParams{2, 50, 50} + }; + + const std::vector sortResultType = {op::v8::MatrixNms::SortResultType::CLASSID, + op::v8::MatrixNms::SortResultType::SCORE, + op::v8::MatrixNms::SortResultType::NONE}; + const std::vector outType = {element::i32, element::i64}; + const std::vector topKParams = { + TopKParams{-1, 5}, + TopKParams{100, -1} + }; + const std::vector thresholdParams = { + ThresholdParams{0.0f, 2.0f, 0.0f}, + ThresholdParams{0.1f, 1.5f, 0.2f} + }; + const std::vector nmsTopK = {-1, 100}; + const std::vector keepTopK = {-1, 5}; + const std::vector backgroundClass = {-1, 0}; + const std::vector normalized = {true, false}; + const std::vector decayFunction
= {op::v8::MatrixNms::DecayFunction::GAUSSIAN, + op::v8::MatrixNms::DecayFunction::LINEAR}; + const auto nmsParams = ::testing::Combine(::testing::ValuesIn(inShapeParams), + ::testing::Combine(::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::I32), + ::testing::Values(InferenceEngine::Precision::FP32)), + ::testing::ValuesIn(sortResultType), + ::testing::ValuesIn(outType), + ::testing::ValuesIn(topKParams), + ::testing::ValuesIn(thresholdParams), + ::testing::ValuesIn(backgroundClass), + ::testing::ValuesIn(normalized), + ::testing::ValuesIn(decayFunction), + ::testing::Values(CommonTestUtils::DEVICE_CPU)); + + INSTANTIATE_TEST_CASE_P(smoke_MatrixNmsLayerTest, MatrixNmsLayerTest, nmsParams, MatrixNmsLayerTest::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/inference_engine/serialization/single_layer/multiclass_nms.cpp b/inference-engine/tests/functional/inference_engine/serialization/single_layer/multiclass_nms.cpp new file mode 100644 index 00000000000000..203b20c4ab4cf9 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/serialization/single_layer/multiclass_nms.cpp @@ -0,0 +1,60 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "shared_test_classes/single_layer/multiclass_nms.hpp" + +using namespace ngraph; +using namespace LayerTestsDefinitions; + +namespace { +TEST_P(MulticlassNmsLayerTest, Serialize) { + Serialize(); +} + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16}; + +const std::vector inShapeParams = { + InputShapeParams{3, 100, 5}, InputShapeParams{1, 10, 50}, + InputShapeParams{2, 50, 50}}; + +const std::vector nmsTopK = {-1, 20}; +const std::vector iouThreshold = {0.7f}; +const std::vector scoreThreshold = {0.7f}; +const std::vector backgroundClass = {-1, 0}; +const std::vector keepTopK = {-1, 30}; +const std::vector outType = {element::i32, element::i64}; + +const std::vector sortResultType = { + op::v8::MulticlassNms::SortResultType::SCORE, + op::v8::MulticlassNms::SortResultType::CLASSID, + op::v8::MulticlassNms::SortResultType::NONE}; +const std::vector sortResDesc = {true, false}; +const std::vector nmsEta = {0.6f, 1.0f}; +const std::vector normalized = {true, false}; + +const auto nmsParams = ::testing::Combine( + ::testing::ValuesIn(inShapeParams), + ::testing::Combine(::testing::Values(InferenceEngine::Precision::FP32), + ::testing::Values(InferenceEngine::Precision::I32), + ::testing::Values(InferenceEngine::Precision::FP32)), + ::testing::ValuesIn(nmsTopK), + ::testing::Combine(::testing::ValuesIn(iouThreshold), + ::testing::ValuesIn(scoreThreshold), + ::testing::ValuesIn(nmsEta)), + ::testing::ValuesIn(backgroundClass), + ::testing::ValuesIn(keepTopK), + ::testing::ValuesIn(outType), + ::testing::ValuesIn(sortResultType), + ::testing::Combine(::testing::ValuesIn(sortResDesc), + ::testing::ValuesIn(normalized)), + ::testing::Values(CommonTestUtils::DEVICE_CPU)); + +INSTANTIATE_TEST_CASE_P(smoke_MulticlassNmsLayerTest, + MulticlassNmsLayerTest, + nmsParams, + MulticlassNmsLayerTest::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/inference_engine/skip_tests_config.cpp b/inference-engine/tests/functional/inference_engine/skip_tests_config.cpp index 75fb7d791899ed..aff04cee6e5eab 100644 --- a/inference-engine/tests/functional/inference_engine/skip_tests_config.cpp +++ 
b/inference-engine/tests/functional/inference_engine/skip_tests_config.cpp @@ -15,6 +15,6 @@ std::vector disabledTestPatterns() { // TODO: task 32568, enable after supporting constants outputs in plugins ".*TransformationTests\\.ConstFoldingPriorBox.*", // azure is failing after #6199 - ".*NmsLayerTest.*", + ".*/NmsLayerTest.*", }; } diff --git a/inference-engine/tests/functional/inference_engine/transformations/convert_matrix_nms_to_matrix_nms_ie_internal.cpp b/inference-engine/tests/functional/inference_engine/transformations/convert_matrix_nms_to_matrix_nms_ie_internal.cpp new file mode 100644 index 00000000000000..afd4cd26a5b348 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/transformations/convert_matrix_nms_to_matrix_nms_ie_internal.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST(TransformationTests, ConvertMatrixNmsToMatrixNmsIE) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto boxes = std::make_shared(element::f32, Shape{1, 1000, 4}); + auto scores = std::make_shared(element::f32, Shape{1, 1, 1000}); + + auto nms = std::make_shared(boxes, scores, opset8::MatrixNms::Attributes()); + + f = std::make_shared(NodeVector{nms}, ParameterVector{boxes, scores}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + ASSERT_TRUE(f->get_output_partial_shape(0).is_static()) << "Shape " << f->get_output_partial_shape(0) << " should be static"; + } + + { + auto boxes = std::make_shared(element::f32, Shape{1, 1000, 4}); + auto scores = std::make_shared(element::f32, Shape{1, 1, 1000}); + auto nms = std::make_shared>(boxes, scores, opset8::MatrixNms::Attributes()); + + f_ref = std::make_shared(NodeVector{nms}, ParameterVector{boxes, scores}); + ASSERT_TRUE(f_ref->get_output_partial_shape(0).is_static()) << "Shape " << f_ref->get_output_partial_shape(0) << " should be static"; + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/inference-engine/tests/functional/inference_engine/transformations/convert_multiclass_nms_to_multiclass_nms_ie_internal.cpp b/inference-engine/tests/functional/inference_engine/transformations/convert_multiclass_nms_to_multiclass_nms_ie_internal.cpp new file mode 100644 index 00000000000000..1f0f6f856f76ba --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/transformations/convert_multiclass_nms_to_multiclass_nms_ie_internal.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST(TransformationTests, ConvertMulticlassNmsToMulticlassNmsIE) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto boxes = std::make_shared(element::f32, Shape{1, 1000, 4}); + auto scores = std::make_shared(element::f32, Shape{1, 1, 1000}); + + auto nms = std::make_shared(boxes, scores, opset8::MulticlassNms::Attributes()); + + f = 
std::make_shared(NodeVector{nms}, ParameterVector{boxes, scores}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + ASSERT_TRUE(f->get_output_partial_shape(0).is_static()) << "Shape " << f->get_output_partial_shape(0) << " should be static"; + } + + { + auto boxes = std::make_shared(element::f32, Shape{1, 1000, 4}); + auto scores = std::make_shared(element::f32, Shape{1, 1, 1000}); + auto nms = std::make_shared>(boxes, scores, opset8::MulticlassNms::Attributes()); + + f_ref = std::make_shared(NodeVector{nms}, ParameterVector{boxes, scores}); + ASSERT_TRUE(f_ref->get_output_partial_shape(0).is_static()) << "Shape " << f_ref->get_output_partial_shape(0) << " should be static"; + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp new file mode 100644 index 00000000000000..25766a89fc8fbc --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "single_layer_tests/matrix_nms.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; +using namespace InferenceEngine; +using namespace ngraph; + +const std::vector inShapeParams = { + InputShapeParams{3, 100, 5}, + InputShapeParams{1, 10, 50}, + InputShapeParams{2, 50, 50} +}; + +const std::vector sortResultType = {op::v8::MatrixNms::SortResultType::CLASSID, + op::v8::MatrixNms::SortResultType::SCORE, + op::v8::MatrixNms::SortResultType::NONE}; +const std::vector outType = {element::i32, element::i64}; +const std::vector topKParams = { + TopKParams{-1, 5}, + TopKParams{100, -1} +}; +const std::vector thresholdParams = { + ThresholdParams{0.0f, 2.0f, 0.0f}, + ThresholdParams{0.1f, 1.5f, 0.2f} +}; +const std::vector nmsTopK = {-1, 100}; +const std::vector keepTopK = {-1, 5}; +const std::vector backgroundClass = {-1, 0}; +const std::vector normalized = {true, false}; +const std::vector decayFunction = {op::v8::MatrixNms::DecayFunction::GAUSSIAN, + op::v8::MatrixNms::DecayFunction::LINEAR}; + +const auto nmsParams = ::testing::Combine(::testing::ValuesIn(inShapeParams), + ::testing::Combine(::testing::Values(Precision::FP32), + ::testing::Values(Precision::I32), + ::testing::Values(Precision::FP32)), + ::testing::ValuesIn(sortResultType), + ::testing::ValuesIn(outType), + ::testing::ValuesIn(topKParams), + ::testing::ValuesIn(thresholdParams), + ::testing::ValuesIn(backgroundClass), + ::testing::ValuesIn(normalized), + ::testing::ValuesIn(decayFunction), + ::testing::Values(CommonTestUtils::DEVICE_CPU) +); + +INSTANTIATE_TEST_CASE_P(smoke_MatrixNmsLayerTest, MatrixNmsLayerTest, nmsParams, MatrixNmsLayerTest::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/multiclass_nms.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/multiclass_nms.cpp new file mode 100644 index 00000000000000..6622a24ce3d2a9 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/multiclass_nms.cpp @@ -0,0 
+1,37 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_layer_tests/multiclass_nms.hpp" + +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; +using namespace InferenceEngine; +using namespace ngraph; + +const std::vector inShapeParams = {InputShapeParams {3, 100, 5}, InputShapeParams {1, 10, 50}, InputShapeParams {2, 50, 50}}; + +const std::vector nmsTopK = {-1, 20}; +const std::vector iouThreshold = {0.7f}; +const std::vector scoreThreshold = {0.7f}; +const std::vector backgroundClass = {-1, 0}; +const std::vector keepTopK = {-1, 30}; +const std::vector outType = {element::i32, element::i64}; + +const std::vector sortResultType = { + op::v8::MulticlassNms::SortResultType::SCORE, op::v8::MulticlassNms::SortResultType::CLASSID, op::v8::MulticlassNms::SortResultType::NONE}; +const std::vector sortResDesc = {true, false}; +const std::vector nmsEta = {0.6f, 1.0f}; +const std::vector normalized = {true, false}; + +const auto nmsParams = ::testing::Combine( + ::testing::ValuesIn(inShapeParams), + ::testing::Combine(::testing::Values(Precision::FP32), ::testing::Values(Precision::I32), ::testing::Values(Precision::FP32)), ::testing::ValuesIn(nmsTopK), + ::testing::Combine(::testing::ValuesIn(iouThreshold), ::testing::ValuesIn(scoreThreshold), ::testing::ValuesIn(nmsEta)), + ::testing::ValuesIn(backgroundClass), ::testing::ValuesIn(keepTopK), ::testing::ValuesIn(outType), ::testing::ValuesIn(sortResultType), + ::testing::Combine(::testing::ValuesIn(sortResDesc), ::testing::ValuesIn(normalized)), ::testing::Values(CommonTestUtils::DEVICE_CPU)); + +INSTANTIATE_TEST_CASE_P(smoke_MulticlassNmsLayerTest, MulticlassNmsLayerTest, nmsParams, MulticlassNmsLayerTest::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index 14d25be9a17f6a..8019fc072a8fdb 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -77,7 +77,7 @@ std::vector disabledTestPatterns() { // need to implement Export / Import R"(.*IEClassImportExportTestP.*)", // azure is failing after #6199 - R"(.*NmsLayerTest.*)" + R"(.*/NmsLayerTest.*)" }; #ifdef __APPLE__ // TODO: Issue 55717 diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/matrix_nms.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/matrix_nms.hpp new file mode 100644 index 00000000000000..21e89bf0474455 --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/matrix_nms.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/single_layer/matrix_nms.hpp" + +namespace LayerTestsDefinitions { + +TEST_P(MatrixNmsLayerTest, CompareWithRefs) { + Run(); +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/multiclass_nms.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/multiclass_nms.hpp new file mode 100644 index 00000000000000..e89ba2d126c3cb --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/multiclass_nms.hpp @@ -0,0 
+1,15 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/single_layer/multiclass_nms.hpp" + +namespace LayerTestsDefinitions { + +TEST_P(MulticlassNmsLayerTest, CompareWithRefs) { + Run(); +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/matrix_nms.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/matrix_nms.hpp new file mode 100644 index 00000000000000..9be3b082c3b808 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/matrix_nms.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "ngraph_functions/builders.hpp" + +namespace LayerTestsDefinitions { + +using InputShapeParams = std::tuple; // Number of classes + +using InputPrecisions = std::tuple; // iou_threshold, score_threshold, soft_nms_sigma precisions + +using TopKParams = std::tuple; // Maximum number of boxes to be selected per batch element + +using ThresholdParams = std::tuple; // filter out boxes with low confidence score after decaying + +using NmsParams = std::tuple; // Device name + +class MatrixNmsLayerTest : public testing::WithParamInterface, virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + void GenerateInputs() override; + void Compare(const std::vector>> &expectedOutputs, + const std::vector &actualOutputs) + override; + +protected: + void SetUp() override; + +private: + size_t numBatches, numBoxes, numClasses; + size_t maxOutputBoxesPerClass; + size_t maxOutputBoxesPerBatch; +}; + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/multiclass_nms.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/multiclass_nms.hpp new file mode 100644 index 00000000000000..4add46d8ce13f2 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/multiclass_nms.hpp @@ -0,0 +1,59 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "ngraph_functions/builders.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" + +namespace LayerTestsDefinitions { + +using InputShapeParams = std::tuple; // Number of classes + +using InputPrecisions = std::tuple; // iou_threshold, score_threshold, + // soft_nms_sigma precisions + +using InputfloatVar = std::tuple; // nmsEta + +using InputboolVar = std::tuple; // normalized + +using MulticlassNmsParams = std::tuple; + +class MulticlassNmsLayerTest : public testing::WithParamInterface, virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); + void GenerateInputs() override; + void Compare(const std::vector>>& expectedOutputs, + const std::vector& actualOutputs) override; + +protected: + void SetUp() override; + +private: + size_t numBatches, numBoxes, numClasses; + size_t maxOutputBoxesPerClass; + size_t maxOutputBoxesPerBatch; +}; + +} // namespace LayerTestsDefinitions diff --git 
a/inference-engine/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp new file mode 100644 index 00000000000000..2b33a25ae1e764 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp @@ -0,0 +1,250 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/single_layer/matrix_nms.hpp" + +namespace LayerTestsDefinitions { + +using namespace ngraph; +using namespace InferenceEngine; +using namespace FuncTestUtils::PrecisionUtils; + +std::string MatrixNmsLayerTest::getTestCaseName(testing::TestParamInfo obj) { + InputShapeParams inShapeParams; + InputPrecisions inPrecisions; + op::v8::MatrixNms::SortResultType sortResultType; + element::Type outType; + int backgroundClass; + op::v8::MatrixNms::DecayFunction decayFunction; + TopKParams topKParams; + ThresholdParams thresholdParams; + bool normalized; + std::string targetDevice; + std::tie(inShapeParams, inPrecisions, sortResultType, outType, topKParams, thresholdParams, + backgroundClass, normalized, decayFunction, targetDevice) = obj.param; + + size_t numBatches, numBoxes, numClasses; + std::tie(numBatches, numBoxes, numClasses) = inShapeParams; + + Precision paramsPrec, maxBoxPrec, thrPrec; + std::tie(paramsPrec, maxBoxPrec, thrPrec) = inPrecisions; + + int nmsTopK, keepTopK; + std::tie(nmsTopK, keepTopK) = topKParams; + + float score_threshold, gaussian_sigma, post_threshold; + std::tie(score_threshold, gaussian_sigma, post_threshold) = thresholdParams; + + std::ostringstream result; + result << "numBatches=" << numBatches << "_numBoxes=" << numBoxes << "_numClasses=" << numClasses << "_"; + result << "paramsPrec=" << paramsPrec << "_maxBoxPrec=" << maxBoxPrec << "_thrPrec=" << thrPrec << "_"; + result << "sortResultType=" << sortResultType << "_normalized=" << normalized << "_"; + result << "outType=" << outType << "_nmsTopK=" << nmsTopK << "_keepTopK=" << keepTopK << "_"; + result << "backgroundClass=" << backgroundClass << "_decayFunction=" << decayFunction << "_"; + result << "score_threshold=" << score_threshold << "_gaussian_sigma=" << gaussian_sigma << "_"; + result << "post_threshold=" << post_threshold << "_TargetDevice=" << targetDevice; + return result.str(); +} + +void MatrixNmsLayerTest::GenerateInputs() { + size_t it = 0; + for (const auto &input : cnnNetwork.getInputsInfo()) { + const auto &info = input.second; + Blob::Ptr blob; + + if (it == 1) { + blob = make_blob_with_precision(info->getTensorDesc()); + blob->allocate(); + CommonTestUtils::fill_data_random_float(blob, 1, 0, 100000); + } else { + blob = GenerateInput(*info); + } + inputs.push_back(blob); + it++; + } +} + +void MatrixNmsLayerTest::Compare(const std::vector>> &expectedOutputs, + const std::vector &actualOutputs) { + auto batchIndex = -1; + std::vector numPerBatch(numBatches); + for (int outputIndex = static_cast(expectedOutputs.size()) - 1; outputIndex >= 0; outputIndex--) { + const auto& actual = actualOutputs[outputIndex]; + const auto _dims = actual->getTensorDesc().getDims(); + if (_dims.size() == 1 && _dims[0] == numBatches) { + batchIndex = outputIndex; + auto memory = InferenceEngine::as(actual); + IE_ASSERT(memory); + const auto lockedMemory = memory->wmap(); + const auto actualBuffer = lockedMemory.as(); + auto buffer = reinterpret_cast(actualBuffer); + std::copy_n(buffer, numBatches, numPerBatch.begin()); + } + } + + for (int
outputIndex = static_cast(expectedOutputs.size()) - 1; outputIndex >= 0; outputIndex--) { + const auto& expected = expectedOutputs[outputIndex]; + const auto& actual = actualOutputs[outputIndex]; + + // Compare Selected Outputs & Selected Indices + if (outputIndex != batchIndex) { + const auto &expectedBuffer = expected.second.data(); + auto memory = InferenceEngine::as(actual); + IE_ASSERT(memory); + const auto lockedMemory = memory->wmap(); + const auto actualBuffer = lockedMemory.as(); + + auto k = static_cast(expected.first.size()) / actual->getTensorDesc().getPrecision().size(); + // W/A for int4, uint4 + if (expected.first == ngraph::element::Type_t::u4 || expected.first == ngraph::element::Type_t::i4) { + k /= 2; + } + if (outputIndex == 2) { + if (expected.second.size() != k * actual->byteSize()) + throw std::runtime_error("Expected and actual sizes of the 3rd output are different"); + } + + const auto &precision = actual->getTensorDesc().getPrecision(); + auto expected_offset = 0; + auto actual_offset = 0; + for (size_t i = 0; i < numPerBatch.size(); i++) { + auto validNums = numPerBatch[i]; + switch (precision) { + case InferenceEngine::Precision::FP32: { + switch (expected.first) { + case ngraph::element::Type_t::f32: + LayerTestsUtils::LayerTestsCommon::Compare( + reinterpret_cast(expectedBuffer) + expected_offset * 6, + reinterpret_cast(actualBuffer) + actual_offset * 6, validNums * 6, 1e-5f); + break; + case ngraph::element::Type_t::f64: + LayerTestsUtils::LayerTestsCommon::Compare( + reinterpret_cast(expectedBuffer) + expected_offset * 6, + reinterpret_cast(actualBuffer) + actual_offset * 6, validNums * 6, 1e-5f); + break; + default: + break; + } + + const auto fBuffer = lockedMemory.as(); + for (size_t tailing = validNums * 6; tailing < maxOutputBoxesPerBatch * 6; tailing++) { + ASSERT_TRUE(std::abs(fBuffer[(actual_offset * 6 + tailing)] - -1.f) < 1e-5) + << "Invalid default value: " << fBuffer[actual_offset * 6 + tailing] << " at index: " << (actual_offset * 6 + tailing); + } + break; + } + case InferenceEngine::Precision::I32: { + switch (expected.first) { + case ngraph::element::Type_t::i32: + LayerTestsUtils::LayerTestsCommon::Compare( + reinterpret_cast(expectedBuffer) + expected_offset, + reinterpret_cast(actualBuffer) + actual_offset, validNums, 0); + break; + case ngraph::element::Type_t::i64: + LayerTestsUtils::LayerTestsCommon::Compare( + reinterpret_cast(expectedBuffer) + expected_offset, + reinterpret_cast(actualBuffer) + actual_offset, validNums, 0); + break; + default: + break; + } + const auto iBuffer = lockedMemory.as(); + for (size_t tailing = validNums; tailing < maxOutputBoxesPerBatch; tailing++) { + ASSERT_TRUE(iBuffer[actual_offset + tailing] == -1) << "Invalid default value: " << iBuffer[actual_offset + tailing] << " at index: " << (actual_offset + tailing); + } + break; + } + default: + FAIL() << "Comparator for " << precision << " precision isn't supported"; + } + expected_offset += validNums; + actual_offset += maxOutputBoxesPerBatch; + } + } else { + const auto &expectedBuffer = expected.second.data(); + auto memory = InferenceEngine::as(actual); + IE_ASSERT(memory); + const auto lockedMemory = memory->wmap(); + const auto actualBuffer = lockedMemory.as(); + + auto k = static_cast(expected.first.size()) / actual->getTensorDesc().getPrecision().size(); + // W/A for int4, uint4 + if (expected.first == ngraph::element::Type_t::u4 || expected.first == ngraph::element::Type_t::i4) { + k /= 2; + } + if (outputIndex == 2) { + if (expected.second.size() != k * actual->byteSize()) + throw std::runtime_error("Expected and actual sizes of the 3rd output are different");
+ } + + const auto &precision = actual->getTensorDesc().getPrecision(); + size_t size = expected.second.size() / (k * actual->getTensorDesc().getPrecision().size()); + switch (precision) { + case InferenceEngine::Precision::I32: { + switch (expected.first) { + case ngraph::element::Type_t::i32: + LayerTestsUtils::LayerTestsCommon::Compare( + reinterpret_cast(expectedBuffer), + reinterpret_cast(actualBuffer), size, 0); + break; + case ngraph::element::Type_t::i64: + LayerTestsUtils::LayerTestsCommon::Compare( + reinterpret_cast(expectedBuffer), + reinterpret_cast(actualBuffer), size, 0); + break; + default: + break; + } + break; + } + default: + FAIL() << "Comparator for " << precision << " precision isn't supported"; + } + } + } +} + +void MatrixNmsLayerTest::SetUp() { + InputShapeParams inShapeParams; + InputPrecisions inPrecisions; + op::v8::MatrixNms::Attributes attrs; + TopKParams topKParams; + ThresholdParams thresholdParams; + + std::tie(inShapeParams, inPrecisions, attrs.sort_result_type, attrs.output_type, topKParams, thresholdParams, + attrs.background_class, attrs.normalized, attrs.decay_function, targetDevice) = this->GetParam(); + + std::tie(attrs.nms_top_k, attrs.keep_top_k) = topKParams; + std::tie(attrs.score_threshold, attrs.gaussian_sigma, attrs.post_threshold) = thresholdParams; + std::tie(numBatches, numBoxes, numClasses) = inShapeParams; + auto realClasses = numClasses; + if (attrs.background_class >= 0 && attrs.background_class <= numClasses) { + realClasses = realClasses - 1; + } + + maxOutputBoxesPerClass = 0; + if (attrs.nms_top_k >= 0) + maxOutputBoxesPerClass = std::min(numBoxes, static_cast(attrs.nms_top_k)); + else + maxOutputBoxesPerClass = numBoxes; + + maxOutputBoxesPerBatch = maxOutputBoxesPerClass * realClasses; + if (attrs.keep_top_k >= 0) + maxOutputBoxesPerBatch = + std::min(maxOutputBoxesPerBatch, static_cast(attrs.keep_top_k)); + Precision paramsPrec, maxBoxPrec, thrPrec; + std::tie(paramsPrec, maxBoxPrec, thrPrec) = inPrecisions; + + const std::vector boxesShape{numBatches, numBoxes, 4}, scoresShape{numBatches, numClasses, numBoxes}; + auto ngPrc = convertIE2nGraphPrc(paramsPrec); + auto params = builder::makeParams(ngPrc, {boxesShape, scoresShape}); + auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes(params)); + auto nms = std::make_shared(paramOuts[0], paramOuts[1], attrs); + auto nms_0_identity = std::make_shared(nms->output(0), opset5::Constant::create(element::f32, Shape{1}, {1})); + auto nms_1_identity = std::make_shared(nms->output(1), opset5::Constant::create(attrs.output_type, Shape{1}, {1})); + auto nms_2_identity = std::make_shared(nms->output(2), opset5::Constant::create(attrs.output_type, Shape{1}, {1})); + function = std::make_shared(OutputVector{nms_0_identity, nms_1_identity, nms_2_identity}, params, "NMS"); +} + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/src/single_layer/multiclass_nms.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/multiclass_nms.cpp new file mode 100644 index 00000000000000..e8532bad22706f --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/multiclass_nms.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/single_layer/multiclass_nms.hpp" + +namespace LayerTestsDefinitions { + +using namespace ngraph; +using namespace InferenceEngine; +using namespace 
FuncTestUtils::PrecisionUtils; + +std::string MulticlassNmsLayerTest::getTestCaseName(testing::TestParamInfo obj) { + InputShapeParams inShapeParams; + InputPrecisions inPrecisions; + int32_t nmsTopK, backgroundClass, keepTopK; + element::Type outType; + + op::util::NmsBase::SortResultType sortResultType; + + InputfloatVar inFloatVar; + InputboolVar inboolVar; + + std::string targetDevice; + + std::tie(inShapeParams, inPrecisions, nmsTopK, inFloatVar, backgroundClass, keepTopK, outType, sortResultType, inboolVar, targetDevice) = obj.param; + + size_t numBatches, numBoxes, numClasses; + std::tie(numBatches, numBoxes, numClasses) = inShapeParams; + + Precision paramsPrec, maxBoxPrec, thrPrec; + std::tie(paramsPrec, maxBoxPrec, thrPrec) = inPrecisions; + + float iouThr, scoreThr, nmsEta; + std::tie(iouThr, scoreThr, nmsEta) = inFloatVar; + + bool sortResCB, normalized; + std::tie(sortResCB, normalized) = inboolVar; + + std::ostringstream result; + result << "numBatches=" << numBatches << "_numBoxes=" << numBoxes << "_numClasses=" << numClasses << "_"; + result << "paramsPrec=" << paramsPrec << "_maxBoxPrec=" << maxBoxPrec << "_thrPrec=" << thrPrec << "_"; + result << "nmsTopK=" << nmsTopK << "_"; + result << "iouThr=" << iouThr << "_scoreThr=" << scoreThr << "_backgroundClass=" << backgroundClass << "_"; + result << "keepTopK=" << keepTopK << "_outType=" << outType << "_"; + result << "sortResultType=" << sortResultType << "_sortResCrossBatch=" << sortResCB << "_nmsEta=" << nmsEta << "_normalized=" << normalized << "_"; + result << "TargetDevice=" << targetDevice; + return result.str(); +} + +void MulticlassNmsLayerTest::GenerateInputs() { + size_t it = 0; + for (const auto& input : cnnNetwork.getInputsInfo()) { + const auto& info = input.second; + Blob::Ptr blob; + + if (it == 1) { + blob = make_blob_with_precision(info->getTensorDesc()); + blob->allocate(); + CommonTestUtils::fill_data_random_float(blob, 1, 0, 1000); + } else { + blob = GenerateInput(*info); + } + inputs.push_back(blob); + it++; + } +} + +void MulticlassNmsLayerTest::Compare(const std::vector>>& expectedOutputs, + const std::vector& actualOutputs) { + auto batchIndex = -1; + std::vector numPerBatch(numBatches); + for (int outputIndex = static_cast(expectedOutputs.size()) - 1; outputIndex >= 0; outputIndex--) { + const auto& actual = actualOutputs[outputIndex]; + const auto _dims = actual->getTensorDesc().getDims(); + if (_dims.size() == 1 && _dims[0] == numBatches) { + batchIndex = outputIndex; + auto memory = InferenceEngine::as(actual); + IE_ASSERT(memory); + const auto lockedMemory = memory->wmap(); + const auto actualBuffer = lockedMemory.as(); + auto buffer = reinterpret_cast(actualBuffer); + std::copy_n(buffer, numBatches, numPerBatch.begin()); + } + } + + for (int outputIndex = static_cast(expectedOutputs.size()) - 1; outputIndex >= 0; outputIndex--) { + const auto& expected = expectedOutputs[outputIndex]; + const auto& actual = actualOutputs[outputIndex]; + + // Compare Selected Outputs & Selected Indices + if (outputIndex != batchIndex) { + const auto& expectedBuffer = expected.second.data(); + auto memory = InferenceEngine::as(actual); + IE_ASSERT(memory); + const auto lockedMemory = memory->wmap(); + const auto actualBuffer = lockedMemory.as(); + + auto k = static_cast(expected.first.size()) / actual->getTensorDesc().getPrecision().size(); + // W/A for int4, uint4 + if (expected.first == ngraph::element::Type_t::u4 || expected.first == ngraph::element::Type_t::i4) { + k /= 2; + } + if (outputIndex == 2) { + if 
(expected.second.size() != k * actual->byteSize()) + throw std::runtime_error("Expected and actual sizes of the " + "3rd output are different"); + } + + const auto& precision = actual->getTensorDesc().getPrecision(); + auto expected_offset = 0; + auto actual_offset = 0; + for (size_t i = 0; i < numPerBatch.size(); i++) { + auto validNums = numPerBatch[i]; + switch (precision) { + case InferenceEngine::Precision::FP32: { + switch (expected.first) { + case ngraph::element::Type_t::f32: + LayerTestsUtils::LayerTestsCommon::Compare(reinterpret_cast(expectedBuffer) + expected_offset * 6, + reinterpret_cast(actualBuffer) + actual_offset * 6, validNums * 6, 1e-5f); + break; + case ngraph::element::Type_t::f64: + LayerTestsUtils::LayerTestsCommon::Compare(reinterpret_cast(expectedBuffer) + expected_offset * 6, + reinterpret_cast(actualBuffer) + actual_offset * 6, validNums * 6, 1e-5f); + break; + default: + break; + } + + const auto fBuffer = lockedMemory.as(); + for (size_t tailing = validNums * 6; tailing < maxOutputBoxesPerBatch * 6; tailing++) { + ASSERT_TRUE(std::abs(fBuffer[(actual_offset * 6 + tailing)] - -1.f) < 1e-5) + << "Invalid default value: " << fBuffer[actual_offset * 6 + tailing] << " at index: " << (actual_offset * 6 + tailing); + } + break; + } + case InferenceEngine::Precision::I32: { + switch (expected.first) { + case ngraph::element::Type_t::i32: + LayerTestsUtils::LayerTestsCommon::Compare(reinterpret_cast(expectedBuffer) + expected_offset, + reinterpret_cast(actualBuffer) + actual_offset, validNums, 0); + break; + case ngraph::element::Type_t::i64: + LayerTestsUtils::LayerTestsCommon::Compare(reinterpret_cast(expectedBuffer) + expected_offset, + reinterpret_cast(actualBuffer) + actual_offset, validNums, 0); + break; + default: + break; + } + const auto iBuffer = lockedMemory.as(); + for (size_t tailing = validNums; tailing < maxOutputBoxesPerBatch; tailing++) { + ASSERT_TRUE(iBuffer[actual_offset + tailing] == -1) << "Invalid default value: " << iBuffer[actual_offset + tailing] << " at index: " << (actual_offset + tailing); + } + break; + } + default: + FAIL() << "Comparator for " << precision << " precision isn't supported"; + } + expected_offset += validNums; + actual_offset += maxOutputBoxesPerBatch; + } + } else { + const auto& expectedBuffer = expected.second.data(); + auto memory = InferenceEngine::as(actual); + IE_ASSERT(memory); + const auto lockedMemory = memory->wmap(); + const auto actualBuffer = lockedMemory.as(); + + auto k = static_cast(expected.first.size()) / actual->getTensorDesc().getPrecision().size(); + // W/A for int4, uint4 + if (expected.first == ngraph::element::Type_t::u4 || expected.first == ngraph::element::Type_t::i4) { + k /= 2; + } + if (outputIndex == 2) { + if (expected.second.size() != k * actual->byteSize()) + throw std::runtime_error("Expected and actual sizes of the " + "3rd output are different"); + } + + const auto& precision = actual->getTensorDesc().getPrecision(); + size_t size = expected.second.size() / (k * actual->getTensorDesc().getPrecision().size()); + switch (precision) { + case InferenceEngine::Precision::I32: { + switch (expected.first) { + case ngraph::element::Type_t::i32: + LayerTestsUtils::LayerTestsCommon::Compare(reinterpret_cast(expectedBuffer), reinterpret_cast(actualBuffer), + size, 0); + break; + case ngraph::element::Type_t::i64: + LayerTestsUtils::LayerTestsCommon::Compare(reinterpret_cast(expectedBuffer), reinterpret_cast(actualBuffer), + size, 0); + break; + default: + break; + } + break; + } + default: + FAIL() << "Comparator for " << precision << " precision isn't supported"; + } + } + } +} + +void 
MulticlassNmsLayerTest::SetUp() { + InputShapeParams inShapeParams; + InputPrecisions inPrecisions; + op::v8::MulticlassNms::Attributes attrs; + int32_t maxOutBoxesPerClass, backgroundClass, keepTopK; // signed: -1 means "no limit" + element::Type outType; + + op::util::NmsBase::SortResultType sortResultType; + + InputfloatVar inFloatVar; + InputboolVar inboolVar; + + std::tie(inShapeParams, inPrecisions, maxOutBoxesPerClass, inFloatVar, backgroundClass, keepTopK, outType, sortResultType, inboolVar, targetDevice) = + this->GetParam(); + + // size_t numBatches, numBoxes, numClasses; + std::tie(numBatches, numBoxes, numClasses) = inShapeParams; + auto realClasses = numClasses; + if (backgroundClass >= 0 && backgroundClass <= numClasses) { + realClasses = realClasses - 1; + } + + maxOutputBoxesPerClass = 0; + if (maxOutBoxesPerClass >= 0) + maxOutputBoxesPerClass = std::min(numBoxes, static_cast(maxOutBoxesPerClass)); + else + maxOutputBoxesPerClass = numBoxes; + + maxOutputBoxesPerBatch = maxOutputBoxesPerClass * realClasses; + if (keepTopK >= 0) + maxOutputBoxesPerBatch = std::min(maxOutputBoxesPerBatch, static_cast(keepTopK)); + + Precision paramsPrec, maxBoxPrec, thrPrec; + std::tie(paramsPrec, maxBoxPrec, thrPrec) = inPrecisions; + + float iouThr, scoreThr, nmsEta; + std::tie(iouThr, scoreThr, nmsEta) = inFloatVar; + + bool sortResCB, normalized; + std::tie(sortResCB, normalized) = inboolVar; + + const std::vector boxesShape {numBatches, numBoxes, 4}, scoresShape {numBatches, numClasses, numBoxes}; + auto ngPrc = convertIE2nGraphPrc(paramsPrec); + auto params = builder::makeParams(ngPrc, {boxesShape, scoresShape}); + auto paramOuts = helpers::convert2OutputVector(helpers::castOps2Nodes(params)); + + attrs.iou_threshold = iouThr; + attrs.score_threshold = scoreThr; + attrs.nms_eta = nmsEta; + attrs.sort_result_type = sortResultType; + attrs.sort_result_across_batch = sortResCB; + attrs.output_type = outType; + attrs.nms_top_k = maxOutBoxesPerClass; + attrs.keep_top_k = keepTopK; + attrs.background_class = backgroundClass; + attrs.normalized = normalized; + + auto nms = std::make_shared(paramOuts[0], paramOuts[1], attrs); + + auto nms_0_identity = std::make_shared(nms->output(0), opset5::Constant::create(ngPrc, Shape {1}, {1})); + auto nms_1_identity = std::make_shared(nms->output(1), opset5::Constant::create(outType, Shape {1}, {1})); + auto nms_2_identity = std::make_shared(nms->output(2), opset5::Constant::create(outType, Shape {1}, {1})); + function = std::make_shared(OutputVector {nms_0_identity, nms_1_identity, nms_2_identity}, params, "MulticlassNMS"); +} + +} // namespace LayerTestsDefinitions diff --git a/ngraph/core/src/op/matrix_nms.cpp b/ngraph/core/src/op/matrix_nms.cpp index 7d3731f3b114de..3cac8707883edd 100644 --- a/ngraph/core/src/op/matrix_nms.cpp +++ b/ngraph/core/src/op/matrix_nms.cpp @@ -74,7 +74,8 @@ bool ngraph::op::v8::MatrixNms::visit_attributes(AttributeVisitor& visitor) namespace ngraph { template <> - EnumNames& EnumNames::get() + NGRAPH_API EnumNames& + EnumNames::get() { static auto enum_names = EnumNames( "op::v8::MatrixNms::DecayFunction", diff --git a/ngraph/core/src/op/util/nms_base.cpp b/ngraph/core/src/op/util/nms_base.cpp index 4fce4c46fc49f9..7a9b4f3d35cf10 100644 --- a/ngraph/core/src/op/util/nms_base.cpp +++ b/ngraph/core/src/op/util/nms_base.cpp @@ -163,7 +163,7 @@ void op::util::NmsBase::validate_and_infer_types() namespace ngraph { template <> - EnumNames& + NGRAPH_API EnumNames& EnumNames::get() { static auto enum_names = EnumNames(