diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/movement/scatter-nd-update-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/movement/scatter-nd-update-15.rst index 283779d6368caa..794fe707695ff8 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/movement/scatter-nd-update-15.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/movement/scatter-nd-update-15.rst @@ -16,7 +16,7 @@ ScatterNDUpdate **Detailed description**: The operation produces a copy of ``data`` tensor and updates its value using logic from ``reduction`` attribute, using values specified by ``updates`` at specific index positions specified by ``indices``. The output shape is the same as the shape of ``data``. -Input ``indices`` can contain duplicated index values, however, in case when *reduction* is set to ``none``, only last update for given duplicated index is used. +If multiple indices point to the same output location then the order of updating the values is undefined. The last dimension of ``indices`` corresponds to indices into elements if ``indices.shape[-1]`` = ``data.shape.rank`` or slices if ``indices.shape[-1]`` < ``data.shape.rank``. @@ -41,8 +41,9 @@ Operator ScatterNDUpdate-15 is an equivalent to following NumPy snippet: elif reduction == "min": func = min out = np.copy(data) + # Order of loop iteration is undefined. for ndidx in np.ndindex(indices.shape[:-1]): - out[indices[ndidx]] = func(out[indices[ndidx]], updates[ndidx]) + out[tuple(indices[ndidx])] = func(tuple(out[indices[ndidx]]), updates[ndidx]) return out Example 1 that shows simple case of update with *reduction* set to ``none``.: @@ -52,7 +53,7 @@ Example 1 that shows simple case of update with *reduction* set to ``none``.: data = [1, 2, 3, 4, 5, 6, 7, 8] indices = [[4], [3], [1], [7], [-2], [-4]] updates = [9, 10, 11, 12, 13, 14] - output = [1, 11, 3, 10, 14, 6, 13, 12] + output = [1, 11, 3, 10, 4, 6, 13, 12] Example that shows update of two slices of ``4x4`` shape in ``data``, with *reduction* set to ``none``: @@ -87,7 +88,7 @@ Example that shows update of two slices of ``4x4`` shape in ``data``, with *redu * **1**: ``data`` tensor of arbitrary rank ``r`` >= 1 and of type *T*. **Required.** -* **2**: ``indices`` tensor with indices of arbitrary rank ``q`` >= 1 and of type *T_IND*. All index values ``i_j`` in index entry ``(i_0, i_1, ...,i_k)`` (where ``k = indices.shape[-1]``) must be within bounds ``[-s_j, s_j - 1]`` where ``s_j = data.shape[j]``. ``k`` must be at most ``r``. If multiple indices point to the same output location then values will be updated in order of their occurrence. Negative value of index means reverse indexing and will be normalized to value ``len(data.shape[j] + index)``. If an index points to non-existing element then exception is raised. **Required.** +* **2**: ``indices`` tensor with indices of arbitrary rank ``q`` >= 1 and of type *T_IND*. All index values ``i_j`` in index entry ``(i_0, i_1, ...,i_k)`` (where ``k = indices.shape[-1]``) must be within bounds ``[-s_j, s_j - 1]`` where ``s_j = data.shape[j]``. ``k`` must be at most ``r``. If multiple indices point to the same output location then the order of updating the values is undefined. Negative value of index means reverse indexing and will be normalized to value ``len(data.shape[j] + index)``. If an index points to non-existing element then exception is raised. 
**Required.** * **3**: ``updates`` tensor of rank ``r - indices.shape[-1] + q - 1`` of type *T*. If expected ``updates`` rank is 0D it can be a tensor with single element. **Required.** @@ -121,7 +122,7 @@ Example that shows update of two slices of ``4x4`` shape in ``data``, with *redu - 4 + 4 diff --git a/src/bindings/js/node/include/core_wrap.hpp b/src/bindings/js/node/include/core_wrap.hpp index 169812234f901a..29149aaf157ea8 100644 --- a/src/bindings/js/node/include/core_wrap.hpp +++ b/src/bindings/js/node/include/core_wrap.hpp @@ -91,6 +91,9 @@ class CoreWrap : public Napi::ObjectWrap { /** @brief Imports a compiled model from the previously exported one. */ Napi::Value import_model(const Napi::CallbackInfo& info); + /** @brief Implements Core.importModel() defined in ../lib/addon.ts. */ + Napi::Value import_model_async(const Napi::CallbackInfo& info); + /** @brief Returns devices available for inference. */ Napi::Value get_available_devices(const Napi::CallbackInfo& info); @@ -99,6 +102,7 @@ class CoreWrap : public Napi::ObjectWrap { private: ov::Core _core; + std::mutex _mutex; }; struct TsfnContextModel { @@ -127,6 +131,20 @@ struct TsfnContextPath { std::map _config = {}; }; +struct ImportModelContext { + ImportModelContext(Napi::Env env, ov::Core& core) : deferred(Napi::Promise::Deferred::New(env)), _core{core} {}; + std::thread nativeThread; + + Napi::Promise::Deferred deferred; + Napi::ThreadSafeFunction tsfn; + + std::stringstream _stream; + std::string _device; + std::map _config = {}; + ov::Core& _core; + ov::CompiledModel _compiled_model; +}; + void FinalizerCallbackModel(Napi::Env env, void* finalizeData, TsfnContextModel* context); void FinalizerCallbackPath(Napi::Env env, void* finalizeData, TsfnContextPath* context); void compileModelThreadModel(TsfnContextModel* context); diff --git a/src/bindings/js/node/include/helper.hpp b/src/bindings/js/node/include/helper.hpp index 15132cbe5455f4..201dd3703ad3f0 100644 --- a/src/bindings/js/node/include/helper.hpp +++ b/src/bindings/js/node/include/helper.hpp @@ -105,9 +105,17 @@ Napi::Array cpp_to_js(const Napi::CallbackInfo& i template <> Napi::Array cpp_to_js(const Napi::CallbackInfo& info, const ov::Dimension dim); +/** + * @brief Creates JavaScript Model and wraps ov::Model inside of it. + * @return Javascript Model as Napi::Object. (Not ModelWrap object) + */ +Napi::Object cpp_to_js(const Napi::Env& env, std::shared_ptr model); + template <> Napi::Boolean cpp_to_js(const Napi::CallbackInfo& info, const bool value); +Napi::Object cpp_to_js(const Napi::Env& env, const ov::CompiledModel& compiled_model); + /** @brief Takes Napi::Value and parse Napi::Array or Napi::Object to ov::TensorVector. */ ov::TensorVector parse_input_data(const Napi::Value& input); diff --git a/src/bindings/js/node/include/model_wrap.hpp b/src/bindings/js/node/include/model_wrap.hpp index 38fe9835d9378d..cda9ff8b6ee65a 100644 --- a/src/bindings/js/node/include/model_wrap.hpp +++ b/src/bindings/js/node/include/model_wrap.hpp @@ -23,13 +23,6 @@ class ModelWrap : public Napi::ObjectWrap { static Napi::Function get_class(Napi::Env env); void set_model(const std::shared_ptr& model); - /** - * @brief Creates JavaScript Model object and wraps inside of it ov::Model object. - * @param env The environment in which to construct a JavaScript object. - * @param model a pointer to ov::Model to wrap. - * @return Javascript Model as Napi::Object. 
(Not ModelWrap object) - */ - static Napi::Object wrap(Napi::Env env, std::shared_ptr model); /** @return Napi::String containing a model name. */ Napi::Value get_name(const Napi::CallbackInfo& info); @@ -91,7 +84,7 @@ class ModelWrap : public Napi::ObjectWrap { * @return number indicating the quantity of outputs for the model */ Napi::Value get_output_size(const Napi::CallbackInfo& info); - + /** * @brief Sets a friendly name for a model. * @param info Contains information about the environment and passed arguments diff --git a/src/bindings/js/node/include/read_model_args.hpp b/src/bindings/js/node/include/read_model_args.hpp index fd6dc35d7334a4..4eb7e39885904b 100644 --- a/src/bindings/js/node/include/read_model_args.hpp +++ b/src/bindings/js/node/include/read_model_args.hpp @@ -2,6 +2,8 @@ #include +#include "node/include/helper.hpp" +#include "node/include/type_validation.hpp" #include "openvino/runtime/core.hpp" /** @@ -15,41 +17,29 @@ struct ReadModelArgs { ReadModelArgs() {} ReadModelArgs(const Napi::CallbackInfo& info) { - if (!is_valid_read_model_input(info)) - throw std::runtime_error("Invalid arguments of read model function"); - - const size_t argsLength = info.Length(); - std::shared_ptr model; - - if (info[0].IsBuffer()) { - Napi::Buffer model_data = info[0].As>(); - model_str = std::string(reinterpret_cast(model_data.Data()), model_data.Length()); - - if (argsLength == 2) { - Napi::Buffer weights = info[1].As>(); - const uint8_t* bin = reinterpret_cast(weights.Data()); - - size_t bin_size = weights.Length(); - weight_tensor = ov::Tensor(ov::element::Type_t::u8, {bin_size}); - std::memcpy(weight_tensor.data(), bin, bin_size); - } - else { - weight_tensor = ov::Tensor(ov::element::Type_t::u8, {0}); - } + std::vector allowed_signatures; + + if (ov::js::validate(info, allowed_signatures)) { + model_path = info[0].ToString(); + } else if (ov::js::validate(info, allowed_signatures)) { + model_path = info[0].ToString(); + bin_path = info[1].ToString(); + } else if (ov::js::validate>(info, allowed_signatures)) { + model_str = buffer_to_string(info[0]); + weight_tensor = ov::Tensor(ov::element::Type_t::u8, {0}); + } else if (ov::js::validate, Napi::Buffer>(info, allowed_signatures)) { + model_str = buffer_to_string(info[0]); + Napi::Buffer weights = info[1].As>(); + const uint8_t* bin = reinterpret_cast(weights.Data()); + + size_t bin_size = weights.Length(); + weight_tensor = ov::Tensor(ov::element::Type_t::u8, {bin_size}); + std::memcpy(weight_tensor.data(), bin, bin_size); + } else if (ov::js::validate(info, allowed_signatures)) { + model_str = info[0].ToString(); + weight_tensor = cast_to_tensor(info, 1); } else { - model_path = std::string(info[0].ToString()); - - if (argsLength == 2) bin_path = info[1].ToString(); + OPENVINO_THROW("'readModel'", ov::js::get_parameters_error_msg(info, allowed_signatures)); } } - - bool is_valid_read_model_input(const Napi::CallbackInfo& info) { - const size_t argsLength = info.Length(); - const size_t is_buffers_input = info[0].IsBuffer() - && (argsLength == 1 || info[1].IsBuffer()); - - if (is_buffers_input) return true; - - return info[0].IsString() && (argsLength == 1 || info[1].IsString()); - } }; diff --git a/src/bindings/js/node/lib/addon.ts b/src/bindings/js/node/lib/addon.ts index 5a48ce963b0e45..b5909ea9f3ae03 100644 --- a/src/bindings/js/node/lib/addon.ts +++ b/src/bindings/js/node/lib/addon.ts @@ -123,7 +123,7 @@ interface Core { }, }; /** - * It imports a previously exported compiled model. 
+ * Asynchronously imports a previously exported compiled model. * @param modelStream The input stream that contains a model, * previously exported with the {@link CompiledModel.exportModelSync} method. * @param device The name of a device, for which you import a compiled model. @@ -132,6 +132,15 @@ interface Core { * @param config An object with the key-value pairs * (property name, property value): relevant only for this load operation. */ + importModel( + modelStream: Buffer, + device: string, + config?: { [key: string]: string | number | boolean } + ): Promise; + /** + * A synchronous version of {@link Core.importModel}. + * It imports a previously exported compiled model. + */ importModelSync( modelStream: Buffer, device: string, @@ -151,7 +160,14 @@ interface Core { * For the TFLite format (*.tflite), the weights parameter is not used. */ readModel(modelPath: string, weightsPath?: string): Promise; - + /** + * It reads models from IR / ONNX / PDPD / TF and TFLite formats. + * @param model A string with model in IR / ONNX / PDPD / TF + * and TFLite format. + * @param weights Tensor with weights. Reading ONNX / PDPD / TF + * and TFLite models doesn’t support loading weights from weights tensors. + */ + readModel(model: string, weights: Tensor): Promise; /** * It reads models from the IR / ONNX / PDPD / TF and TFLite formats. * @param modelBuffer Binary data with a model @@ -165,6 +181,11 @@ interface Core { * It reads models from the IR / ONNX / PDPD / TF and TFLite formats. */ readModelSync(modelPath: string, weightsPath?: string): Model; + /** + * A synchronous version of {@link Core.readModel}. + * It reads models from the IR / ONNX / PDPD / TF and TFLite formats. + */ + readModelSync(model: string, weights: Tensor): Model; /** * A synchronous version of {@link Core.readModel}. * It reads models from the IR / ONNX / PDPD / TF and TFLite formats. 
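
For reference, a minimal usage sketch of the JS API additions in this file: the asynchronous Core.importModel() and the new readModel()/readModelSync() overloads that accept an IR string together with a u8 weights tensor. The file names, device, and NUM_STREAMS value below are placeholders for illustration only, not part of this change, and the sketch assumes the standard openvino-node entry point.

// Hypothetical paths and device; adjust to your environment.
const { addon: ov } = require('openvino-node');
const fs = require('node:fs');

async function main() {
  const core = new ov.Core();

  // New overload: IR XML as a string plus weights wrapped into a u8 Tensor.
  const modelStr = fs.readFileSync('model.xml', 'utf8');
  const weightsData = new Uint8Array(fs.readFileSync('model.bin'));
  const weightsTensor = new ov.Tensor(ov.element.u8, [weightsData.length], weightsData);
  const model = await core.readModel(modelStr, weightsTensor);

  // Export a compiled model, then re-import it with the new asynchronous
  // importModel(), which resolves to a CompiledModel.
  const compiledModel = await core.compileModel(model, 'CPU');
  const exported = compiledModel.exportModelSync();
  const reimported = await core.importModel(exported, 'CPU', { 'NUM_STREAMS': 1 });
  console.log(reimported.inputs.length);
}

main();

On the native side, importModel() runs core.import_model() on a separate thread guarded by the new CoreWrap mutex and resolves the returned Promise through a ThreadSafeFunction, so the Node.js event loop is not blocked while the compiled model is imported.
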
diff --git a/src/bindings/js/node/src/async_reader.cpp b/src/bindings/js/node/src/async_reader.cpp index 8faaf47ab0a105..bf8ca8105b4c0f 100644 --- a/src/bindings/js/node/src/async_reader.cpp +++ b/src/bindings/js/node/src/async_reader.cpp @@ -13,14 +13,11 @@ void ReaderWorker::Execute() { } void ReaderWorker::OnOK() { - Napi::HandleScope scope(Env()); - Napi::Object mw = ModelWrap::get_class(Env()).New({}); - ModelWrap* m = Napi::ObjectWrap::Unwrap(mw); - m->set_model(_model); + auto model = cpp_to_js(Env(), _model); delete _args; - _deferred.Resolve(mw); + _deferred.Resolve(model); } void ReaderWorker::OnError(Napi::Error const& error) { diff --git a/src/bindings/js/node/src/core_wrap.cpp b/src/bindings/js/node/src/core_wrap.cpp index 33350056fc443c..20422b7d683d3d 100644 --- a/src/bindings/js/node/src/core_wrap.cpp +++ b/src/bindings/js/node/src/core_wrap.cpp @@ -51,6 +51,7 @@ Napi::Function CoreWrap::get_class(Napi::Env env) { InstanceMethod("compileModelSync", &CoreWrap::compile_model_sync_dispatch), InstanceMethod("compileModel", &CoreWrap::compile_model_async), InstanceMethod("getAvailableDevices", &CoreWrap::get_available_devices), + InstanceMethod("importModel", &CoreWrap::import_model_async), InstanceMethod("importModelSync", &CoreWrap::import_model), InstanceMethod("getAvailableDevices", &CoreWrap::get_available_devices), InstanceMethod("getVersions", &CoreWrap::get_versions), @@ -85,11 +86,13 @@ Napi::Value CoreWrap::read_model_sync(const Napi::CallbackInfo& info) { model = _core.read_model(model_str, weight_tensor); } else if (ov::js::validate(info, allowed_signatures)) { model = _core.read_model(info[0].ToString()); + } else if (ov::js::validate(info, allowed_signatures)) { + model = _core.read_model(info[0].ToString(), cast_to_tensor(info, 1)); } else { OPENVINO_THROW("'readModelSync'", ov::js::get_parameters_error_msg(info, allowed_signatures)); } - return ModelWrap::wrap(info.Env(), model); + return cpp_to_js(info.Env(), model); } catch (std::runtime_error& err) { reportError(info.Env(), err.what()); @@ -350,6 +353,66 @@ Napi::Value CoreWrap::import_model(const Napi::CallbackInfo& info) { } } +void ImportModelFinalizer(Napi::Env env, void* finalizeData, ImportModelContext* context) { + context->nativeThread.join(); + delete context; +}; + +void importModelThread(ImportModelContext* context, std::mutex& mutex) { + // Imports model without blocking the main thread. + { + const std::lock_guard lock(mutex); + context->_compiled_model = context->_core.import_model(context->_stream, context->_device, context->_config); + } + + // Callback to return to JS the results of core.import_model() + auto callback = [](Napi::Env env, Napi::Function, ImportModelContext* context) { + context->deferred.Resolve(cpp_to_js(env, context->_compiled_model)); + }; + + // Addon's main thread will safely invoke the JS callback function on the behalf of the additional thread. + context->tsfn.BlockingCall(context, callback); + context->tsfn.Release(); +} + +Napi::Value CoreWrap::import_model_async(const Napi::CallbackInfo& info) { + const auto& env = info.Env(); + std::vector allowed_signatures; + + try { + if (ov::js::validate, Napi::String>(info, allowed_signatures) || + ov::js::validate, Napi::String, Napi::Object>(info, allowed_signatures)) { + // Prepare validated data that will be transferred to the new thread. 
+ auto context_data = new ImportModelContext(env, _core); + + const auto& model_data = info[0].As>(); + const auto model_stream = std::string(reinterpret_cast(model_data.Data()), model_data.Length()); + context_data->_stream << model_stream; + context_data->_device = info[1].ToString(); + context_data->_config = info.Length() == 3 ? to_anyMap(env, info[2]) : ov::AnyMap(); + + context_data->tsfn = Napi::ThreadSafeFunction::New(env, + Napi::Function(), + "TSFN", + 0, + 1, + context_data, + ImportModelFinalizer, + (void*)nullptr); + + context_data->nativeThread = std::thread(importModelThread, context_data, std::ref(_mutex)); + // Returns a Promise to JS. Method import_model() is performed on additional thread. + return context_data->deferred.Promise(); + } else { + OPENVINO_THROW("'importModel'", ov::js::get_parameters_error_msg(info, allowed_signatures)); + } + + } catch (std::exception& e) { + reportError(info.Env(), e.what()); + return info.Env().Undefined(); + } +} + Napi::Value CoreWrap::set_property(const Napi::CallbackInfo& info) { try { auto args = try_get_set_property_parameters(info); diff --git a/src/bindings/js/node/src/helper.cpp b/src/bindings/js/node/src/helper.cpp index 01474a49c4ae3c..09161deb2bc30e 100644 --- a/src/bindings/js/node/src/helper.cpp +++ b/src/bindings/js/node/src/helper.cpp @@ -3,6 +3,7 @@ #include "node/include/helper.hpp" +#include "node/include/compiled_model.hpp" #include "node/include/tensor.hpp" #include "node/include/type_validation.hpp" @@ -251,11 +252,33 @@ Napi::Array cpp_to_js(const Napi::CallbackInfo& info return interval; } +Napi::Object cpp_to_js(const Napi::Env& env, std::shared_ptr model) { + const auto& prototype = env.GetInstanceData()->model; + if (!prototype) { + OPENVINO_THROW("Invalid pointer to Model prototype."); + } + const auto& model_js = prototype.New({}); + const auto mw = Napi::ObjectWrap::Unwrap(model_js); + mw->set_model(model); + return model_js; +} + template <> Napi::Boolean cpp_to_js(const Napi::CallbackInfo& info, const bool value) { return Napi::Boolean::New(info.Env(), value); } +Napi::Object cpp_to_js(const Napi::Env& env, const ov::CompiledModel& compiled_model) { + const auto& prototype = env.GetInstanceData()->compiled_model; + if (!prototype) { + OPENVINO_THROW("Invalid pointer to CompiledModel prototype."); + } + auto obj = prototype.New({}); + const auto cm = Napi::ObjectWrap::Unwrap(obj); + cm->set_compiled_model(compiled_model); + return obj; +} + ov::TensorVector parse_input_data(const Napi::Value& input) { ov::TensorVector parsed_input; if (input.IsArray()) { diff --git a/src/bindings/js/node/src/model_wrap.cpp b/src/bindings/js/node/src/model_wrap.cpp index a10bc3dd6861a6..e8359b83ff6da3 100644 --- a/src/bindings/js/node/src/model_wrap.cpp +++ b/src/bindings/js/node/src/model_wrap.cpp @@ -33,18 +33,6 @@ void ModelWrap::set_model(const std::shared_ptr& model) { _model = model; } -Napi::Object ModelWrap::wrap(Napi::Env env, std::shared_ptr model) { - Napi::HandleScope scope(env); - const auto& prototype = env.GetInstanceData()->model; - if (!prototype) { - OPENVINO_THROW("Invalid pointer to model prototype."); - } - const auto& model_js = prototype.New({}); - const auto mw = Napi::ObjectWrap::Unwrap(model_js); - mw->set_model(model); - return model_js; -} - Napi::Value ModelWrap::get_name(const Napi::CallbackInfo& info) { if (_model->get_name() != "") return Napi::String::New(info.Env(), _model->get_name()); diff --git a/src/bindings/js/node/tests/basic.test.js b/src/bindings/js/node/tests/basic.test.js index 
ba3e585e40ce7d..8c8e0900a127f7 100644 --- a/src/bindings/js/node/tests/basic.test.js +++ b/src/bindings/js/node/tests/basic.test.js @@ -236,7 +236,7 @@ describe('Test exportModel()/importModel()', () => { const inferRequest = compiledModel.createInferRequest(); const res1 = inferRequest.infer([tensor]); - it('Test importModel(stream, device)', () => { + it('Test importModelSync(stream, device)', () => { const newCompiled = core.importModelSync(userStream, 'CPU'); const newInferRequest = newCompiled.createInferRequest(); const res2 = newInferRequest.infer([tensor]); @@ -244,7 +244,7 @@ describe('Test exportModel()/importModel()', () => { assert.deepStrictEqual(res1['fc_out'].data[0], res2['fc_out'].data[0]); }); - it('Test importModel(stream, device, config)', () => { + it('Test importModelSync(stream, device, config)', () => { const newCompiled = core.importModelSync(userStream, 'CPU', { 'NUM_STREAMS': 1 }); const newInferRequest = newCompiled.createInferRequest(); const res2 = newInferRequest.infer([tensor]); @@ -252,27 +252,27 @@ describe('Test exportModel()/importModel()', () => { assert.deepStrictEqual(res1['fc_out'].data[0], res2['fc_out'].data[0]); }); - it('Test importModel(stream, device) throws', () => { + it('Test importModelSync(stream, device) throws', () => { assert.throws( () => core.importModelSync(epsilon, 'CPU'), /The first argument must be of type Buffer./ ); }); - it('Test importModel(stream, device) throws', () => { + it('Test importModelSync(stream, device) throws', () => { assert.throws( () => core.importModelSync(userStream, tensor), /The second argument must be of type String./ ); }); - it('Test importModel(stream, device, config: tensor) throws', () => { + it('Test importModelSync(stream, device, config: tensor) throws', () => { assert.throws( () => core.importModelSync(userStream, 'CPU', tensor), /NotFound: Unsupported property 0 by CPU plugin./ ); }); - it('Test importModel(stream, device, config: string) throws', () => { + it('Test importModelSync(stream, device, config: string) throws', () => { const testString = 'test'; assert.throws( () => core.importModelSync(userStream, 'CPU', testString), @@ -280,11 +280,53 @@ describe('Test exportModel()/importModel()', () => { ); }); - it('Test importModel(stream, device, config: unsupported property) throws', () => { + it('Test importModelSync(stream, device, config: unsupported property) \ + throws', () => { const tmpDir = '/tmp'; assert.throws( () => core.importModelSync(userStream, 'CPU', { 'CACHE_DIR': tmpDir }), /Unsupported property CACHE_DIR by CPU plugin./ ); }); + + it('Test importModel(stream, device)', () => { + core.importModel(userStream, 'CPU').then(newCompiled => { + const newInferRequest = newCompiled.createInferRequest(); + const res2 = newInferRequest.infer([tensor]); + assert.deepStrictEqual(res1['fc_out'].data[0], res2['fc_out'].data[0]); + }); + }); + + it('Test importModel(stream, device, config)', () => { + core.importModel(userStream, 'CPU', { 'NUM_STREAMS': 1 }).then( + newCompiled => { + const newInferRequest = newCompiled.createInferRequest(); + const res2 = newInferRequest.infer([tensor]); + + assert.deepStrictEqual(res1['fc_out'].data[0], res2['fc_out'].data[0]); + }); + }); + + it('Test importModel(stream, device) throws', () => { + assert.throws( + () => core.importModel(epsilon, 'CPU').then(), + /'importModel' method called with incorrect parameters./ + ); + }); + + it('Test importModel(stream, device) throws', () => { + assert.throws( + () => core.importModel(userStream, tensor).then(), 
+ /'importModel' method called with incorrect parameters./ + ); + }); + + it('Test importModel(stream, device, config: string) throws', () => { + const testString = 'test'; + assert.throws( + () => core.importModel(userStream, 'CPU', testString).then(), + /'importModel' method called with incorrect parameters./ + ); + }); + }); diff --git a/src/bindings/js/node/tests/read_model.test.js b/src/bindings/js/node/tests/read_model.test.js index 731001fbb861b7..83c5c216f3c759 100644 --- a/src/bindings/js/node/tests/read_model.test.js +++ b/src/bindings/js/node/tests/read_model.test.js @@ -10,7 +10,9 @@ const { getModelPath } = require('./utils.js'); const { xml: modelPath, bin: weightsPath } = getModelPath(); const modelFile = fs.readFileSync(modelPath); +const modelStr = fs.readFileSync(modelPath, 'utf8'); const weightsFile = fs.readFileSync(weightsPath); +const weightsTensor = new ov.Tensor(ov.element.u8, [weightsFile.buffer.byteLength], new Uint8Array(weightsFile.buffer)); const core = new ov.Core(); @@ -34,7 +36,16 @@ describe('Core.readModeSync', () => { ) }); - it('readModeSync(modelUint8ArrayBuffer, weightsUint8ArrayBuffer) ', () => { + it('readModelSync(modelString, weightsTensor) ', () => { + const model = core.readModelSync( + modelStr, + weightsTensor, + ); + assert.ok(model instanceof ov.Model); + assert.equal(model.inputs.length, 1); + }); + + it('readModelSync(modelUint8ArrayBuffer, weightsUint8ArrayBuffer) ', () => { const model = core.readModelSync( new Uint8Array(modelFile.buffer), new Uint8Array(weightsFile.buffer), @@ -55,7 +66,16 @@ describe('Core.readModel', () => { assert.equal(model.inputs.length, 1); }); - it('readMode(modelUint8ArrayBuffer, weightsUint8ArrayBuffer) ', async () => { + it('readModel(modelString, weightsTensor) ', async () => { + const model = await core.readModel( + modelStr, + weightsTensor, + ); + assert.ok(model instanceof ov.Model); + assert.equal(model.inputs.length, 1); + }); + + it('readModel(modelUint8ArrayBuffer, weightsUint8ArrayBuffer) ', async () => { const model = await core.readModel( new Uint8Array(modelFile.buffer), new Uint8Array(weightsFile.buffer), diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index b2e80048e66cf7..f6cd1ab20012f7 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -441,6 +441,7 @@ bool ov::pass::ConvertPrecision::run_on_model(const std::shared_ptr& {ov::op::v3::TopK::get_type_info_static(), fuse_type_to_topk}, {ov::op::v11::TopK::get_type_info_static(), fuse_type_to_topk}, {ov::op::v8::MaxPool::get_type_info_static(), fuse_type_to_maxpool}, + {ov::op::v14::MaxPool::get_type_info_static(), fuse_type_to_maxpool}, {ov::op::v3::NonZero::get_type_info_static(), fuse_type_to_nonzero}, {ov::op::v3::Bucketize::get_type_info_static(), fuse_type_to_bucketize}, {ov::op::v1::Equal::get_type_info_static(), fuse_type_to_binary_comparision}, @@ -924,9 +925,15 @@ bool fuse_type_to_topk(const std::shared_ptr& node, const precisions_m } bool fuse_type_to_maxpool(const std::shared_ptr& node, const precisions_map& precisions) { - if (auto maxpool = ov::as_type_ptr(node)) { + auto maxpool_v8 = ov::as_type_ptr(node); + auto maxpool_v14 = ov::as_type_ptr(node); + if (maxpool_v14) { return update_type(1, node, precisions, [&](const element::Type& to) { - maxpool->set_index_element_type(to); + maxpool_v14->set_index_element_type(to); + 
}); + } else if (maxpool_v8) { + return update_type(1, node, precisions, [&](const element::Type& to) { + maxpool_v8->set_index_element_type(to); }); } return false; diff --git a/src/common/transformations/src/transformations/op_conversions/convert_avgpool_downgrade.cpp b/src/common/transformations/src/transformations/op_conversions/convert_avgpool_downgrade.cpp index 3333c1d6885f08..24d3ecca334c73 100644 --- a/src/common/transformations/src/transformations/op_conversions/convert_avgpool_downgrade.cpp +++ b/src/common/transformations/src/transformations/op_conversions/convert_avgpool_downgrade.cpp @@ -24,8 +24,11 @@ ov::pass::ConvertAvgPool14ToAvgPool1::ConvertAvgPool14ToAvgPool1() { const auto avg_pool_v14_pattern = pattern::wrap_type(); - const matcher_pass_callback callback = [](pattern::Matcher& m) { + const matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { const auto avg_pool_v14 = std::dynamic_pointer_cast(m.get_match_root()); + if (!avg_pool_v14 || transformation_callback(avg_pool_v14)) { + return false; + } const auto rounding_type_v14 = avg_pool_v14->get_rounding_type(); const auto rounding_type_v1 = rounding_type_v14 == ov::op::RoundingType::CEIL_TORCH ? ov::op::RoundingType::CEIL : rounding_type_v14; diff --git a/src/common/transformations/src/transformations/op_conversions/convert_maxpool_downgrade.cpp b/src/common/transformations/src/transformations/op_conversions/convert_maxpool_downgrade.cpp index 0edff11b8429ed..9c3f1178125851 100644 --- a/src/common/transformations/src/transformations/op_conversions/convert_maxpool_downgrade.cpp +++ b/src/common/transformations/src/transformations/op_conversions/convert_maxpool_downgrade.cpp @@ -140,7 +140,7 @@ ov::pass::ConvertMaxPool14ToMaxPool8::ConvertMaxPool14ToMaxPool8() { const auto selected_pads = node_registry.make(in_gt_out, padding_end_node, zero); // apply padding on input clear pads attribute - const auto pb = std::make_shared(ov::OutputVector{pads_remaining, padding_end_node}, 0); + const auto pb = std::make_shared(ov::OutputVector{pads_remaining, padding_begin_node}, 0); const auto pe = std::make_shared(ov::OutputVector{pads_remaining, selected_pads}, 0); auto minus_inf = Constant::create(ov::element::f32, ov::Shape{}, {-std::numeric_limits::infinity()}); std::shared_ptr convert_like_node = std::make_shared(minus_inf, input); diff --git a/src/core/reference/include/openvino/reference/utils/phillox_converter.hpp b/src/core/reference/include/openvino/reference/utils/phillox_converter.hpp index 521a23b240a9f3..eb178d8dc894b4 100644 --- a/src/core/reference/include/openvino/reference/utils/phillox_converter.hpp +++ b/src/core/reference/include/openvino/reference/utils/phillox_converter.hpp @@ -27,7 +27,7 @@ class PhilloxConverter { /// \brief Converts the given array (PhilloxOutput) to the target dtype and assigns them at the k-th index of the /// output array. - virtual void convert(PhilloxOutput result, size_t k) = 0; + virtual void convert(const PhilloxOutput result, size_t k) = 0; protected: PhilloxConverter(char* out, @@ -62,7 +62,7 @@ class MockPhilloxConverter : public PhilloxConverter { /// \brief Converts the given array (PhilloxOutput) to the target dtype and assigns them at the k-th index of the /// output array. 
- void convert(PhilloxOutput result, size_t idx) override; + void convert(const PhilloxOutput result, size_t idx) override; }; class TensorflowPhilloxConverter : public PhilloxConverter { @@ -79,7 +79,7 @@ class TensorflowPhilloxConverter : public PhilloxConverter { /// \brief Converts the given array (PhilloxOutput) to the target dtype and assigns them at the k-th index of the /// output array. - void convert(PhilloxOutput result, size_t idx) override; + void convert(const PhilloxOutput result, size_t idx) override; }; class PyTorchPhilloxConverter : public PhilloxConverter { @@ -98,7 +98,7 @@ class PyTorchPhilloxConverter : public PhilloxConverter { /// \brief Converts the given array (PhilloxOutput) to the target dtype and assigns them at the k-th index of the /// output array. - void convert(PhilloxOutput result, size_t idx) override; + void convert(const PhilloxOutput result, size_t idx) override; private: // Determines whether the ghenerator uses optimized execution diff --git a/src/core/reference/src/op/random_uniform.cpp b/src/core/reference/src/op/random_uniform.cpp index 3d28c798e7306d..d7134fb9bf32e1 100644 --- a/src/core/reference/src/op/random_uniform.cpp +++ b/src/core/reference/src/op/random_uniform.cpp @@ -6,6 +6,7 @@ #include #include +#include #include "openvino/core/except.hpp" #include "openvino/core/shape.hpp" @@ -30,7 +31,8 @@ std::pair random_uniform(const uint64_t* out_shape, // Implementation in plugins may differ for this case. if (seed == 0 && seed2 == 0) { std::srand(static_cast(std::time(nullptr))); - seed = std::rand(); + std::mt19937_64 gen(static_cast(std::time(nullptr))); + seed = gen(); } // Calculate total element count for generation @@ -49,8 +51,8 @@ std::pair random_uniform(const uint64_t* out_shape, // Generate randon numbers and convert them until the output array is full const size_t step = converter->get_converted_elements_count(); for (size_t i = 0; i < elem_count; i += step) { - phillox::PhilloxOutput result = generator->random(); - converter->convert(result, i); + const auto& result = generator->random(); + converter->convert(std::move(result), i); } // Return the next state to feed into the generator diff --git a/src/core/reference/src/utils/phillox_converter.cpp b/src/core/reference/src/utils/phillox_converter.cpp index 2e84073140bce8..48020350835b0e 100644 --- a/src/core/reference/src/utils/phillox_converter.cpp +++ b/src/core/reference/src/utils/phillox_converter.cpp @@ -236,7 +236,7 @@ size_t TensorflowPhilloxConverter::get_converted_elements_count() const { return m_elem_type.size() > 4 ? ELEMENTS_PER_EXECUTION / 2 : ELEMENTS_PER_EXECUTION; } -void TensorflowPhilloxConverter::convert(PhilloxOutput result, size_t idx) { +void TensorflowPhilloxConverter::convert(const PhilloxOutput result, size_t idx) { // convert values to corresponding output_type switch (m_elem_type) { case element::Type_t::f32: { @@ -352,7 +352,7 @@ size_t PyTorchPhilloxConverter::get_converted_elements_count() const { return m_elem_type.size() > 4 && !m_optimization_enabled ? 
ELEMENTS_PER_EXECUTION / 2 : ELEMENTS_PER_EXECUTION; } -void PyTorchPhilloxConverter::convert(PhilloxOutput result, size_t idx) { +void PyTorchPhilloxConverter::convert(const PhilloxOutput result, size_t idx) { // convert values to corresponding output_type switch (m_elem_type) { case element::Type_t::f32: { diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp index d3e1d501a7ee00..f4f3b7d0184697 100644 --- a/src/core/src/pass/serialize.cpp +++ b/src/core/src/pass/serialize.cpp @@ -873,9 +873,9 @@ class PaddingsFixer { if (pad_agnostic_types.count(op->get_auto_pad())) { clone_op_and_fix_paddings(op); } - } else if (auto op = ov::as_type(node)) { + } else if (auto op = ov::as_type(node)) { if (pad_agnostic_types.count(op->get_auto_pad())) { - clone_op_and_fix_paddings(op); + clone_op_and_fix_paddings(op); } } else if (auto op = ov::as_type(node)) { if (pad_agnostic_types.count(op->get_auto_pad())) { diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index dc0ed0b983900c..61a6255d9c8a01 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -97,6 +97,7 @@ class TypeRelaxedExtension : public ov::OpExtension> { #define TYPE_RELAXED_EXTENSIONS \ TYPE_RELAXED_OP_EXTENSION(ov::op::v1::Add) \ TYPE_RELAXED_OP_EXTENSION(ov::op::v1::AvgPool) \ + TYPE_RELAXED_OP_EXTENSION(ov::op::v14::AvgPool) \ TYPE_RELAXED_OP_EXTENSION(ov::op::v0::Clamp) \ TYPE_RELAXED_OP_EXTENSION(ov::op::v0::Concat) \ TYPE_RELAXED_OP_EXTENSION(ov::op::v1::Convolution) \ diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 92b953665fe2cb..0d6f02dd36fe6e 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -2471,6 +2471,15 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { } } +bool GraphOptimizer::canBeInplaced(const NodePtr& parentNode, const NodePtr& childNode) { + const auto parentInPlace = parentNode->getParentEdgeAt(0)->inPlace(Edge::LOOK_UP); + const auto& childEdges = childNode->getChildEdgesAtPort(0); + const auto childInPlace = std::any_of(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) { + return edge->inPlace(Edge::LOOK_DOWN); + }); + return !(parentInPlace && childInPlace); +} + bool GraphOptimizer::checkAscendingFinalOrder(const VectorDims& transposeOrder, const VectorDims& layoutOrder, const VectorDims& reorderInOrder, @@ -2534,21 +2543,14 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, if (reshapeNode) graph.RemoveEdge(reshapeNode->getParentEdgeAt(1)); - // to prevent inPlace conflict we must check that the memory reference is unidirectional or - // inPlace memory is not used - const auto parentInPlace = parentNode->getParentEdgeAt(0)->inPlace(Edge::LOOK_UP); - const auto& childEdges = childNode->getChildEdgesAtPort(0); - - const auto childInPlace = std::any_of(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) { - return edge->inPlace(Edge::LOOK_DOWN); - }); - + // To prevent inPlace conflict, we must check that the memory reference is unidirectional + // or inPlace memory is not used // Note: this value must be computed before detaching nodes - bool isOptimized = !(parentInPlace && childInPlace); + bool isOptimized = canBeInplaced(parentNode, childNode); // hold references to all children before dropping reorder_node std::vector> reorderChildren; - for (auto ccEdge : childEdges) + for (auto ccEdge : 
childNode->getChildEdgesAtPort(0)) reorderChildren.emplace_back(ccEdge->getChild(), ccEdge->getOutputNum()); // detach nodes from graph by remove all of their edges @@ -2900,6 +2902,12 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { auto& outOrder = outBlockedDesc->getOrder(); if (checkAscendingFinalOrder(transposeOrder, layoutOrder, inOrder, outOrder)) { + // Reorder node doesn't support (with rare exceptions) reordering in case of different ranks on input and output. + // So the merge can be performed only in the case when the fused reorder will be optimized. + if (parentNode->getInputShapeAtPort(0).getRank() != childNode->getOutputShapeAtPort(0).getRank() && + !canBeInplaced(parentNode, childNode)) { + continue; + } mergeTransposeReshapeReorder(graph, transposeNode, reshapeNode, reorderNode, true); } } diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 7164854ac9c8c5..0a85a253ba8d66 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -53,6 +53,7 @@ class GraphOptimizer { void RemoveConvertMemoryOutput(Graph &graph); void MatchSdpaKvCache(Graph &graph); + bool canBeInplaced(const NodePtr& parentNode, const NodePtr& childNode); // Method checks that after the sequential execution of Transpose and Reorder nodes, // the order of the elements in the memory (physical layout) will not change. bool checkAscendingFinalOrder(const VectorDims& transposeOrder, diff --git a/src/plugins/intel_cpu/src/nodes/pooling.cpp b/src/plugins/intel_cpu/src/nodes/pooling.cpp index 24e791dcb4b39b..d412cdaecda192 100644 --- a/src/plugins/intel_cpu/src/nodes/pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/pooling.cpp @@ -146,13 +146,15 @@ dnnl::pooling_forward::primitive_desc createDescriptorHelper(const dnnl::engine& bool Pooling::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (ov::is_type(op)) { + if (ov::is_type(op) || ov::is_type(op)) { if (!op->get_output_target_inputs(1).empty()) { - errorMessage = "MaxPool from opset8 is supported only with one output"; + errorMessage = "MaxPool from opset8 and opset14 is supported only with one output"; return false; } - } else if (!ov::is_type(op) && !ov::is_type(op)) { - errorMessage = "MaxPool and AvgPool from opset1 and MaxPool from opset8 are supported"; + } else if (!ov::is_type(op) && !ov::is_type(op) && + !ov::is_type(op) && !ov::is_type(op) && + !ov::is_type(op)) { + errorMessage = "Supported ops are MaxPool-1, MaxPool-8, MaxPool-14, AvgPool-1 and AvgPool-14"; return false; } } catch (...) 
{ @@ -174,47 +176,37 @@ Pooling::Pooling(const std::shared_ptr& op, const GraphContext::CPtr c } }; - if (auto maxPoolOp_v8 = ov::as_type_ptr(op)) { - isMaxPool8 = true; + if (auto maxPoolOpBase = ov::as_type_ptr(op)) { algorithm = Algorithm::PoolingMax; poolingAttrs.exclude_pad = false; - poolingAttrs.rounding = maxPoolOp_v8->get_rounding_type(); - poolingAttrs.pad_type = maxPoolOp_v8->get_auto_pad(); + poolingAttrs.rounding = maxPoolOpBase->get_rounding_type(); + poolingAttrs.pad_type = maxPoolOpBase->get_auto_pad(); + get_attributes(poolingAttrs.stride, maxPoolOpBase->get_strides()); + get_attributes(poolingAttrs.kernel, maxPoolOpBase->get_kernel()); + get_attributes(poolingAttrs.data_pad_begin, maxPoolOpBase->get_pads_begin()); + get_attributes(poolingAttrs.data_pad_end, maxPoolOpBase->get_pads_end()); + poolingAttrs.auto_pad = (poolingAttrs.pad_type == ov::op::PadType::SAME_LOWER || poolingAttrs.pad_type == ov::op::PadType::SAME_UPPER); + } + if (auto maxPoolOp_v14 = ov::as_type_ptr(op)) { + isNotMaxPool1 = true; + get_attributes(poolingAttrs.dilation, maxPoolOp_v14->get_dilations()); + } else if (auto maxPoolOp_v8 = ov::as_type_ptr(op)) { + isNotMaxPool1 = true; get_attributes(poolingAttrs.dilation, maxPoolOp_v8->get_dilations()); - get_attributes(poolingAttrs.stride, maxPoolOp_v8->get_strides()); - get_attributes(poolingAttrs.kernel, maxPoolOp_v8->get_kernel()); - get_attributes(poolingAttrs.data_pad_begin, maxPoolOp_v8->get_pads_begin()); - get_attributes(poolingAttrs.data_pad_end, maxPoolOp_v8->get_pads_end()); - - poolingAttrs.auto_pad = (maxPoolOp_v8->get_auto_pad() == ov::op::PadType::SAME_LOWER || maxPoolOp_v8->get_auto_pad() == ov::op::PadType::SAME_UPPER); } else if (auto maxPoolOp_v1 = ov::as_type_ptr(op)) { - algorithm = Algorithm::PoolingMax; - poolingAttrs.exclude_pad = false; - poolingAttrs.pad_type = maxPoolOp_v1->get_auto_pad(); - poolingAttrs.rounding = maxPoolOp_v1->get_rounding_type(); - - get_attributes(poolingAttrs.stride, maxPoolOp_v1->get_strides()); - get_attributes(poolingAttrs.kernel, maxPoolOp_v1->get_kernel()); - get_attributes(poolingAttrs.data_pad_begin, maxPoolOp_v1->get_pads_begin()); - get_attributes(poolingAttrs.data_pad_end, maxPoolOp_v1->get_pads_end()); poolingAttrs.dilation.resize(poolingAttrs.kernel.size(), 1); - - poolingAttrs.auto_pad = (maxPoolOp_v1->get_auto_pad() == ov::op::PadType::SAME_LOWER || maxPoolOp_v1->get_auto_pad() == ov::op::PadType::SAME_UPPER); - } else if (auto avgPoolOp = ov::as_type_ptr(op)) { + } else if (auto avgPoolOpBase = ov::as_type_ptr(op)) { algorithm = Algorithm::PoolingAvg; - poolingAttrs.exclude_pad = avgPoolOp->get_exclude_pad(); - poolingAttrs.rounding = avgPoolOp->get_rounding_type(); - - get_attributes(poolingAttrs.stride, avgPoolOp->get_strides()); - get_attributes(poolingAttrs.kernel, avgPoolOp->get_kernel()); - get_attributes(poolingAttrs.data_pad_begin, avgPoolOp->get_pads_begin()); - get_attributes(poolingAttrs.data_pad_end, avgPoolOp->get_pads_end()); + poolingAttrs.exclude_pad = avgPoolOpBase->get_exclude_pad(); + poolingAttrs.rounding = avgPoolOpBase->get_rounding_type(); + get_attributes(poolingAttrs.stride, avgPoolOpBase->get_strides()); + get_attributes(poolingAttrs.kernel, avgPoolOpBase->get_kernel()); + get_attributes(poolingAttrs.data_pad_begin, avgPoolOpBase->get_pads_begin()); + get_attributes(poolingAttrs.data_pad_end, avgPoolOpBase->get_pads_end()); poolingAttrs.dilation.resize(poolingAttrs.kernel.size(), 1); - - poolingAttrs.auto_pad = (avgPoolOp->get_auto_pad() == 
ov::op::PadType::SAME_LOWER || avgPoolOp->get_auto_pad() == ov::op::PadType::SAME_UPPER); + poolingAttrs.auto_pad = (avgPoolOpBase->get_auto_pad() == ov::op::PadType::SAME_LOWER || avgPoolOpBase->get_auto_pad() == ov::op::PadType::SAME_UPPER); } - poolingAttrs.algorithm = algorithm; } @@ -642,7 +634,7 @@ void Pooling::initSupportedPrimitiveDescriptors() { } // CPU plugin doesn't support second output of MaxPool-8, but anyway we should have out config for second port as stub - if (isMaxPool8) { + if (isNotMaxPool1) { const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); const auto outputPrecision = outConfs.front().getMemDesc()->getPrecision(); auto desc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(outputPrecision, getOutputShapeAtPort(1)); diff --git a/src/plugins/intel_cpu/src/nodes/pooling.h b/src/plugins/intel_cpu/src/nodes/pooling.h index e526f144a5cbb1..ffd96fc320036c 100644 --- a/src/plugins/intel_cpu/src/nodes/pooling.h +++ b/src/plugins/intel_cpu/src/nodes/pooling.h @@ -57,7 +57,7 @@ class Pooling : public Node { Shape inShape; - bool isMaxPool8 = false; + bool isNotMaxPool1 = false; bool useACL = false; }; diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 9079bc24862c38..c868792574cc81 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -265,11 +265,15 @@ void Reorder::createReorderPrimitive(const dnnl::memory::desc& srcDesc, const auto engine = getEngine(); src_blocked = std::make_shared(engine, DnnlExtensionUtils::makeDescriptor(srcDesc), srcPtr, false); - dst_blocked = std::make_shared(engine, DnnlExtensionUtils::makeDescriptor(dstDesc), dstPtr, false); - auto src_desc = src_blocked->getPrimitive().get_desc(); if (!src_permutation.empty()) { + CPU_NODE_ASSERT(src_permutation.size() == static_cast(src_desc.get_ndims()), + "src_permutation size (", + src_permutation.size(), + ") doesn't match with src_desc ndims(", + src_desc.get_ndims(), + ")"); // reorder requires exact matching of logical dimensions between src & dst // sometime we have to permute source's logical dimensions to satisfy // this requirement, this dosn't affect plugin's node input memory desc. 
@@ -304,11 +308,10 @@ void Reorder::createReorderPrimitive(const dnnl::memory::desc& srcDesc, src_desc = src_blocked->getPrimitive().get_desc(); } + DEBUG_LOG("CreateReorderPrimitive is called for node", getName(), " src desc: ", src_desc, " dst_desc: ", dst_desc); + CPU_NODE_ASSERT(src_desc.get_ndims() == dst_desc.get_ndims(), "OneDNN doesn't support reorder with different ranks."); auto result = getReorderPrim(context->getParamsCache(), getEngine(), src_desc, dst_desc); - if (!result) { - DEBUG_LOG("src desc: ", src_desc, " dst_desc: ", dst_desc); - THROW_CPU_NODE_ERR("could not create reorder primitive: unsupported reorder case."); - } + CPU_NODE_ASSERT(result, "could not create reorder primitive: unsupported reorder case."); prim = result; selectedPD->setImplementationType( diff --git a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp index 2f0fec7437570a..94589c668a198e 100644 --- a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp @@ -407,6 +407,8 @@ const IStaticShapeInferFactory::TRegistry IStaticShapeInferFactory::registry{ _OV_OP_SHAPE_INFER_MASK_REG(op::v15::Col2Im, ShapeInferTA, util::bit::mask(1, 2)), // opset14 _OV_OP_SHAPE_INFER_MASK_REG(opset14::Inverse, ShapeInferTA, util::bit::mask()), + _OV_OP_SHAPE_INFER_MASK_REG(opset14::MaxPool, ShapeInferPaddingTA, util::bit::mask()), + _OV_OP_SHAPE_INFER_MASK_REG(opset14::AvgPool, ShapeInferPaddingTA, util::bit::mask()), // opset13 _OV_OP_SHAPE_INFER_MASK_REG(opset13::Multinomial, ShapeInferTA, util::bit::mask(1)), _OV_OP_SHAPE_INFER_MASK_REG(opset13::ScaledDotProductAttention, ShapeInferTA, util::bit::mask(3, 5)), diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/snippets_mark_skipped.cpp index 85a758ffcbcbef..b3fbf93ce07c3a 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/snippets_mark_skipped.cpp @@ -184,7 +184,8 @@ bool isSuitableMiscParent(const std::shared_ptr &node) { ov::is_type(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node); // has a single output, connected to a single child const auto out = node->outputs(); const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 01a1ac3e7b47fa..b755062c7361f1 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -37,6 +37,7 @@ #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/control_flow/unroll_tensor_iterator.hpp" #include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" +#include "transformations/op_conversions/convert_avgpool_downgrade.hpp" #include "transformations/op_conversions/convert_batch_to_space.hpp" #include "transformations/op_conversions/convert_bitwise_to_logical_bool.hpp" #include "transformations/op_conversions/convert_broadcast_to_tiles.hpp" @@ -468,10 +469,18 @@ void Transformations::PreLpt(const std::vector& defaultPrecis manager, 
[](const_node_ptr& node) -> bool { const auto maxpool = std::dynamic_pointer_cast(node); - return !maxpool || maxpool->get_rounding_type() == ov::op::RoundingType::CEIL_TORCH; + return !maxpool || maxpool->get_rounding_type() == ov::op::RoundingType::CEIL_TORCH; }, ov::pass::ConvertMaxPool14ToMaxPool8); + CPU_SET_CALLBACK_COMMON( + manager, + [](const_node_ptr& node) -> bool { + const auto avgpool = std::dynamic_pointer_cast(node); + return !avgpool || avgpool->get_rounding_type() == ov::op::RoundingType::CEIL_TORCH; + }, + ov::pass::ConvertAvgPool14ToAvgPool1); + CPU_SET_CALLBACK_COMMON(manager, [](const_node_ptr &node) -> bool { std::string msg; diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.cpp index 65f48713185f59..7c4854dd334bcf 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.cpp @@ -11,7 +11,6 @@ using namespace CPUTestUtils; namespace ov { namespace test { - std::string PoolingLayerCPUTest::getTestCaseName(const testing::TestParamInfo& obj) { ov::test::poolSpecificParams basicParamsSet; InputShape inputShapes; @@ -122,6 +121,103 @@ void PoolingLayerCPUTest::SetUp() { function = makeNgraphFunction(inPrc, params, pooling, "PoolingCPU"); } +std::string AvgPoolingV14LayerCPUTest::getTestCaseName(const testing::TestParamInfo& obj) { + ov::test::poolSpecificParams basicParamsSet; + InputShape inputShapes; + ElementType inPrc; + bool isInt8; + CPUSpecificParams cpuParams; + fusingSpecificParams fusingParams; + ov::AnyMap additionalConfig; + std::tie(basicParamsSet, inputShapes, inPrc, isInt8, cpuParams, fusingParams, additionalConfig) = obj.param; + + utils::PoolingTypes poolType; + std::vector kernel, stride; + std::vector padBegin, padEnd; + ov::op::PadType padType; + ov::op::RoundingType roundingType; + bool excludePad; + std::tie(poolType, kernel, stride, padBegin, padEnd, roundingType, padType, excludePad) = basicParamsSet; + + std::ostringstream results; + results << "IS=("; + results << ov::test::utils::partialShape2str({inputShapes.first}) << ")_"; + results << "TS="; + for (const auto& shape : inputShapes.second) { + results << ov::test::utils::vec2str(shape) << "_"; + } + results << "Prc=" << inPrc << "_"; + results << "ExcludePad=" << excludePad << "_"; + results << "K" << ov::test::utils::vec2str(kernel) << "_"; + results << "S" << ov::test::utils::vec2str(stride) << "_"; + results << "PB" << ov::test::utils::vec2str(padBegin) << "_"; + results << "PE" << ov::test::utils::vec2str(padEnd) << "_"; + results << "Rounding=" << roundingType << "_"; + results << "AutoPad=" << padType << "_"; + results << "INT8=" << isInt8 << "_"; + if (!additionalConfig.empty()) { + results << "_PluginConf"; + for (auto& item : additionalConfig) { + results << "_" << item.first << "=" << item.second.as(); + } + } + + results << CPUTestsBase::getTestCaseName(cpuParams); + results << CpuTestWithFusing::getTestCaseName(fusingParams); + return results.str(); +} + +void AvgPoolingV14LayerCPUTest::SetUp() { + targetDevice = ov::test::utils::DEVICE_CPU; + + poolSpecificParams basicParamsSet; + InputShape inputShapes; + ElementType inPrc; + bool isInt8; + CPUSpecificParams cpuParams; + fusingSpecificParams fusingParams; + ov::AnyMap additionalConfig; + std::tie(basicParamsSet, inputShapes, inPrc, isInt8, cpuParams, fusingParams, 
additionalConfig) = this->GetParam(); + configuration.insert(additionalConfig.begin(), additionalConfig.end()); + + utils::PoolingTypes poolType; + std::vector kernel, stride; + std::vector padBegin, padEnd; + ov::op::PadType padType; + ov::op::RoundingType roundingType; + bool excludePad; + std::tie(poolType, kernel, stride, padBegin, padEnd, roundingType, padType, excludePad) = basicParamsSet; + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + std::tie(postOpMgrPtr, fusedOps) = fusingParams; + + if (selectedType.empty()) { + selectedType = getPrimitiveType(); + } + if (isInt8) + selectedType = selectedType + "_I8"; + else + selectedType = makeSelectedTypeStr(selectedType, deduce_expected_precision(inPrc, configuration)); + + init_input_shapes({inputShapes}); + + ov::ParameterVector params; + for (auto&& shape : inputDynamicShapes) { + params.push_back(std::make_shared(inPrc, shape)); + } + + std::shared_ptr poolInput = params[0]; + if (isInt8) { + abs_threshold = 2e-2; + ov::Shape newShape(poolInput->get_output_partial_shape(0).size(), 1); + poolInput = ov::test::utils::make_fake_quantize(poolInput, inPrc, 256, newShape); + } + + auto pooling = std::make_shared(poolInput, stride, padBegin, padEnd, kernel, excludePad, roundingType, padType); + + function = makeNgraphFunction(inPrc, params, pooling, "PoolingCPU"); +} + std::string MaxPoolingV8LayerCPUTest::getTestCaseName( const testing::TestParamInfo& obj) { maxPoolV8SpecificParams basicParamsSet; @@ -213,24 +309,125 @@ void MaxPoolingV8LayerCPUTest::SetUp() { function = std::make_shared(results, params, "MaxPooling"); } +std::string MaxPoolingV14LayerCPUTest::getTestCaseName( +const testing::TestParamInfo& obj) { + maxPoolV8SpecificParams basicParamsSet; + InputShape inputShapes; + ElementType inPrc; + CPUSpecificParams cpuParams; + ov::AnyMap additionalConfig; + std::tie(basicParamsSet, inputShapes, inPrc, cpuParams, additionalConfig) = obj.param; + + std::vector kernel, stride, dilation; + std::vector padBegin, padEnd; + ov::op::PadType padType; + ov::op::RoundingType roundingType; + ov::element::Type indexElementType; + int64_t axis; + std::tie(kernel, stride, dilation, padBegin, padEnd, indexElementType, axis, roundingType, padType) = + basicParamsSet; + + std::ostringstream results; + results << "IS=("; + results << ov::test::utils::partialShape2str({inputShapes.first}) << ")_"; + results << "TS="; + for (const auto& shape : inputShapes.second) { + results << ov::test::utils::vec2str(shape) << "_"; + } + results << "Prc=" << inPrc << "_"; + results << "MaxPool_"; + results << "K" << ov::test::utils::vec2str(kernel) << "_"; + results << "S" << ov::test::utils::vec2str(stride) << "_"; + results << "D" << ov::test::utils::vec2str(dilation) << "_"; + results << "PB" << ov::test::utils::vec2str(padBegin) << "_"; + results << "PE" << ov::test::utils::vec2str(padEnd) << "_"; + results << "Rounding=" << roundingType << "_"; + results << "AutoPad=" << padType << "_"; + if (!additionalConfig.empty()) { + results << "_PluginConf"; + for (auto& item : additionalConfig) { + results << "_" << item.first << "=" << item.second.as(); + } + } + + results << CPUTestsBase::getTestCaseName(cpuParams); + return results.str(); +} + +void MaxPoolingV14LayerCPUTest::SetUp() { + targetDevice = ov::test::utils::DEVICE_CPU; + + maxPoolV8SpecificParams basicParamsSet; + InputShape inputShapes; + ElementType inPrc; + CPUSpecificParams cpuParams; + ov::AnyMap additionalConfig; + std::tie(basicParamsSet, inputShapes, inPrc, cpuParams, 
additionalConfig) = this->GetParam(); + configuration.insert(additionalConfig.begin(), additionalConfig.end()); + + std::vector kernel, stride, dilation; + std::vector padBegin, padEnd; + ov::op::PadType padType; + ov::op::RoundingType roundingType; + ov::element::Type indexElementType; + int64_t axis; + std::tie(kernel, stride, dilation, padBegin, padEnd, indexElementType, axis, roundingType, padType) = + basicParamsSet; + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + if (selectedType.empty()) { + selectedType = getPrimitiveType(); + } + selectedType = makeSelectedTypeStr(selectedType, deduce_expected_precision(inPrc, configuration)); + + init_input_shapes({inputShapes}); + + ov::ParameterVector params; + for (auto&& shape : inputDynamicShapes) { + params.push_back(std::make_shared(inPrc, shape)); + } + auto pooling = std::make_shared(params[0], + stride, + dilation, + padBegin, + padEnd, + kernel, + roundingType, + padType, + indexElementType, + axis); + pooling->get_rt_info() = getCPUInfo(); + ov::ResultVector results{std::make_shared(pooling->output(0))}; + function = std::make_shared(results, params, "MaxPooling"); +} + TEST_P(PoolingLayerCPUTest, CompareWithRefs) { run(); CheckPluginRelatedResults(compiledModel, "Pooling"); } +TEST_P(AvgPoolingV14LayerCPUTest, CompareWithRefs) { + run(); + CheckPluginRelatedResults(compiledModel, "Pooling"); +} + TEST_P(MaxPoolingV8LayerCPUTest, CompareWithRefs) { run(); CheckPluginRelatedResults(compiledModel, "Pooling"); } +TEST_P(MaxPoolingV14LayerCPUTest, CompareWithRefs) { + run(); + CheckPluginRelatedResults(compiledModel, "Pooling"); +} + namespace Pooling { // The combination of parameters: NCHW + CEIL gives an accuracy problem in ACL AvgPool -const ov::op::RoundingType expectedAvgRoundingType() { +const ov::op::RoundingType expectedAvgRoundingType(const ov::op::RoundingType ceil_type) { #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) return ov::op::RoundingType::FLOOR; #else - return ov::op::RoundingType::CEIL; + return ceil_type; #endif } @@ -260,6 +457,21 @@ const std::vector& paramsMaxV83D() { return paramsMaxV83D; } +const std::vector& paramsMaxV143D() { + static const std::vector paramsMaxV143D = { + maxPoolV8SpecificParams{ {2}, {2}, {1}, {0}, {0}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_UPPER }, + maxPoolV8SpecificParams{ {2}, {2}, {1}, {0}, {0}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_LOWER }, + maxPoolV8SpecificParams{ {7}, {2}, {1}, {2}, {2}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::EXPLICIT}, + }; + return paramsMaxV143D; +} + const std::vector& paramsAvg3D() { static const std::vector paramsAvg3D = { poolSpecificParams{ utils::PoolingTypes::AVG, {3}, {1}, {1}, {0}, @@ -272,6 +484,20 @@ const std::vector& paramsAvg3D() { return paramsAvg3D; } +const std::vector& paramsAvgV143D() { + static const std::vector paramsAvgV143D = { + poolSpecificParams{ utils::PoolingTypes::AVG, {3}, {2}, {0}, {0}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::EXPLICIT, true }, + poolSpecificParams{ utils::PoolingTypes::AVG, {4}, {4}, {2}, {2}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::EXPLICIT, true }, + poolSpecificParams{ utils::PoolingTypes::AVG, {3}, {2}, {0}, {0}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::SAME_UPPER, true }, + poolSpecificParams{ utils::PoolingTypes::AVG, 
{4}, {4}, {2}, {2}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::SAME_LOWER, true }, + }; + return paramsAvgV143D; +} + const std::vector& inpOutPrecision() { static const std::vector inpOutPrecision = {ElementType::f32/*, ElementType::bf16*/}; return inpOutPrecision; @@ -305,6 +531,21 @@ const std::vector& paramsMaxV84D() { return paramsMaxV84D; } +const std::vector& paramsMaxV144D() { + static const std::vector paramsMaxV144D = { + maxPoolV8SpecificParams{ {2, 2}, {2, 2}, {1, 1}, {0, 0}, {0, 0}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_UPPER }, + maxPoolV8SpecificParams{ {2, 2}, {2, 2}, {1, 1}, {0, 0}, {0, 0}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_LOWER }, + maxPoolV8SpecificParams{ {11, 7}, {2, 2}, {1, 1}, {2, 2}, {2, 2}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::EXPLICIT}, + }; + return paramsMaxV144D; +} + const std::vector& inputShapes3D() { static const std::vector inputShapes3D = { { {}, {{3, 4, 64}} }, @@ -420,6 +661,21 @@ const std::vector& paramsMaxV85D() { return paramsMaxV85D; } +const std::vector& paramsMaxV145D() { + static const std::vector paramsMaxV145DCeilTorch = { + maxPoolV8SpecificParams{ {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_UPPER }, + maxPoolV8SpecificParams{ {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_LOWER }, + maxPoolV8SpecificParams{ {7, 11, 6}, {2, 2, 2}, {1, 1, 1}, {2, 2, 2}, {2, 2, 2}, + ov::element::Type_t::i32, 0, + ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::EXPLICIT }, + }; + return paramsMaxV145DCeilTorch; +} + const std::vector& paramsAvg4D() { static const std::vector paramsAvg4D = { poolSpecificParams{ utils::PoolingTypes::AVG, {2, 2}, {2, 2}, {1, 0}, {0, 0}, @@ -438,6 +694,18 @@ const std::vector& paramsAvg4D() { return paramsAvg4D; } +const std::vector& paramsAvgV144D() { + static const std::vector paramsAvgV144D = { + poolSpecificParams{ utils::PoolingTypes::AVG, {2, 2}, {2, 2}, {1, 0}, {0, 0}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::SAME_UPPER, false }, + poolSpecificParams{ utils::PoolingTypes::AVG, {2, 2}, {2, 2}, {0, 0}, {0, 0}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::SAME_LOWER, true }, + poolSpecificParams{ utils::PoolingTypes::AVG, {4, 4}, {4, 4}, {2, 2}, {2, 2}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::EXPLICIT, true }, + }; + return paramsAvgV144D; +} + const std::vector& paramsAvg5D() { static const std::vector paramsAvg5D = { poolSpecificParams{ utils::PoolingTypes::AVG, {2, 2, 2}, {2, 2, 2}, {1, 0, 0}, {0, 0, 0}, @@ -458,6 +726,18 @@ const std::vector& paramsAvg5D() { return paramsAvg5D; } +const std::vector& paramsAvgV145D() { + static const std::vector paramsAvgV145D = { + poolSpecificParams{ utils::PoolingTypes::AVG, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::SAME_UPPER, true }, + poolSpecificParams{ utils::PoolingTypes::AVG, {3, 3, 3}, {3, 3, 3}, {1, 1, 1}, {0, 0, 0}, + expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::SAME_LOWER, true }, + poolSpecificParams{ utils::PoolingTypes::AVG, {4, 4, 4}, {2, 2, 2}, {2, 2, 2}, {2, 2, 2}, + 
expectedAvgRoundingType(ov::op::RoundingType::CEIL_TORCH), ov::op::PadType::EXPLICIT, true }, + }; + return paramsAvgV145D; +} + const std::vector& paramsMax5D() { static const std::vector paramsMax5D = { poolSpecificParams{ utils::PoolingTypes::MAX, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.hpp index a1d04c633e94f1..6b77a65cd152bb 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/pooling.hpp @@ -38,38 +38,63 @@ class PoolingLayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, public CpuTestWithFusing { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void SetUp() override; +}; + class MaxPoolingV8LayerCPUTest : public testing::WithParamInterface, virtual public SubgraphBaseTest, public CPUTestsBase { public: static std::string getTestCaseName(const testing::TestParamInfo& obj); +protected: + void SetUp() override; +}; +class MaxPoolingV14LayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); protected: void SetUp() override; }; namespace Pooling { const std::vector& inpOutPrecision(); -const ov::op::RoundingType expectedAvgRoundingType(); +const ov::op::RoundingType expectedAvgRoundingType(const ov::op::RoundingType ceil_type = ov::op::RoundingType::CEIL); const CPUSpecificParams& expectedCpuConfigAnyLayout(); const std::vector& vecCpuConfigsFusing_4D(); const std::vector& paramsMax3D(); const std::vector& paramsAvg3D(); +const std::vector& paramsAvgV143D(); + +const std::vector& paramsMax3D(); const std::vector& paramsMax4D(); const std::vector& paramsMaxV83D(); const std::vector& paramsMaxV84D(); const std::vector& paramsMaxV85D(); +const std::vector& paramsMaxV143D(); +const std::vector& paramsMaxV144D(); +const std::vector& paramsMaxV145D(); + const std::vector& inputShapes3D(); const std::vector& inputShapes4D(); const std::vector& inputShapes4D_Large(); const std::vector& inputShapes5D(); const std::vector& paramsAvg4D(); +const std::vector& paramsAvgV144D(); const std::vector& paramsAvg4D_Large(); const std::vector& paramsAvg5D(); +const std::vector& paramsAvgV145D(); const std::vector& paramsMax5D(); } // namespace Pooling } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/pooling.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/pooling.cpp index 8e8deeef20dbee..87f86d78197871 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/pooling.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/pooling.cpp @@ -53,6 +53,39 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_3D_NotOptimized, PoolingLayerCPUTest, ::testing::Values(CPUTestUtils::empty_plugin_config)), PoolingLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_3D, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg3D()), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + 
::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14CeilTorch_CPU_3D, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvgV143D()), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_3D_NotOptimized, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg3D_RefOnly), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::Values(expectedCpuConfigAnyLayout()), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + const std::vector paramsAvg4D_RefOnly = { poolSpecificParams{ ov::test::utils::PoolingTypes::AVG, {2, 2}, {2, 2}, {2, 2}, {2, 2}, expectedAvgRoundingType(), ov::op::PadType::EXPLICIT, false }, @@ -101,6 +134,50 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_4D_NotOptimized, PoolingLayerCPUTest, PoolingLayerCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_Large, PoolingLayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg4D_Large()), + ::testing::ValuesIn(inputShapes4D_Large()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + PoolingLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_4D, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg4D()), + ::testing::ValuesIn(inputShapes4D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14CeilTorch_CPU_4D, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvgV144D()), + ::testing::ValuesIn(inputShapes4D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_4D_NotOptimized, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg4D_RefOnly), + ::testing::ValuesIn(inputShapes4D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::Values(expectedCpuConfigAnyLayout()), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_Large, AvgPoolingV14LayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsAvg4D_Large()), ::testing::ValuesIn(inputShapes4D_Large()), @@ -159,6 +236,33 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV8_CPU_5D_ref, MaxPoolingV8LayerCPUTest, ::testing::Values(CPUTestUtils::empty_plugin_config)), MaxPoolingV8LayerCPUTest::getTestCaseName); 
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_5D, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV85D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_5D_ceil_torch, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV145D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_5D_ref, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV85D_ref), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(expectedCpuConfigAnyLayout()), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_5D, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsAvg5D()), @@ -170,6 +274,28 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_5D, PoolingLayerCPUTest, ::testing::Values(CPUTestUtils::empty_plugin_config)), PoolingLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_5D, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg5D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14CeilTorch_CPU_5D, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvgV145D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::ValuesIn(vecCpuConfigs), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_5D_NotOptimized, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsAvg5D_RefOnly), @@ -180,6 +306,19 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_5D_NotOptimized, PoolingLayerCPUTest, ::testing::Values(emptyFusingSpec), ::testing::Values(CPUTestUtils::empty_plugin_config)), PoolingLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_5D_NotOptimized, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg5D_RefOnly), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::Values(false), + ::testing::Values(expectedCpuConfigAnyLayout()), + ::testing::Values(emptyFusingSpec), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + #endif } // namespace Pooling } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/pooling.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/pooling.cpp index 4127d0c9deddd8..cfe29692f8414c 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/pooling.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/pooling.cpp @@ -168,6 +168,39 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_5D_I8_FP16, PoolingLayerCPUTest, ::testing::Values(cpu_f16_plugin_config)), PoolingLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_5D_I8, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg5D()), + ::testing::ValuesIn(inputShapes5D_int8), + ::testing::Values(ElementType::f32), + ::testing::Values(true), + ::testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigsFusing_5D)), + ::testing::ValuesIn(fusingParamsSet), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_4D_I8_FP16, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg4D()), + ::testing::ValuesIn(inputShapes4D_int8), + ::testing::Values(ElementType::f32), + ::testing::Values(true), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigsFusing_4D)), + ::testing::ValuesIn(fusingParamsSet), + ::testing::Values(cpu_f16_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_5D_I8_FP16, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg5D()), + ::testing::ValuesIn(inputShapes5D_int8), + ::testing::Values(ElementType::f32), + ::testing::Values(true), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigsFusing_5D)), + ::testing::ValuesIn(fusingParamsSet), + ::testing::Values(cpu_f16_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_CPU_3D_FP16, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsMax3D()), @@ -190,6 +223,28 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_3D_FP16, PoolingLayerCPUTest, ::testing::Values(cpu_f16_plugin_config)), PoolingLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_3D_FP16, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg3D()), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::Values(false), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(emptyFusingSpec), + ::testing::Values(cpu_f16_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_3D_FP16_Ceil_Torch, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvgV143D()), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::Values(false), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(emptyFusingSpec), + ::testing::Values(cpu_f16_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_CPU_4D_FP16, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsMax4D()), @@ -210,6 +265,46 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV8_CPU_4D_FP16, MaxPoolingV8LayerCPUTest, ::testing::Values(cpu_f16_plugin_config)), MaxPoolingV8LayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_4D_FP16, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV84D()), + ::testing::ValuesIn(inputShapes4D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + 
::testing::Values(cpu_f16_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_4D_FP16_Ceil_Torch, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV144D()), + ::testing::ValuesIn(inputShapes4D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(cpu_f16_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_4D_FP16, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg4D()), + ::testing::ValuesIn(inputShapes4D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::Values(false), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(emptyFusingSpec), + ::testing::Values(cpu_f16_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_AvgPoolV14_CPU_Large_FP16, AvgPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsAvg4D_Large()), + ::testing::ValuesIn(inputShapes4D_Large()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::Values(false), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(emptyFusingSpec), + ::testing::Values(cpu_f16_plugin_config)), + AvgPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_CPU_4D_FP16, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsAvg4D()), @@ -252,6 +347,24 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV8_CPU_5D_FP16, MaxPoolingV8LayerCPUTest, ::testing::Values(cpu_f16_plugin_config)), MaxPoolingV8LayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_5D_FP16, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV85D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(cpu_f16_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_5D_FP16_Ceil_Torch, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV145D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn(inpOutPrecision()), + ::testing::ValuesIn(filterCPUInfoForDeviceWithFP16(vecCpuConfigs)), + ::testing::Values(cpu_f16_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_CPU_3D, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsMax3D()), @@ -272,6 +385,24 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV8_CPU_3D, MaxPoolingV8LayerCPUTest, ::testing::Values(CPUTestUtils::empty_plugin_config)), MaxPoolingV8LayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_3D, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV83D()), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigsFusing_3D)), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_3D_Ceil_Torch, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV143D()), + ::testing::ValuesIn(inputShapes3D()), + ::testing::ValuesIn((inpOutPrecision())), + 
::testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigsFusing_3D)), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_CPU_5D, PoolingLayerCPUTest, ::testing::Combine( ::testing::ValuesIn(paramsMax5D()), @@ -291,6 +422,24 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV8_CPU_5D, MaxPoolingV8LayerCPUTest, ::testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigsFusing_5D)), ::testing::Values(CPUTestUtils::empty_plugin_config)), MaxPoolingV8LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_5D, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV85D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigsFusing_5D)), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14CeilTorch_CPU_5D, MaxPoolingV14LayerCPUTest, + ::testing::Combine( + ::testing::ValuesIn(paramsMaxV145D()), + ::testing::ValuesIn(inputShapes5D()), + ::testing::ValuesIn((inpOutPrecision())), + ::testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigsFusing_5D)), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MaxPoolingV14LayerCPUTest::getTestCaseName); } // namespace } // namespace Pooling } // namespace test diff --git a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp index ddb2ba1105fe93..09fb028a4fa5bf 100644 --- a/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/graph/merge_transpose_reorder_test.cpp @@ -5,17 +5,17 @@ #include +#include "common_test_utils/node_builders/constant.hpp" #include "dummy_node.hpp" #include "graph.h" #include "nodes/input.h" #include "nodes/reorder.h" +#include "nodes/reshape.h" #include "nodes/transpose.h" - -#include "openvino/op/transpose.hpp" -#include "openvino/op/result.hpp" #include "openvino/op/parameter.hpp" - -#include "common_test_utils/node_builders/constant.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/transpose.hpp" using namespace ov::intel_cpu; using LOOK = Edge::LOOK; @@ -37,6 +37,27 @@ struct MergeTransposeReorderTestParam { using MergeTransposeReorderTestParams = std::tuple; +/* graph topology + ┌───────┐ + │ Input │ + └───┬───┘ + │ + ┌───┴───┐ + │ Dummy │ <*NOTE: fake node with firstNodeLayout, and firstNodeInplaceDirection*> + └───┬───┘ + │ + ┌────┴────┐ + │Transpose│ <*NOTE: Reorder is inserted before/after Transpose depending on first/second node layouts.*> + └────┬────┘ + │ + ┌───┴───┐ + │ Dummy │ <*NOTE: fake node with lastNodeLayout, and lastNodeInplaceDirection*> + └───┬───┘ + │ + ┌────┴───┐ + │ Output │ + └────────┘ +*/ class MergeTransposeReorderCPUTest : public testing::WithParamInterface, public ov::test::TestsCommon { public: @@ -50,49 +71,27 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface(GetParam()); const auto& params = std::get<1>(GetParam()); - CreateGraph(shape, - params.firstNodeLayout, - params.firstNodeInplaceDirection, - params.lastNodeLayout, - params.lastNodeInplaceDirection, - params.num_consumers); - } - - /* graph topology - ┌───────┐ - │ Input │ - └───┬───┘ - │ - ┌───┴───┐ - │ Dummy │ <*NOTE: fake node with firstNodeLayout, and firstNodeInplaceDirection*> - └───┬───┘ - │ - 
┌────┴────┐ - │Transpose│ <*NOTE: Reorder is inserted before/after Transpose depending on first/second node layouts.*> - └────┬────┘ - │ - ┌───┴───┐ - │ Dummy │ <*NOTE: fake node with lastNodeLayout, and lastNodeInplaceDirection*> - └───┬───┘ - │ - ┌────┴───┐ - │ Output │ - └────────┘ - */ - void CreateGraph(const ov::Shape& testShape, - LayoutType firstNodeLayout, - LOOK firstNodeInplaceDirection, - LayoutType lastNodeLayout, - LOOK lastNodeInplaceDirection, - size_t num_consumers) { - Config conf; - conf.rtCacheCapacity = 100; - auto context = std::make_shared(conf, nullptr, false); - const dnnl::engine cpuEngine = context->getEngine(); + OPENVINO_ASSERT(shape.size() == 4 || shape.size() == 3, + "MergeTransposeReorderCPUTest doesn't support shape", shape, + ". Only 4D and 3D shapes are supported"); + m_context = std::make_shared(Config(), nullptr, false); + const auto replication_result = CreateModelAndReplicate(shape, + params.firstNodeLayout, + params.firstNodeInplaceDirection, + params.lastNodeLayout, + params.lastNodeInplaceDirection, + params.num_consumers); m_graph = std::unique_ptr(new Graph()); + m_graph->CreateGraph(replication_result.first, replication_result.second, m_context, "fused_graph"); + } + virtual std::pair, std::vector> CreateModelAndReplicate(const ov::Shape& testShape, + LayoutType firstNodeLayout, + LOOK firstNodeInplaceDirection, + LayoutType lastNodeLayout, + LOOK lastNodeInplaceDirection, + size_t num_consumers) { const auto precision = ov::element::f32; - OPENVINO_ASSERT(testShape.size() == 4 || testShape.size() == 3, "Only 4D and 3D shapes are supported"); // ov::Model with only a transpose node ov::ParameterVector params{std::make_shared(precision, testShape)}; auto order = testShape.size() == 4 ? std::vector{0, 3, 1, 2} : std::vector{0, 2, 1}; @@ -103,52 +102,46 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface(transpose)); // Replicate - auto replicate = [&](std::vector &nodes, std::vector &edges) -> void { - std::unordered_set nodesSet; - - auto addEdge = [&](const NodePtr& parent, const NodePtr& child, size_t parentPort, size_t childPort) -> void { - auto edge = std::make_shared(parent, child, parentPort, childPort); - Node::addEdge(edge); - edges.push_back(edge); - nodesSet.insert(parent); - nodesSet.insert(child); - }; - - auto inputNode = std::make_shared(params[0], context); - - auto dummyNode1 = std::make_shared( - testShape, precision, "reshape", "DummyNode", context, firstNodeLayout, firstNodeInplaceDirection); - - auto orderNode = std::make_shared(constOrder, context); - auto transposeNode = std::make_shared(transpose, context); - transposeNode->filterSupportedPrimitiveDescriptors(); - - addEdge(inputNode, dummyNode1, 0, 0); - addEdge(dummyNode1, transposeNode, 0, 0); - addEdge(orderNode, transposeNode, 0, 1); - - const auto& transpose_shape = transpose->get_output_shape(0); - for (size_t i = 0; i < num_consumers; i++) { - auto dummyConsumer = std::make_shared(transpose_shape, - precision, - "multiply", - "DummyNode", - context, - lastNodeLayout, - lastNodeInplaceDirection); - auto outputNode = std::make_shared(results[i], context); - addEdge(transposeNode, dummyConsumer, 0, 0); - addEdge(dummyConsumer, outputNode, 0, 0); - } + std::vector nodes; + std::vector edges; + std::unordered_set nodesSet; - for (auto &node : nodesSet) nodes.emplace_back(node); + auto addEdge = [&](const NodePtr& parent, const NodePtr& child, size_t parentPort, size_t childPort) -> void { + auto edge = std::make_shared(parent, child, parentPort, 
childPort); + Node::addEdge(edge); + edges.push_back(edge); + nodesSet.insert(parent); + nodesSet.insert(child); }; - std::vector graphNodes; - std::vector graphEdges; - replicate(graphNodes, graphEdges); + auto inputNode = std::make_shared(params[0], m_context); + + auto dummyNode1 = std::make_shared( + testShape, precision, "reshape", "DummyNode", m_context, firstNodeLayout, firstNodeInplaceDirection); + + auto orderNode = std::make_shared(constOrder, m_context); + auto transposeNode = std::make_shared(transpose, m_context); + transposeNode->filterSupportedPrimitiveDescriptors(); - m_graph->CreateGraph(graphNodes, graphEdges, context, "fused_graph"); + addEdge(inputNode, dummyNode1, 0, 0); + addEdge(dummyNode1, transposeNode, 0, 0); + addEdge(orderNode, transposeNode, 0, 1); + + const auto& transpose_shape = transpose->get_output_shape(0); + for (size_t i = 0; i < num_consumers; i++) { + auto dummyConsumer = std::make_shared(transpose_shape, + precision, + "multiply", + "DummyNode", + m_context, + lastNodeLayout, + lastNodeInplaceDirection); + auto outputNode = std::make_shared(results[i], m_context); + addEdge(transposeNode, dummyConsumer, 0, 0); + addEdge(dummyConsumer, outputNode, 0, 0); + } + for (auto &node : nodesSet) nodes.emplace_back(node); + return {nodes, edges}; } void CheckTransposeCount(size_t ref_transpose_count) const { @@ -176,14 +169,110 @@ class MergeTransposeReorderCPUTest : public testing::WithParamInterface m_context; std::unique_ptr m_graph; }; // class MergeTransposeReorderCPUTest +/* + ┌───────┐ + │ Input │ + └───┬───┘ + │ + ┌───┴───┐ + │ Dummy │ + └───┬───┘ + │ + ┌───┴───┐ + │Reshape│ + └───┬───┘ + │ +┌────┴────┐ +│Transpose│ +└────┬────┘ + │ + ┌───┴───┐ + │ Dummy │ + └───┬───┘ + │ +┌────┴───┐ +│ Output │ +└────────┘ + */ +class MergeTransposeReorderWithReshapeCPUTest : public MergeTransposeReorderCPUTest { + std::pair, std::vector> CreateModelAndReplicate(const ov::Shape& testShape, + LayoutType firstNodeLayout, + LOOK firstNodeInplaceDirection, + LayoutType lastNodeLayout, + LOOK lastNodeInplaceDirection, + size_t num_consumers) override { + const auto precision = ov::element::f32; + const auto param = std::make_shared(precision, testShape); + auto reshape_const = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{0, 0, -1}); + auto reshape = std::make_shared(param, reshape_const, true); + auto order = std::vector{0, 2, 1}; + auto transpose_order = std::make_shared(ov::element::i32, ov::Shape{order.size()}, order); + auto transpose = std::make_shared(reshape, transpose_order); + ov::ResultVector results; + for (size_t i = 0; i < num_consumers; i++) + results.push_back(std::make_shared(transpose)); + + // Replicate + std::vector nodes; + std::vector edges; + std::unordered_set nodesSet; + + auto addEdge = [&](const NodePtr& parent, const NodePtr& child, size_t parentPort, size_t childPort) -> void { + auto edge = std::make_shared(parent, child, parentPort, childPort); + Node::addEdge(edge); + edges.push_back(edge); + nodesSet.insert(parent); + nodesSet.insert(child); + }; + + auto inputNode = std::make_shared(param, m_context); + auto dummyNode1 = std::make_shared( + testShape, precision, "before_reshape", "DummyNode", m_context, LayoutType::nspc, LOOK::LOOK_UP); + + auto reshapeConstNode = std::make_shared(reshape_const, m_context); + auto reshapeNode = std::make_shared(reshape, m_context); + + auto orderNode = std::make_shared(transpose_order, m_context); + auto transposeNode = std::make_shared(transpose, m_context); + 
transposeNode->filterSupportedPrimitiveDescriptors(); + + addEdge(inputNode, dummyNode1, 0, 0); + addEdge(dummyNode1, reshapeNode, 0, 0); + addEdge(reshapeNode, transposeNode, 0, 0); + addEdge(reshapeConstNode, reshapeNode, 0, 1); + addEdge(orderNode, transposeNode, 0, 1); + + const auto& transpose_shape = transpose->get_output_shape(0); + for (size_t i = 0; i < num_consumers; i++) { + auto dummyConsumer = std::make_shared(transpose_shape, + precision, + "multiply", + "DummyNode", + m_context, + LayoutType::ncsp, + LOOK::LOOK_DOWN); + auto outputNode = std::make_shared(results[i], m_context); + addEdge(transposeNode, dummyConsumer, 0, 0); + addEdge(dummyConsumer, outputNode, 0, 0); + } + for (auto &node : nodesSet) nodes.emplace_back(node); + return {nodes, edges}; + } +}; + TEST_P(MergeTransposeReorderCPUTest, smoke_Run_MergeTransposeReorder) { Validate(); } +TEST_P(MergeTransposeReorderWithReshapeCPUTest, smoke_Run_MergeTransposeReorderWithReshape) { + Validate(); +} + +namespace { const std::vector input_shapes{{1, 3, 8, 16}, {3, 8, 16}}; const std::vector test_params = { @@ -201,3 +290,19 @@ const std::vector test_params = { INSTANTIATE_TEST_SUITE_P(smoke_Run_MergeTransposeReorder, MergeTransposeReorderCPUTest, ::testing::Combine(::testing::ValuesIn(input_shapes), ::testing::ValuesIn(test_params))); + +const std::vector input_shapes_with_reshape{{1, 64, 128, 128}}; + +const std::vector test_params_with_reshape = { + // In case of non optimized reorder OneDNN primitive is used, + // which doesn't support reordering in case of different ranks on input and output. + // So the fusion is skipped for such case. + {LayoutType::nspc, LOOK::LOOK_UP, LayoutType::ncsp, LOOK::LOOK_DOWN, 1, Result{1, 0, 2}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Run_MergeTransposeReorderWithReshape, + MergeTransposeReorderWithReshapeCPUTest, + ::testing::Combine(::testing::ValuesIn(input_shapes_with_reshape), + ::testing::ValuesIn(test_params_with_reshape))); + +} // namespace \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/avg_pool_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/avg_pool_shape_inference_test.cpp index 6b145faf3b93a6..25e65d7bc680c6 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/avg_pool_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/avg_pool_shape_inference_test.cpp @@ -12,34 +12,32 @@ using namespace ov; using namespace ov::intel_cpu; using namespace testing; -class AvgPoolV1StaticShapeInferenceTest : public OpStaticShapeInferenceTest { -protected: - void SetUp() override { - output_shapes.resize(1); - } -}; - -TEST_F(AvgPoolV1StaticShapeInferenceTest, default_ctor) { - op = make_op(); - op->set_strides({1, 1}); - op->set_pads_begin({2, 2}); - op->set_pads_end({2, 1}); - op->set_kernel({3, 2}); - op->set_rounding_type(op::RoundingType::FLOOR); - op->set_auto_pad(op::PadType::VALID); - - input_shapes = ShapeVector{{1, 3, 10, 12}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - - EXPECT_EQ(output_shapes.size(), 1); - EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 8, 11})); +template +class AvgPoolCommonStaticShapeInferenceTest : public OpStaticShapeInferenceTest {}; + +TYPED_TEST_SUITE_P(AvgPoolCommonStaticShapeInferenceTest); + +TYPED_TEST_P(AvgPoolCommonStaticShapeInferenceTest, default_ctor) { + this->op 
= this->make_op(); + this->op->set_strides({1, 1}); + this->op->set_pads_begin({2, 2}); + this->op->set_pads_end({2, 1}); + this->op->set_kernel({3, 2}); + this->op->set_rounding_type(op::RoundingType::FLOOR); + this->op->set_auto_pad(op::PadType::VALID); + + this->input_shapes = ShapeVector{{1, 3, 10, 12}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 8, 11})); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({0, 0})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({0, 0})); } -TEST_F(AvgPoolV1StaticShapeInferenceTest, no_auto_pad_round_floor) { +TYPED_TEST_P(AvgPoolCommonStaticShapeInferenceTest, no_auto_pad_round_floor) { const auto data = std::make_shared(element::f64, PartialShape{-1, -1, -1, -1}); const Strides strides{1, 1}; @@ -49,20 +47,20 @@ TEST_F(AvgPoolV1StaticShapeInferenceTest, no_auto_pad_round_floor) { const auto rounding_mode = op::RoundingType::FLOOR; const auto pad_type = op::PadType::EXPLICIT; - op = make_op(data, strides, pads_begin, pads_end, kernel_shape, false, rounding_mode, pad_type); + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, false, rounding_mode, pad_type); - input_shapes = ShapeVector{{1, 3, 10, 12}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + this->input_shapes = ShapeVector{{1, 3, 10, 12}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - EXPECT_EQ(output_shapes.size(), 1); - EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 12, 14})); + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 12, 14})); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({2, 2})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({2, 1})); } -TEST_F(AvgPoolV1StaticShapeInferenceTest, auto_padding_same_lower_round_ceil) { +TYPED_TEST_P(AvgPoolCommonStaticShapeInferenceTest, auto_padding_same_lower_round_ceil) { const auto data = std::make_shared(element::f64, PartialShape::dynamic()); const Strides strides{1, 3, 2}; @@ -72,20 +70,20 @@ TEST_F(AvgPoolV1StaticShapeInferenceTest, auto_padding_same_lower_round_ceil) { const auto rounding_mode = op::RoundingType::CEIL; const auto pad_type = op::PadType::SAME_LOWER; - op = make_op(data, strides, pads_begin, pads_end, kernel_shape, false, rounding_mode, pad_type); + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, false, rounding_mode, pad_type); - input_shapes = ShapeVector{{1, 3, 10, 12, 20}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + this->input_shapes = ShapeVector{{1, 3, 10, 12, 20}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - EXPECT_EQ(output_shapes.size(), 1); - 
EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 10, 4, 10})); + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 10, 4, 10})); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({2, 1, 2})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({2, 1, 1})); } -TEST_F(AvgPoolV1StaticShapeInferenceTest, auto_padding_same_upper_round_floor_exclude_pad) { +TYPED_TEST_P(AvgPoolCommonStaticShapeInferenceTest, auto_padding_same_upper_round_floor_exclude_pad) { const auto data = std::make_shared(element::f64, PartialShape::dynamic()); const Strides strides{1, 3, 2}; @@ -95,20 +93,20 @@ TEST_F(AvgPoolV1StaticShapeInferenceTest, auto_padding_same_upper_round_floor_ex const auto rounding_mode = op::RoundingType::FLOOR; const auto pad_type = op::PadType::SAME_UPPER; - op = make_op(data, strides, pads_begin, pads_end, kernel_shape, true, rounding_mode, pad_type); + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, true, rounding_mode, pad_type); - input_shapes = ShapeVector{{1, 3, 10, 12, 20}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + this->input_shapes = ShapeVector{{1, 3, 10, 12, 20}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - EXPECT_EQ(output_shapes.size(), 1); - EXPECT_EQ(output_shapes.front(), StaticShape({1, 3, 10, 4, 10})); + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 10, 4, 10})); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({2, 1, 1})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({2, 1, 2})); } -TEST_F(AvgPoolV1StaticShapeInferenceTest, 5d_auto_padding_same_upper_round_floor) { +TYPED_TEST_P(AvgPoolCommonStaticShapeInferenceTest, auto_padding_same_upper_round_floor) { const auto data = std::make_shared(element::f64, PartialShape::dynamic()); const Strides strides{1, 1, 1}; @@ -118,15 +116,90 @@ TEST_F(AvgPoolV1StaticShapeInferenceTest, 5d_auto_padding_same_upper_round_floor const auto rounding_mode = op::RoundingType::FLOOR; const auto pad_type = op::PadType::SAME_UPPER; - op = make_op(data, strides, pads_begin, pads_end, kernel_shape, true, rounding_mode, pad_type); + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, true, rounding_mode, pad_type); - input_shapes = ShapeVector{{32, 32, 2, 2, 4}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + this->input_shapes = ShapeVector{{32, 32, 2, 2, 4}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - EXPECT_EQ(output_shapes.size(), 1); - EXPECT_EQ(output_shapes.front(), StaticShape({32, 32, 2, 2, 4})); + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({32, 32, 2, 2, 4})); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({0, 0, 0})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({1, 1, 1})); } + 
+REGISTER_TYPED_TEST_SUITE_P(AvgPoolCommonStaticShapeInferenceTest, + default_ctor, + no_auto_pad_round_floor, + auto_padding_same_lower_round_ceil, + auto_padding_same_upper_round_floor_exclude_pad, + auto_padding_same_upper_round_floor); + +using AvgPoolOpTypes = Types; +INSTANTIATE_TYPED_TEST_SUITE_P(StaticShapeInferenceTest, AvgPoolCommonStaticShapeInferenceTest, AvgPoolOpTypes); + +class AvgPoolV14StaticShapeInferenceTest : public OpStaticShapeInferenceTest {}; + +TEST_F(AvgPoolV14StaticShapeInferenceTest, explicit_padding_ceil_torch) { + const auto data = std::make_shared(element::f64, PartialShape::dynamic()); + + const Strides strides{2, 2}; + const ov::Shape pads_begin{1, 1}; + const ov::Shape pads_end{1, 1}; + const ov::Shape kernel_shape{2, 2}; + const auto rounding_mode = op::RoundingType::CEIL_TORCH; + const auto pad_type = op::PadType::EXPLICIT; + + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, true, rounding_mode, pad_type); + + this->input_shapes = ShapeVector{{1, 3, 9, 9}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 5, 5})); +} + +TEST_F(AvgPoolV14StaticShapeInferenceTest, explicit_padding_ceil_torch_no_strides) { + const auto data = std::make_shared(element::f64, PartialShape::dynamic()); + + const Strides strides{1, 1}; + const ov::Shape pads_begin{1, 1}; + const ov::Shape pads_end{1, 1}; + const ov::Shape kernel_shape{2, 2}; + const auto rounding_mode = op::RoundingType::CEIL_TORCH; + const auto pad_type = op::PadType::EXPLICIT; + + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, false, rounding_mode, pad_type); + + this->input_shapes = ShapeVector{{1, 3, 9, 9}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 10, 10})); +} + +TEST_F(AvgPoolV14StaticShapeInferenceTest, auto_padding_ceil_torch) { + const auto data = std::make_shared(element::f64, PartialShape::dynamic()); + + const Strides strides{1, 1}; + const ov::Shape pads_begin{1, 1}; + const ov::Shape pads_end{1, 1}; + const ov::Shape kernel_shape{2, 2}; + const auto rounding_mode = op::RoundingType::CEIL_TORCH; + const auto pad_type = op::PadType::SAME_LOWER; + + this->op = this->make_op(data, strides, pads_begin, pads_end, kernel_shape, false, rounding_mode, pad_type); + + this->input_shapes = ShapeVector{{1, 3, 9, 9}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + EXPECT_EQ(this->output_shapes.size(), 1); + EXPECT_EQ(this->output_shapes.front(), StaticShape({1, 3, 9, 9})); +} diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/max_pool_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/max_pool_shape_inference_test.cpp index e1395b529e7558..97beda20917414 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/max_pool_shape_inference_test.cpp +++ 
b/src/plugins/intel_cpu/tests/unit/shape_inference_test/max_pool_shape_inference_test.cpp @@ -85,35 +85,76 @@ TEST_F(MaxPoolV1StaticShapeInferenceTest, auto_padding_same_lower_round_ceil) { EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({2, 1, 1})); } -class MaxPoolV8StaticShapeInferenceTest : public OpStaticShapeInferenceTest { +class MaxPoolV14StaticShapeInferenceTest : public OpStaticShapeInferenceTest { protected: void SetUp() override { output_shapes.resize(2); } }; -TEST_F(MaxPoolV8StaticShapeInferenceTest, default_ctor) { - op = make_op(); - op->set_strides({1, 1}); - op->set_pads_begin({2, 2}); - op->set_pads_end({2, 1}); - op->set_kernel({3, 2}); - op->set_dilations({2, 1}); - op->set_rounding_type(op::RoundingType::FLOOR); - op->set_auto_pad(op::PadType::VALID); +TEST_F(MaxPoolV14StaticShapeInferenceTest, ceil_torch_mode_1) { + const auto data = std::make_shared(element::f64, PartialShape::dynamic()); + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const Shape pads_begin{1, 1}; + const Shape pads_end{1, 1}; + const Shape kernel_shape{2, 2}; + const auto rounding_mode = op::RoundingType::CEIL_TORCH; + + op = make_op(data, strides, dilations, pads_begin, pads_end, kernel_shape, rounding_mode); + this->input_shapes = ShapeVector{{1, 3, 5, 5}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + EXPECT_THAT(this->output_shapes, Each(StaticShape({1, 3, 3, 3}))); +} - input_shapes = ShapeVector{{1, 3, 10, 12}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); +TEST_F(MaxPoolV14StaticShapeInferenceTest, ceil_torch_mode_2) { + const auto data = std::make_shared(element::f64, PartialShape::dynamic()); + const Strides strides{2, 2}; + const Strides dilations{1, 1}; + const Shape pads_begin{1, 1}; + const Shape pads_end{1, 1}; + const Shape kernel_shape{2, 2}; + const auto rounding_mode = op::RoundingType::CEIL_TORCH; + + op = make_op(data, strides, dilations, pads_begin, pads_end, kernel_shape, rounding_mode); + this->input_shapes = ShapeVector{{1, 3, 9, 9}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + EXPECT_THAT(this->output_shapes, Each(StaticShape({1, 3, 5, 5}))); +} + +template +class MaxPoolCommonStaticShapeInferenceTest : public OpStaticShapeInferenceTest {}; + +TYPED_TEST_SUITE_P(MaxPoolCommonStaticShapeInferenceTest); - EXPECT_EQ(output_shapes.size(), 2); - EXPECT_THAT(output_shapes, Each(StaticShape({1, 3, 6, 11}))); +TYPED_TEST_P(MaxPoolCommonStaticShapeInferenceTest, default_ctor) { + this->op = this->make_op(); + this->op->set_strides({1, 1}); + this->op->set_pads_begin({2, 2}); + this->op->set_pads_end({2, 1}); + this->op->set_kernel({3, 2}); + this->op->set_dilations({2, 1}); + this->op->set_rounding_type(op::RoundingType::FLOOR); + this->op->set_auto_pad(op::PadType::VALID); + + this->input_shapes = ShapeVector{{1, 3, 10, 12}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + + 
EXPECT_EQ(this->output_shapes.size(), 2); + EXPECT_THAT(this->output_shapes, Each(StaticShape({1, 3, 6, 11}))); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({0, 0})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({0, 0})); } -TEST_F(MaxPoolV8StaticShapeInferenceTest, no_dilation) { +TYPED_TEST_P(MaxPoolCommonStaticShapeInferenceTest, no_dilation) { const auto data = std::make_shared(element::f64, PartialShape{-1, -1, -1, -1}); const Strides strides{1, 1}; @@ -122,20 +163,20 @@ TEST_F(MaxPoolV8StaticShapeInferenceTest, no_dilation) { const ov::Shape pads_end{0, 0}; const ov::Shape kernel_shape{2, 2}; - op = make_op(data, strides, dilations, pads_begin, pads_end, kernel_shape); + this->op = this->make_op(data, strides, dilations, pads_begin, pads_end, kernel_shape); - input_shapes = ShapeVector{{2, 3, 13, 13}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + this->input_shapes = ShapeVector{{2, 3, 13, 13}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - EXPECT_EQ(output_shapes.size(), 2); - EXPECT_THAT(output_shapes, Each(StaticShape({2, 3, 13, 13}))); + EXPECT_EQ(this->output_shapes.size(), 2); + EXPECT_THAT(this->output_shapes, Each(StaticShape({2, 3, 13, 13}))); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({1, 1})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({0, 0})); } -TEST_F(MaxPoolV8StaticShapeInferenceTest, with_dilations) { +TYPED_TEST_P(MaxPoolCommonStaticShapeInferenceTest, with_dilations) { const auto data = std::make_shared(element::f64, PartialShape::dynamic()); const Strides strides{1, 1}; @@ -144,15 +185,23 @@ TEST_F(MaxPoolV8StaticShapeInferenceTest, with_dilations) { const ov::Shape pads_end{1, 1}; const ov::Shape kernel_shape{2, 2}; - op = make_op(data, strides, dilations, pads_begin, pads_end, kernel_shape); + this->op = this->make_op(data, strides, dilations, pads_begin, pads_end, kernel_shape); - input_shapes = ShapeVector{{2, 4, 13, 13}}; - auto shape_infer = make_shape_inference(op); - const auto input_shape_refs = make_static_shape_refs(input_shapes); - output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); + this->input_shapes = ShapeVector{{2, 4, 13, 13}}; + auto shape_infer = make_shape_inference(this->op); + const auto input_shape_refs = make_static_shape_refs(this->input_shapes); + this->output_shapes = *shape_infer->infer(input_shape_refs, make_tensor_accessor()); - EXPECT_EQ(output_shapes.size(), 2); - EXPECT_THAT(output_shapes, Each(StaticShape({2, 4, 12, 11}))); + EXPECT_EQ(this->output_shapes.size(), 2); + EXPECT_THAT(this->output_shapes, Each(StaticShape({2, 4, 12, 11}))); EXPECT_EQ(shape_infer->get_pads_begin(), CoordinateDiff({0, 0})); EXPECT_EQ(shape_infer->get_pads_end(), CoordinateDiff({1, 1})); } + +REGISTER_TYPED_TEST_SUITE_P(MaxPoolCommonStaticShapeInferenceTest, + default_ctor, + no_dilation, + with_dilations); + +using MaxPoolOpTypes = Types; +INSTANTIATE_TYPED_TEST_SUITE_P(StaticShapeInferenceTest, MaxPoolCommonStaticShapeInferenceTest, MaxPoolOpTypes); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp index e7a7a7287d5e05..d5464d6f1d244d 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp @@ -156,4 +156,29 @@ struct non_max_suppression : public primitive_base { ib >> make_data(&rotation, sizeof(rotation)); } }; + +struct non_max_suppression_gather : primitive_base { + CLDNN_DECLARE_PRIMITIVE(non_max_suppression_gather) + + /// @brief Constructs non_max_suppression_gather primitive. + /// @param id This primitive id. + /// @param inputs Input primitives ids. + non_max_suppression_gather(const primitive_id& id, + const std::vector& inputs, + const size_t num_outputs = 1) + : primitive_base(id, inputs, {padding()}, {optional_data_type()}, num_outputs) {} + + size_t hash() const override { + size_t seed = primitive::hash(); + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) { + return false; + } + + return true; + } +}; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index f71857791e0a1c..3a2cb04341ee15 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -17,6 +17,7 @@ #include "gemm_inst.h" #include "condition_inst.h" #include "loop_inst.h" +#include "group_normalization_inst.h" #include "program_node.h" #include @@ -111,9 +112,11 @@ void compile_graph::run(program& p) { bool is_planar = format::is_default_format(node->get_output_layout().format); - if ((node->is_dynamic() && !is_planar && - (!node->is_type() || (node->is_type() && node->get_output_layout().format != cldnn::format::b_fs_yx_fsv16)))) { - can_select_impl = false; + if (node->is_dynamic() && !is_planar) { + if (!(node->is_type() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16) && + !(node->is_type() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16)) { + can_select_impl = false; + } } if (node->is_type() || node->is_type() || node->is_type()) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp index bd0edaa1b58861..61d89ac398169c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp @@ -12,17 +12,18 @@ using namespace cldnn; -static bool eltwise_supports_fusings(eltwise_node& node) { - auto out_layout = node.get_output_layout(); - // This condition refers to optimizied kernel EltwiseKernel_fs_b_yx_fsv32 - if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 && out_layout.format == format::fs_b_yx_fsv32) { - return false; - } +void fuse_primitives_with_layout::run(program& p) { + auto eltwise_supports_fusings = [&](eltwise_node& node) -> bool { + auto out_layout = node.get_output_layout(); + // This condition refers to optimizied kernel EltwiseKernel_fs_b_yx_fsv32 + if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 && + (_lo.get_optimization_attributes().fs_b_yx_fsv32_network || out_layout.format == format::fs_b_yx_fsv32)) { + return false; + } - return true; -} + return true; + }; -void fuse_primitives_with_layout::run(program& p) { bool need_recalc_processing_order = false; std::map>> fusing_history; @@ -35,7 +36,7 @@ void 
fuse_primitives_with_layout::run(program& p) { continue; // No optimized Eltwise kernel supports fused-operation for fs_b_yx_fsv32 - // Check fusing quantize to eltwsise for this case + // Check fusing quantize to eltwise for this case auto func_fuse_quantize = [&](quantize_node& node) { bool should_fuse = false; auto out_layout = node.get_output_layout(); @@ -49,7 +50,6 @@ void fuse_primitives_with_layout::run(program& p) { return; should_fuse |= input_node.is_type() && eltwise_supports_fusings(input_node.as()); - if (!should_fuse) return; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp index e432248ac46669..1a22b47212b2b1 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp @@ -4,6 +4,7 @@ #include "pass_manager.h" #include "gather_inst.h" +#include "non_max_suppression_inst.h" #include "permute_inst.h" #include "strided_slice_inst.h" #include "kv_cache_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 6c6a41a948f212..46ed489725f4a4 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -39,6 +39,7 @@ #include "embedding_bag_inst.h" #include "extract_image_patches_inst.h" #include "reduce_inst.h" +#include "group_normalization_inst.h" #include #include #include @@ -694,6 +695,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { should_fuse |= input.is_type(); + should_fuse |= input.is_type(); + should_fuse |= input.is_type() && data_type_traits::is_i8_u8(input.get_input_layout(0).data_type); should_fuse |= input.is_type(); @@ -891,6 +894,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { conv_supports_fusings(parents[i].first->as())) || (parents[i].first->is_type() && mvn_supports_fusings(parents[i].first->as(), true)) || + (parents[i].first->is_type()) || (parents[i].first->is_type()) || (parents[i].first->is_type()) || (parents[i].first->is_type()) || diff --git a/src/plugins/intel_gpu/src/graph/group_normalization.cpp b/src/plugins/intel_gpu/src/graph/group_normalization.cpp index 56ce5de52520e7..d9d359f339bcac 100644 --- a/src/plugins/intel_gpu/src/graph/group_normalization.cpp +++ b/src/plugins/intel_gpu/src/graph/group_normalization.cpp @@ -12,12 +12,13 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(group_normalization) layout group_normalization_inst::calc_output_layout(group_normalization_node const& node, kernel_impl_params const& impl_param) { assert(static_cast(impl_param.desc->output_data_types[0]) == false && "Output data type forcing is not supported for group_normalization_node!"); - auto output_layout = impl_param.get_input_layout(); + auto input_node_layout = impl_param.get_non_padded_input_layout(); + auto output_type = impl_param.desc->output_data_types[0].value_or(input_node_layout.data_type); if (impl_param.has_fused_primitives()) - output_layout.data_type = impl_param.get_output_element_type(); + output_type = impl_param.get_output_element_type(); - return output_layout; + return layout(output_type, input_node_layout.format, input_node_layout.get_tensor()); } std::string group_normalization_inst::to_string(group_normalization_node const& 
node) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp index f4793699de4120..f38efcd5c0d30c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp @@ -440,6 +440,58 @@ attach_non_max_suppression_impl::attach_non_max_suppression_impl() { } } // namespace detail + +struct non_max_suppression_gather_impl : typed_primitive_impl { + using parent = typed_primitive_impl; + + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::non_max_suppression_gather_impl) + + std::unique_ptr clone() const override { + return make_unique(*this); + } + + non_max_suppression_gather_impl() : parent("non_max_suppression_gather_impl") {} + + event::ptr execute_impl(const std::vector& events, typed_primitive_inst& instance) override { + auto& stream = instance.get_network().get_stream(); + + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + + if (!pass_through_events) { + for (auto e : events) { + e->wait(); + } + } + + if (pass_through_events) { + if (events.size() > 1) { + return stream.group_events(events); + } else if (events.size() == 1) { + return events[0]; + } + } + + return stream.create_user_event(true); + } + + static std::unique_ptr create(const non_max_suppression_gather_node&, const kernel_impl_params&) { + return make_unique(); + } + void init_kernels(const kernels_cache&, const kernel_impl_params&) override {} +}; + +namespace detail { + +attach_non_max_suppression_gather_impl::attach_non_max_suppression_gather_impl() { + implementation_map::add(impl_types::cpu, non_max_suppression_gather_impl::create, { + std::make_tuple(data_types::i32, format::bfyx), + std::make_tuple(data_types::f16, format::bfyx), + std::make_tuple(data_types::f32, format::bfyx), + }); +} + +} // namespace detail + } // namespace cpu } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp index c70b39cc9de7f1..2b0dc5b212158c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp @@ -16,6 +16,7 @@ void register_implementations() { REGISTER_CPU(proposal); REGISTER_CPU(read_value); REGISTER_CPU(non_max_suppression); + REGISTER_CPU(non_max_suppression_gather); REGISTER_CPU(shape_of); REGISTER_CPU(concatenation); REGISTER_CPU(gather); diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp index aaa56678d08ca1..cb89eae29d8c56 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp @@ -40,6 +40,7 @@ REGISTER_CPU(assign); REGISTER_CPU(proposal); REGISTER_CPU(read_value); REGISTER_CPU(non_max_suppression); +REGISTER_CPU(non_max_suppression_gather); REGISTER_CPU(detection_output); REGISTER_CPU(shape_of); REGISTER_CPU(concatenation); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp index d79e47e8a114e0..10b33897cb51e8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/group_normalization.cpp @@ -65,6 +65,15 @@ attach_group_normalization_impl::attach_group_normalization_impl() { 
typed_primitive_impl_ocl::create, types, formats); + + const std::vector dyn_formats { + format::bfyx, + format::b_fs_yx_fsv16, + }; + + implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, + typed_primitive_impl_ocl::create, + types, dyn_formats); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index e8ce9ddef9a254..619797c1f78432 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -23,7 +23,6 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::onednn::fully_connected_onednn) private: - memory::ptr _zp_mem; // OneDNN needs broadcasted zp. This is to hold the memory pointer. int _ds_group_size; dnnl::memory::data_type _ds_data_type; dnnl::memory::data_type _dzp_data_type; @@ -69,11 +68,9 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, scale_mem->get_onednn_memory(desc)}); } - if (!prim->decompression_zero_point.empty() || prim->decompression_zero_point_scalar.has_value()) { - // If _zp_mem is not set in primitive, use the one from primitive_inst. - // It happens when broadcasting is not necessary. + if (!prim->decompression_zero_point.empty()) { auto decompression_zp_idx = prim->bias.empty() ? 3 : 4; - auto zp_mem = _zp_mem != nullptr ? _zp_mem : instance.dep_memory_ptr(decompression_zp_idx); + auto zp_mem = instance.dep_memory_ptr(decompression_zp_idx); dnnl::memory::desc desc = onednn::layout_to_memory_desc(zp_mem->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, zp_mem->get_onednn_memory(desc)}); } @@ -282,12 +279,20 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { if (has_decompression_zp) { ib >> make_data(&_dzp_data_type, sizeof(dnnl::memory::data_type)); - _zp_mem = prepare_zp_mem(impl_params->get_program().get_node(impl_params->desc->id), - *impl_params, - is_four_bit_weight, - _ds_group_size, - _dzp_data_type, - _attrs); + auto& arg = impl_params->get_program().get_node(impl_params->desc->id).as(); + auto decompression_zp_idx = !arg.bias_term() ? 
3 : 4; + auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); + + if (dzp_layout.count() == 1) { + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, _dzp_data_type); + } else { + auto ngroups = dzp_layout.get_dim(1); + if (ngroups == 1) { + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, _dzp_data_type); + } else { + _attrs->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {_ds_group_size, 1}, _dzp_data_type); + } + } } if (is_compressed) { @@ -307,75 +312,11 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { #endif } - static memory::ptr prepare_zp_mem(const fully_connected_node& arg, const kernel_impl_params& impl_params, - bool is_four_bit_weight, int group_size, dnnl::memory::data_type &dzp_data_type, - std::shared_ptr attr) { - auto& engine = impl_params.prog->get_engine(); - auto prim = impl_params.typed_desc(); - memory::ptr zp_mem(nullptr); - - auto mem_fill = [](stream &stream, memory::ptr mem, uint8_t val) { - mem_lock data(mem, stream); - memset(data.data(), val, data.size()); - }; - - auto get_broadcasted_layout_zp = [](const fully_connected_node& arg) { - auto decompression_scale_idx = !arg.bias_term() ? 2 : 3; // it assumes we have decompress_scale - auto &scale_node = arg.get_dependency(decompression_scale_idx); - auto broadcasted_layout = scale_node.get_output_layout(); - broadcasted_layout.data_type = data_types::u8; - return broadcasted_layout; - }; - - if (prim->decompression_zero_point_scalar.has_value()) { - // TODO: we may improve this logic by using common weight instead of broadcasted one - auto& stream = engine.get_service_stream(); - auto broadcasted_layout_zp = get_broadcasted_layout_zp(arg); - dzp_data_type = convert_data_type(broadcasted_layout_zp.data_type); - zp_mem = engine.allocate_memory(broadcasted_layout_zp, false); - mem_fill(stream, zp_mem, static_cast(std::round(prim->decompression_zero_point_scalar.value()))); - - if (!is_four_bit_weight) { - attr->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, dzp_data_type); - } else { - attr->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, dzp_data_type); - } - } else if (!prim->decompression_zero_point.empty()) { - auto decompression_zp_idx = !arg.bias_term() ? 3 : 4; - auto &zp_node = arg.get_dependency(decompression_zp_idx).as(); - memory::ptr zp_old_mem = zp_node.get_attached_memory_ptr(); - - if (!is_four_bit_weight) { - // 8-bit quantized weight - dzp_data_type = convert_data_type(arg.get_dependency(decompression_zp_idx).get_output_layout().data_type); - attr->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, dzp_data_type); - } else { - // OneDNN does not support scalar zero-point for s4 and u8 type. Need to broadcast it. 
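Note on the mask values used in the replacement code above: with the four-argument set_zero_points(arg, mask, groups, dt) call, mask 0 requests a single common zero point for the whole weights tensor, 1 << 1 requests one zero point per output channel (dimension 1 of the weights), and (1 << 1) + (1 << 0) additionally groups dimension 0 with a group of _ds_group_size elements. The stand-alone C++ sketch below only mirrors that selection logic for illustration; the helper name zp_mask_for is hypothetical and not part of this patch.

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative sketch: pick the (mask, groups) pair the same way the new code does,
// based on the shape of the decompression zero-point tensor.
// zp_count == 1      -> one common zero point (mask 0)
// ngroups == 1       -> one zero point per output channel (mask 1 << 1)
// otherwise          -> grouped zero points along dim 0 (mask (1 << 1) + (1 << 0))
std::pair<int, std::vector<std::int64_t>> zp_mask_for(std::int64_t zp_count,
                                                      std::int64_t ngroups,
                                                      std::int64_t group_size) {
    if (zp_count == 1)
        return {0, {}};
    if (ngroups == 1)
        return {1 << 1, {}};
    return {(1 << 1) + (1 << 0), {group_size, 1}};
}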
- auto broadcasted_layout_zp = get_broadcasted_layout_zp(arg); - dzp_data_type = convert_data_type(broadcasted_layout_zp.data_type); - - if (zp_node.get_output_layout().get_linear_size() == 1) { - zp_mem = engine.allocate_memory(broadcasted_layout_zp, false); - auto& stream = engine.get_service_stream(); - mem_lock zp_old_data(zp_old_mem, stream); - mem_fill(stream, zp_mem, static_cast(zp_old_data.data()[0] & 0xf)); - } - - OPENVINO_ASSERT(broadcasted_layout_zp.get_linear_size() == zp_mem->get_layout().get_linear_size(), - "[GPU] Size mismatch between zp and scale for compressed FC\n"); - - attr->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, dzp_data_type); - } - } - return zp_mem; - } - static std::unique_ptr create(const fully_connected_node& arg, const kernel_impl_params& impl_params) { auto& engine = impl_params.prog->get_engine(); auto& config = impl_params.prog->get_config(); auto attr = impl_params.attrs_onednn; auto prim = impl_params.typed_desc(); - memory::ptr zp_mem(nullptr); int group_size = 0; dnnl::memory::data_type ds_data_type = dnnl::memory::data_type::undef; dnnl::memory::data_type dzp_data_type = dnnl::memory::data_type::undef; @@ -402,14 +343,27 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } } - if (prim->decompression_zero_point_scalar.has_value() || !prim->decompression_zero_point.empty()) - zp_mem = prepare_zp_mem(arg, impl_params, is_four_bit_weight, group_size, dzp_data_type, attr); + if (!prim->decompression_zero_point.empty()) { + auto decompression_zp_idx = !arg.bias_term() ? 3 : 4; + auto dzp_layout = arg.get_dependency(decompression_zp_idx).get_output_layout(); + dzp_data_type = convert_data_type(dzp_layout.data_type); + + if (dzp_layout.count() == 1) { + attr->set_zero_points(DNNL_ARG_WEIGHTS, 0, dnnl::memory::dims{}, dzp_data_type); + } else { + auto ngroups = dzp_layout.get_dim(1); + if (ngroups == 1) { + attr->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, dzp_data_type); + } else { + attr->set_zero_points(DNNL_ARG_WEIGHTS, (1 << 1) + (1 << 0), {group_size, 1}, dzp_data_type); + } + } + } auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), prim->input_size, !prim->bias.empty(), *attr); auto prim_onednn = cldnn::make_unique(engine, config, attr, *prim_desc); - prim_onednn->_zp_mem = zp_mem; prim_onednn->_ds_group_size = group_size; prim_onednn->_ds_data_type = ds_data_type; prim_onednn->_dzp_data_type = dzp_data_type; diff --git a/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h b/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h index 250708f97cf858..5af957d94a3273 100644 --- a/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h @@ -186,4 +186,44 @@ class typed_primitive_inst : public typed_primitive_inst_ba using non_max_suppression_inst = typed_primitive_inst; +template <> +struct typed_program_node : typed_program_node_base { + using parent = typed_program_node_base; + using parent::parent; + +public: + typed_program_node(const std::shared_ptr prim, program& prog) : parent(prim, prog) { + can_be_optimized(true); + set_runtime_skippable(true); + } + + bool generates_dynamic_output() const override { + return true; + } + + std::vector get_shape_infer_dependencies() const override { return {0, 1, 2}; } +}; + +using non_max_suppression_gather_node = typed_program_node; + +template <> +class typed_primitive_inst : public 
typed_primitive_inst_base { +public: + using parent = typed_primitive_inst_base; + using parent::parent; + + static layout calc_output_layout(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param); + template + static std::vector calc_output_layouts(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param); + static std::string to_string(const non_max_suppression_gather_node& node); + + typed_primitive_inst(network& network, non_max_suppression_gather_node const& node); + void update_output_memory() override; + +private: + void on_execute() override; +}; + +using non_max_suppression_gather_inst = typed_primitive_inst; + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 764f91893b855c..d128e7f4bbae43 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -397,10 +397,12 @@ class mark_runtime_skippable_nodes : public base_pass { class fuse_primitives_with_layout : public base_pass { public: - fuse_primitives_with_layout() : base_pass("fuse_primitives_with_layout") {} + explicit fuse_primitives_with_layout(layout_optimizer& lo_ref) : + base_pass("fuse_primitives_with_layout"), _lo(lo_ref) {} private: void run(program& p) override; + layout_optimizer& _lo; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 07c66b3b983c54..6759eda9f5cb46 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -932,8 +932,10 @@ static bool is_node_for_onednn(fully_connected_node const& node) { if (!fc_prim->decompression_zero_point.empty()) { auto decompression_zp_idx = fc_prim->bias.empty() ? 
3 : 4; auto decompression_zp_dt = node.get_input_layout(decompression_zp_idx).data_type; - if (weights_dt != decompression_zp_dt) + if ((weights_dt != ov::element::Type_t::u4 && weights_dt != ov::element::Type_t::u8) || + (decompression_zp_dt != ov::element::Type_t::u8 && decompression_zp_dt != ov::element::Type_t::i8)) { return false; + } auto input_dt = node.get_input_layout(0).data_type; if (input_dt == data_types::f32) @@ -1568,6 +1570,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format } } } + } else if (node.is_type()) { + return impl_types::cpu; } else if (node.is_type()) { if (!_optimization_attributes.use_onednn_impls) return impl_types::ocl; diff --git a/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp index 29a707ea53d3d9..00fdc5dc6db31e 100644 --- a/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp @@ -11,6 +11,10 @@ #include "nms_shape_inference.hpp" namespace cldnn { + +// ----------------------------------------------- +// non_max_suppression +// ----------------------------------------------- GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression) layout non_max_suppression_inst::calc_output_layout(non_max_suppression_node const& node, kernel_impl_params const& impl_param) { @@ -81,4 +85,79 @@ std::string non_max_suppression_inst::to_string(non_max_suppression_node const& return description.str(); } +// ----------------------------------------------- +// non_max_suppression_gather +// ----------------------------------------------- +GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression_gather) + +layout non_max_suppression_gather_inst::calc_output_layout(non_max_suppression_gather_node const& node, kernel_impl_params const& impl_param) { + OPENVINO_THROW("Only calc_output_layouts should be used!"); +} + +template +std::vector non_max_suppression_gather_inst::calc_output_layouts(non_max_suppression_gather_node const& /*node*/, + const kernel_impl_params& impl_param) { + std::vector layouts; + + auto desc = impl_param.typed_desc(); + std::vector output_shapes = { ShapeType{}, ShapeType{}, ShapeType{} }; + + auto& memory_deps = impl_param.memory_deps; + if (memory_deps.count(2)) { + auto third_output = memory_deps.at(2); + cldnn::mem_lock third_output_lock(third_output, impl_param.get_stream()); + auto third_output_data = third_output_lock.data(); + + output_shapes[0] = ShapeType{third_output_data[0], 3}; + } else { + output_shapes[0] = ShapeType{ov::Dimension::dynamic(), 3}; + } + output_shapes[1] = output_shapes[0]; + output_shapes[2] = ShapeType{1}; + + for (size_t i = 0; i < desc->num_outputs; ++i) { + layouts.push_back({output_shapes[i], + impl_param.get_input_layout(i).data_type, + format::get_default_format(output_shapes[i].size())}); + } + return layouts; +} + +template std::vector non_max_suppression_gather_inst::calc_output_layouts(non_max_suppression_gather_node const& node, + const kernel_impl_params& impl_param); + +std::string non_max_suppression_gather_inst::to_string(non_max_suppression_gather_node const& node) { + auto desc = node.get_primitive(); + auto node_info = node.desc_to_json(); + + json_composite info; + + node_info->add("non max suppression gather info", info); + + std::stringstream description; + node_info->dump(description); + return description.str(); +} + +void non_max_suppression_gather_inst::on_execute() { + update_output_memory(); +} + +void 
non_max_suppression_gather_inst::update_output_memory() { + if (!can_be_optimized()) + return; + + for (size_t i = 0; i < inputs_memory_count(); i++) { + if (node->get_program().is_new_shape_infer() && input_memory_ptr(i) == nullptr) + return; + + if (output_memory_ptr(i) != nullptr && _network.get_engine().is_the_same_buffer(output_memory(i), input_memory(i))) + return; + + _outputs[i] = {_network.get_engine().reinterpret_buffer(input_memory(i), _impl_params->get_output_layout(i))}; + } +} + +non_max_suppression_gather_inst::typed_primitive_inst(network& network, non_max_suppression_gather_node const& node) : parent(network, node) {} + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 460428477281ea..efe2a5c56c844e 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -586,19 +586,44 @@ event::ptr primitive_inst::realloc_if_needed() { user_insts.size(), " and ", user_insts_origin.size()); } for (auto user : user_insts) { + auto is_fused_prim_of_user = [&](primitive_id id) -> bool { + for (auto& p : user->get_node().get_fused_primitives()) { + if (p.has_outer_dep()) { + const auto start_idx = p.outer_dep_start_idx; + // exclude fused_node from total_num_deps + const auto end_idx = p.outer_dep_start_idx + p.total_num_deps - 1; + for (size_t idx = start_idx; idx < end_idx; idx++) { + if (user->get_node().get_dependency(idx).id() == id) { + return true; + } + } + } + } + return false; + }; // Since fake alignment is applicable for input tensor as well, make sure we allocate enough memory // to prevent reading beyond the allocated memory bounds - if (user->get_node().is_type() && user->is_dynamic() && user->_deps[0].first == this) { - GPU_DEBUG_TRACE_DETAIL << "Check fc user " << user->id() << "'s fake alignment-ed input size" << std::endl; - user->update_shape(); - user->update_shape_done_by_other = true; - - auto fc_impl_params = *user->_impl_params; - auto fc_input_layout = user->get_node().type()->get_fake_aligned_params(fc_impl_params).input_layouts[0]; - if (fc_input_layout.bytes_count() > updated_layout.bytes_count()) { - GPU_DEBUG_TRACE_DETAIL << id() << ": increase output layout allocation size from " << actual_layout.to_short_string() << " -> " - << fc_input_layout.to_short_string() << " to meet the input buffer alignment requirements for FC\n"; - updated_layout = fc_input_layout; + if (user->get_node().is_type() && user->is_dynamic()) { + if (user->_deps[0].first == this + || (is_fused_prim_of_user(id()) && user->update_shape_done_by_other)) { + GPU_DEBUG_TRACE_DETAIL << "Check fc user " << user->id() << "'s fake alignment-ed input size" << std::endl; + // Set update_shape_done_by_other to false before running update_shape, + // since update_shape was already called in realloc_if_needed of the current node's dep node, + // but the current node's output layout has not been propagated to this user node yet. + user->update_shape_done_by_other = false; + bool prev_shape_changed = user->shape_changed(); + user->update_shape(); + // Restore the shape_change status if the shape was changed in the previous update_shape() for this user node.
+ if (prev_shape_changed) + user->set_shape_change(); + user->update_shape_done_by_other = true; + auto fc_impl_params = *user->_impl_params; + auto fc_input_layout = user->get_node().type()->get_fake_aligned_params(fc_impl_params).input_layouts[0]; + if (fc_input_layout.bytes_count() > updated_layout.bytes_count()) { + GPU_DEBUG_TRACE_DETAIL << id() << ": increase output layout allocation size from " << actual_layout.to_short_string() << " -> " + << fc_input_layout.to_short_string() << " to meet the input buffer alignment requirements for FC\n"; + updated_layout = fc_input_layout; + } } } } diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 1c526ea38188d2..cdceed4ae39687 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -593,7 +593,7 @@ void program::pre_optimize_graph(bool is_internal) { // Check fusing primitives based on preferred format or layout optimization if (optimize_data) { - apply_opt_pass(); + apply_opt_pass(lo); } // add optimization attributes for onednn primitives @@ -1496,6 +1496,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::broadcast::type_id() && prim.type() != cldnn::ctc_loss::type_id() && prim.type() != cldnn::non_max_suppression::type_id() && + prim.type() != cldnn::non_max_suppression_gather::type_id() && prim.type() != cldnn::roi_align::type_id() && prim.type() != cldnn::matrix_nms::type_id() && prim.type() != cldnn::adaptive_pooling::type_id() && @@ -1548,6 +1549,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::quantize::type_id() && prim.type() != cldnn::ctc_loss::type_id() && prim.type() != cldnn::non_max_suppression::type_id() && + prim.type() != cldnn::non_max_suppression_gather::type_id() && prim.type() != cldnn::roi_align::type_id() && prim.type() != cldnn::matrix_nms::type_id() && prim.type() != cldnn::adaptive_pooling::type_id() && diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl new file mode 100644 index 00000000000000..1c71b9c7c7093d --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl @@ -0,0 +1,182 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/sub_group_block_read.cl" + +#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN +REQD_SUB_GROUP_SIZE(SIMD) +KERNEL(calc_mean_per_feature)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global ACCUMULATOR_TYPE* internal_mean +) { + const uint data_set_idx = get_global_id(1); // batch * feature split + const uint in_data_set_idx = get_global_id(0); + #if IS_DYNAMIC + const uint workers_per_dataset = get_local_size(0) / FSV; // 16 datasets are handled by one local workgroup + #else + const uint workers_per_dataset = WORKERS_PER_DATASET; + #endif + const uint data_set_size = INPUT0_SIZE_X * INPUT0_SIZE_Y; + const uint items_num = data_set_size / workers_per_dataset; + const uint leftovers = data_set_size - (items_num * workers_per_dataset); + + const uint INPUT0_ALIGNED_FEATURE_NUM = ALIGN(INPUT0_FEATURE_NUM, FSV); + const uint b = (data_set_idx * FSV) / INPUT0_ALIGNED_FEATURE_NUM; + const uint f_base = (data_set_idx * FSV) % INPUT0_ALIGNED_FEATURE_NUM; + const 
uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0); + const uint my_data_offset = data_set_offset + in_data_set_idx; + + __local ACCUMULATOR_TYPE mean_per_feature[SLM_SIZE]; + + ACCUMULATOR_TYPE mean = ACCUMULATOR_VAL_ZERO; + + for (uint i = 0; i < items_num; ++i) { + mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]); + } + + if (in_data_set_idx < leftovers) { + mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]); + } + + mean_per_feature[in_data_set_idx] = mean; + const uint num_local_workers = get_local_size(0); + const uint worker_block_idx = in_data_set_idx / 16; + uint reduce_add_level = 1; + while ((SLM_SIZE / SIMD) > reduce_add_level) { + barrier(CLK_LOCAL_MEM_FENCE); + if (worker_block_idx % (reduce_add_level * 2) == 0 && (in_data_set_idx + SIMD * reduce_add_level) < num_local_workers) { + mean_per_feature[in_data_set_idx] += mean_per_feature[in_data_set_idx + SIMD * reduce_add_level]; + } + reduce_add_level *= 2; + } + + if (worker_block_idx == 0 && (f_base + in_data_set_idx) < INPUT0_FEATURE_NUM) { + mean = mean_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size); + uint bf = b * INPUT0_FEATURE_NUM + f_base + in_data_set_idx; + internal_mean[bf] = mean; + } +} +#elif GROUP_NORM_KERNEL_GROUP_MEAN +KERNEL(calc_mean_per_group)( + __global ACCUMULATOR_TYPE* internal_mean +) { + const uint data_idx = get_global_id(0) + get_global_id(1) * get_global_size(0); + const uint group_size = get_local_size(0); + + ACCUMULATOR_TYPE mean = work_group_reduce_add(internal_mean[data_idx]); + mean /= TO_ACCUMULATOR_TYPE(group_size); + internal_mean[data_idx] = mean; +} +#elif GROUP_NORM_KERNEL_FEATURE_VAR +REQD_SUB_GROUP_SIZE(SIMD) +KERNEL(calc_var_per_feature)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global ACCUMULATOR_TYPE* internal_mean, + __global ACCUMULATOR_TYPE* internal_variance +) { + const uint data_set_idx = get_global_id(1); // batch * feature split + const uint in_data_set_idx = get_global_id(0); + #if IS_DYNAMIC + const uint workers_per_dataset = get_local_size(0) / FSV; // 16 datasets are handled by one local workgroup + #else + const uint workers_per_dataset = WORKERS_PER_DATASET; + #endif + const uint data_set_size = INPUT0_SIZE_X * INPUT0_SIZE_Y; + const uint items_num = data_set_size / workers_per_dataset; + const uint leftovers = data_set_size - (items_num * workers_per_dataset); + + const uint INPUT0_ALIGNED_FEATURE_NUM = ALIGN(INPUT0_FEATURE_NUM, FSV); + const uint b = (data_set_idx * FSV) / INPUT0_ALIGNED_FEATURE_NUM; + const uint f_base = (data_set_idx * FSV) % INPUT0_ALIGNED_FEATURE_NUM; + const uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0); + const uint my_data_offset = data_set_offset + in_data_set_idx; + + __local ACCUMULATOR_TYPE var_per_feature[SLM_SIZE]; + + uint bf = b * INPUT0_FEATURE_NUM + f_base + get_sub_group_local_id(); + + ACCUMULATOR_TYPE mean = internal_mean[bf]; + ACCUMULATOR_TYPE variance = ACCUMULATOR_VAL_ZERO; + + for (uint i = 0; i < items_num; ++i) { + ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]); + tmp -= mean; + variance = fma(tmp, tmp, variance); + } + + if (in_data_set_idx < leftovers) { + ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]); + tmp -= mean; + variance = fma(tmp, tmp, variance); + } + + var_per_feature[in_data_set_idx] = variance; + const uint worker_block_idx = 
in_data_set_idx / 16; + uint reduce_add_level = 1; + while ((SLM_SIZE / SIMD) > reduce_add_level) { + barrier(CLK_LOCAL_MEM_FENCE); + if (worker_block_idx % (reduce_add_level * 2) == 0) { + var_per_feature[in_data_set_idx] += var_per_feature[in_data_set_idx + SIMD * reduce_add_level]; + } + reduce_add_level *= 2; + } + + if (worker_block_idx == 0 && (f_base + get_sub_group_local_id()) < INPUT0_FEATURE_NUM) { + variance = var_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size); + internal_variance[bf] = variance; + } +} +#elif GROUP_NORM_KERNEL_GROUP_VAR +KERNEL(calc_var_per_group)( + __global ACCUMULATOR_TYPE* internal_variance +) { + const uint data_idx = get_global_id(0) + get_global_id(1) * get_global_size(0); + const uint group_size = get_local_size(0); + + ACCUMULATOR_TYPE variance = work_group_reduce_add(internal_variance[data_idx]); + variance /= TO_ACCUMULATOR_TYPE(group_size); + variance = native_powr(variance + TO_ACCUMULATOR_TYPE(EPSILON), -0.5f); + internal_variance[data_idx] = variance; +} +#elif GROUP_NORM_KERNEL_FINAL +REQD_SUB_GROUP_SIZE(SIMD) +KERNEL(group_normalization_b_fs_yx_fsv16)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + const __global INPUT1_TYPE* scale, + const __global INPUT2_TYPE* bias, + __global OUTPUT_TYPE* restrict output, +#if HAS_FUSED_OPS_DECLS + FUSED_OPS_DECLS, +#endif + __global ACCUMULATOR_TYPE* internal_mean, + __global ACCUMULATOR_TYPE* internal_variance +) { + const uint bf = get_global_id(1) * FSV + get_sub_group_local_id(); + const uint b = bf / OUTPUT_FEATURE_NUM; + const uint f = bf % OUTPUT_FEATURE_NUM; + const uint yx = get_global_id(0) / FSV; + const uint y = yx / OUTPUT_SIZE_X; + const uint x = yx % OUTPUT_SIZE_X; + const uint data_index = OUTPUT_GET_INDEX(b, f, y, x); + + if (f < OUTPUT_FEATURE_NUM) { + ACTIVATION_TYPE mean = TO_ACTIVATION_TYPE(internal_mean[bf]); + ACTIVATION_TYPE variance = TO_ACTIVATION_TYPE(internal_variance[bf]); + ACTIVATION_TYPE normalized = (TO_ACTIVATION_TYPE(input[data_index]) - mean) * variance; + normalized = normalized * TO_ACTIVATION_TYPE(scale[f]) + TO_ACTIVATION_TYPE(bias[f]); + #if HAS_FUSED_OPS + FUSED_OPS; + output[data_index] = FUSED_OPS_RESULT; + #else + output[data_index] = TO_OUTPUT_TYPE(ACTIVATION(normalized, ACTIVATION_PARAMS)); + #endif + } else { + output[data_index] = OUTPUT_VAL_ZERO; + } +} +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl new file mode 100644 index 00000000000000..6763b78aebe36b --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl @@ -0,0 +1,224 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/sub_group_block_read.cl" + +#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN +KERNEL(calc_mean_per_feature)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global ACCUMULATOR_TYPE* internal_mean +) { + const uint bf = get_global_id(2); // batch * feature + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; + + #if IS_DYNAMIC + const uint y_num_workers = get_local_size(1); + const uint x_num_workers = get_local_size(0); + #else + const uint y_num_workers = Y_NUM_WORKERS; + const uint x_num_workers = X_NUM_WORKERS; + #endif + const uint y_block_size = INPUT0_SIZE_Y / y_num_workers; + const 
uint y_base = get_local_id(1) * y_block_size; + const uint y_leftover = INPUT0_SIZE_Y - y_num_workers * y_block_size; + + const uint x_block_size = INPUT0_SIZE_X / x_num_workers; + const uint x_base = get_local_id(0); + const uint x_leftover = INPUT0_SIZE_X - x_num_workers * x_block_size; + + const uint num_local_workers = y_num_workers * x_num_workers; + const uint worker_idx = get_local_linear_id(); + + __local ACCUMULATOR_TYPE mean_per_feature[SLM_SIZE]; + + ACCUMULATOR_TYPE mean = ACCUMULATOR_VAL_ZERO; + + for (uint y = y_base; y < (y_base + y_block_size); ++y) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, y, x_base); + for (uint i = 0; i < x_block_size; ++i) { + mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]); + } + } + + if (get_local_id(1) < y_leftover) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size), x_base); + for (uint i = 0; i < x_block_size; ++i) { + mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]); + } + } + + if (get_local_id(0) < x_leftover) { + for (uint y = y_base; y < (y_base + y_block_size); ++y) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, y, (get_local_id(0) + x_num_workers * x_block_size)); + mean += TO_ACCUMULATOR_TYPE(input[my_data_offset]); + } + } + + if (get_local_id(1) < y_leftover && get_local_id(0) < x_leftover) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size), + (get_local_id(0) + x_num_workers * x_block_size)); + mean += TO_ACCUMULATOR_TYPE(input[my_data_offset]); + } + + mean_per_feature[worker_idx] = mean; + uint reduce_add_level = 1; + while (num_local_workers > reduce_add_level) { + barrier(CLK_LOCAL_MEM_FENCE); + if (worker_idx % (reduce_add_level * 2) == 0 && (worker_idx + reduce_add_level) < num_local_workers) { + mean_per_feature[worker_idx] += mean_per_feature[worker_idx + reduce_add_level]; + } + reduce_add_level *= 2; + } + + if (worker_idx == 0) { + mean = mean_per_feature[0] / TO_ACCUMULATOR_TYPE(INPUT0_SIZE_Y * INPUT0_SIZE_X); + internal_mean[bf] = mean; + } +} +#elif GROUP_NORM_KERNEL_GROUP_MEAN +KERNEL(calc_mean_per_group)( + __global ACCUMULATOR_TYPE* internal_mean +) { + const uint data_idx = get_global_id(0) + get_global_id(1) * get_global_size(0); + const uint group_size = get_local_size(0); + + ACCUMULATOR_TYPE mean = work_group_reduce_add(internal_mean[data_idx]); + mean /= TO_ACCUMULATOR_TYPE(group_size); + internal_mean[data_idx] = mean; +} +#elif GROUP_NORM_KERNEL_FEATURE_VAR +KERNEL(calc_var_per_feature)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global ACCUMULATOR_TYPE* internal_mean, + __global ACCUMULATOR_TYPE* internal_variance +) { + const uint bf = get_global_id(2); // batch * feature + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; + + #if IS_DYNAMIC + const uint y_num_workers = get_local_size(1); + const uint x_num_workers = get_local_size(0); + #else + const uint y_num_workers = Y_NUM_WORKERS; + const uint x_num_workers = X_NUM_WORKERS; + #endif + const uint y_block_size = INPUT0_SIZE_Y / y_num_workers; + const uint y_base = get_local_id(1) * y_block_size; + const uint y_leftover = INPUT0_SIZE_Y - y_num_workers * y_block_size; + + const uint x_block_size = INPUT0_SIZE_X / x_num_workers; + const uint x_base = get_local_id(0); + const uint x_leftover = INPUT0_SIZE_X - x_num_workers * x_block_size; + + __local ACCUMULATOR_TYPE var_per_feature[SLM_SIZE]; + + const ACCUMULATOR_TYPE mean = internal_mean[bf]; + 
ACCUMULATOR_TYPE variance = ACCUMULATOR_VAL_ZERO; + + for (uint y = y_base; y < (y_base + y_block_size); ++y) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, y, x_base); + for (uint i = 0; i < x_block_size; ++i) { + ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]); + tmp -= mean; + variance = fma(tmp, tmp, variance); + } + } + + if (get_local_id(1) < y_leftover) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size), x_base); + for (uint i = 0; i < x_block_size; ++i) { + ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]); + tmp -= mean; + variance = fma(tmp, tmp, variance); + } + } + + if (get_local_id(0) < x_leftover) { + for (uint y = y_base; y < (y_base + y_block_size); ++y) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, y, (get_local_id(0) + x_num_workers * x_block_size)); + ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset]); + tmp -= mean; + variance = fma(tmp, tmp, variance); + } + } + + if (get_local_id(1) < y_leftover && get_local_id(0) < x_leftover) { + uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size), + (get_local_id(0) + x_num_workers * x_block_size)); + ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset]); + tmp -= mean; + variance = fma(tmp, tmp, variance); + } + + const uint num_local_workers = y_num_workers * x_num_workers; + const uint worker_idx = get_local_linear_id(); + + var_per_feature[worker_idx] = variance; + uint reduce_add_level = 1; + while (num_local_workers > reduce_add_level) { + barrier(CLK_LOCAL_MEM_FENCE); + if (worker_idx % (reduce_add_level * 2) == 0 && (worker_idx + reduce_add_level) < num_local_workers) { + var_per_feature[worker_idx] += var_per_feature[worker_idx + reduce_add_level]; + } + reduce_add_level *= 2; + } + + if (worker_idx == 0) { + variance = var_per_feature[0] / TO_ACCUMULATOR_TYPE(INPUT0_SIZE_Y * INPUT0_SIZE_X); + internal_variance[bf] = variance; + } +} +#elif GROUP_NORM_KERNEL_GROUP_VAR +KERNEL(calc_var_per_group)( + __global ACCUMULATOR_TYPE* internal_variance +) { + const uint data_idx = get_global_id(0) + get_global_id(1) * get_global_size(0); + const uint group_size = get_local_size(0); + + ACCUMULATOR_TYPE variance = work_group_reduce_add(internal_variance[data_idx]); + variance /= TO_ACCUMULATOR_TYPE(group_size); + variance = native_powr(variance + TO_ACCUMULATOR_TYPE(EPSILON), -0.5f); + internal_variance[data_idx] = variance; +} +#elif GROUP_NORM_KERNEL_FINAL +KERNEL(group_normalization_b_fs_yx_fsv16)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + const __global INPUT1_TYPE* scale, + const __global INPUT2_TYPE* bias, + __global OUTPUT_TYPE* restrict output, +#if HAS_FUSED_OPS_DECLS + FUSED_OPS_DECLS, +#endif + __global ACCUMULATOR_TYPE* internal_mean, + __global ACCUMULATOR_TYPE* internal_variance +) { + const uint bf = get_global_id(1); + const uint b = bf / OUTPUT_FEATURE_NUM; + const uint f = bf % OUTPUT_FEATURE_NUM; + const uint yx = get_global_id(0); + const uint y = yx / OUTPUT_SIZE_X; + const uint x = yx % OUTPUT_SIZE_X; + + const uint input_data_index = INPUT0_GET_INDEX(b, f, y, x); + + ACTIVATION_TYPE mean = TO_ACTIVATION_TYPE(internal_mean[bf]); + ACTIVATION_TYPE variance = TO_ACTIVATION_TYPE(internal_variance[bf]); + ACTIVATION_TYPE normalized = (TO_ACTIVATION_TYPE(input[input_data_index]) - mean) * variance; + normalized = normalized * TO_ACTIVATION_TYPE(scale[f]) + TO_ACTIVATION_TYPE(bias[f]); + + const uint 
output_data_index = OUTPUT_GET_INDEX(b, f, y, x); + #if HAS_FUSED_OPS + FUSED_OPS; + output[output_data_index] = FUSED_OPS_RESULT; + #else + output[output_data_index] = TO_OUTPUT_TYPE(ACTIVATION(normalized, ACTIVATION_PARAMS)); + #endif +} +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index 408cfc2b5e7719..d0aba8554eccc7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -77,6 +77,7 @@ enum class KernelType { EXTRACT_IMAGE_PATCHES, LOOP, NON_MAX_SUPPRESSION, + NON_MAX_SUPPRESSION_GATHER, DETECTION_OUTPUT, EXPERIMENTAL_DETECTRON_DETECTION_OUTPUT, EXPERIMENTAL_DETECTRON_GENERATE_PROPOSALS_SINGLE_IMAGE, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp new file mode 100644 index 00000000000000..78ba86cbf64d80 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp @@ -0,0 +1,292 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "group_normalization_kernel_b_fs_yx_fsv16.h" +#include "kernel_selector_utils.h" + +namespace kernel_selector { + +static constexpr size_t fsv = 16; +static constexpr size_t simd = fsv; + +ParamsKey GroupNormalizationKernel_b_fs_yx_fsv16::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableInputLayout(DataLayout::b_fs_yx_fsv16); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); + k.EnableBatching(); + k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); + return k; +} + +GroupNormalizationKernelBase::MultiDispatchData GroupNormalizationKernel_b_fs_yx_fsv16::SetDefault(const group_normalization_params ¶ms) const { + MultiDispatchData dispatchData; + + if (!params.has_dynamic_tensors()) { + const auto& input = params.inputs[0]; + + dispatchData.stage_1.gws[0] = input.X().v * input.Y().v * fsv; + dispatchData.stage_1.gws[1] = CeilDiv(input.Feature().v, fsv) * input.Batch().v; + dispatchData.stage_1.gws[2] = 1; + + dispatchData.stage_1.lws[0] = simd; + dispatchData.stage_1.lws[1] = 1; + dispatchData.stage_1.lws[2] = 1; + + while ((dispatchData.stage_1.lws[0] * 2) <= params.engineInfo.maxWorkGroupSize && + (dispatchData.stage_1.lws[0] * 2) <= dispatchData.stage_1.gws[0]) { + if (dispatchData.stage_1.gws[0] % (dispatchData.stage_1.lws[0] * 2) == 0) { + dispatchData.stage_1.lws[0] *= 2; + } else { + break; + } + } + dispatchData.stage_1.gws[0] = dispatchData.stage_1.lws[0]; + + dispatchData.stage_2.gws[0] = input.Feature().v; + dispatchData.stage_2.gws[1] = input.Batch().v; + dispatchData.stage_2.gws[2] = 1; + + dispatchData.stage_2.lws[0] = input.Feature().v / params.num_groups; + dispatchData.stage_2.lws[1] = 1; + dispatchData.stage_2.lws[2] = 1; + + dispatchData.stage_final.gws[0] = input.X().v * input.Y().v * fsv; + dispatchData.stage_final.gws[1] = CeilDiv(input.Feature().v, fsv) * input.Batch().v; + dispatchData.stage_final.gws[2] = 1; + + 
dispatchData.stage_final.lws[0] = simd; + dispatchData.stage_final.lws[1] = 1; + dispatchData.stage_final.lws[2] = 1; + + while ((dispatchData.stage_final.lws[0] * 2) <= params.engineInfo.maxWorkGroupSize && + (dispatchData.stage_final.lws[0] * 2) <= dispatchData.stage_final.gws[0]) { + if (dispatchData.stage_final.gws[0] % (dispatchData.stage_final.lws[0] * 2) == 0) { + dispatchData.stage_final.lws[0] *= 2; + } else { + break; + } + } + } + + return dispatchData; +} + +JitConstants GroupNormalizationKernel_b_fs_yx_fsv16::GetJitConstants(const group_normalization_params ¶ms, + GroupNormalizationKernelBase::DispatchData dispatchData) const { + auto jit = GroupNormalizationKernelBase::GetJitConstants(params); + + jit.AddConstants({ + MakeJitConstant("SIMD", 16), + MakeJitConstant("FSV", 16), + }); + + if (params.has_dynamic_tensors()) { + jit.AddConstants({ + MakeJitConstant("SLM_SIZE", params.engineInfo.maxWorkGroupSize), + }); + } else { + jit.AddConstants({ + MakeJitConstant("WORKERS_PER_DATASET", dispatchData.lws[0] / fsv), + MakeJitConstant("SLM_SIZE", dispatchData.lws[0]), + }); + } + auto activation_dt = GetActivationType(params); + jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + std::vector idx_order; + if (params.inputs[0].GetDims().size() <= 4) { + idx_order = { "(b)", "(f)", "(y)", "(x)" }; + } else { + OPENVINO_THROW("group_normalization_b_fs_yx_fsv16 doesn't support 5D or higher dims."); + } + auto conf = FusedOpsConfiguration("", idx_order, "normalized", activation_dt, 1); + jit.Merge(MakeFusedOpsJitConstants(params, { conf })); + } + + return jit; +} + +void GroupNormalizationKernel_b_fs_yx_fsv16::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + kd.kernels[0].params.workGroups.global = dispatchData.stage_1.gws; + kd.kernels[0].params.workGroups.local = dispatchData.stage_1.lws; + kd.kernels[0].skip_execution = KernelData::SkipKernelExecution(prim_params, 0); + + kd.kernels[1].params.workGroups.global = dispatchData.stage_2.gws; + kd.kernels[1].params.workGroups.local = dispatchData.stage_2.lws; + kd.kernels[1].skip_execution = KernelData::SkipKernelExecution(prim_params, 1); + + kd.kernels[2].params.workGroups.global = dispatchData.stage_1.gws; + kd.kernels[2].params.workGroups.local = dispatchData.stage_1.lws; + kd.kernels[2].skip_execution = KernelData::SkipKernelExecution(prim_params, 2); + + kd.kernels[3].params.workGroups.global = dispatchData.stage_2.gws; + kd.kernels[3].params.workGroups.local = dispatchData.stage_2.lws; + kd.kernels[3].skip_execution = KernelData::SkipKernelExecution(prim_params, 3); + + kd.kernels[4].params.workGroups.global = dispatchData.stage_final.gws; + kd.kernels[4].params.workGroups.local = dispatchData.stage_final.lws; + kd.kernels[4].skip_execution = KernelData::SkipKernelExecution(prim_params, 4); + + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4); + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4); + }; +} + +KernelsData GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsData(const Params ¶ms) const { + assert(params.GetType() == 
KernelType::GROUP_NORMALIZATION); + + if (!Validate(params)) + return {}; + + const group_normalization_params& prim_params = static_cast(params); + + MultiDispatchData dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params, 5); + kd.internalBufferDataType = GetAccumulatorType(prim_params); + GetUpdateDispatchDataFunc(kd); + + auto finalKernelName = GetKernelName(prim_params); + size_t entry_part_id = 0; + + { + // Mean first stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_MEAN", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 1, + 0, + 0, + prim_params.is_shape_agnostic); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + if (!prim_params.has_dynamic_tensors()) { + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4); + } + } + { + // Mean second stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_MEAN", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[1]; + FillCLKernelData(kernel, + dispatchData.stage_2, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + } + { + // Variance first stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_VAR", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[2]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 1, + 0, + 0, + prim_params.is_shape_agnostic); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + if (!prim_params.has_dynamic_tensors()) { + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4); + } + } + { + // Variance second stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_VAR", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[3]; + FillCLKernelData(kernel, + dispatchData.stage_2, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + } + { + // final stage + auto cldnn_jit = GetJitConstants(prim_params, 
dispatchData.stage_final); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FINAL", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[4]; + FillCLKernelData(kernel, + dispatchData.stage_final, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 3, + GetFusedPrimitiveInputsCount(params), + 1, + prim_params.is_shape_agnostic); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + } + + return {kd}; +} + +KernelsPriority GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_4; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.h new file mode 100644 index 00000000000000..406d01c32ea6ae --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.h @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "group_normalization_kernel_base.h" + +namespace kernel_selector { +class GroupNormalizationKernel_b_fs_yx_fsv16 : public GroupNormalizationKernelBase { +public: + using Parent = GroupNormalizationKernelBase; + + GroupNormalizationKernel_b_fs_yx_fsv16() : GroupNormalizationKernelBase{"group_normalization_gpu_b_fs_yx_fsv16"} {} + virtual ~GroupNormalizationKernel_b_fs_yx_fsv16() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + std::vector GetSupportedFusedOps() const override { + return { + FusedOpType::ACTIVATION, + FusedOpType::QUANTIZE, + FusedOpType::ELTWISE + }; + } + MultiDispatchData SetDefault(const group_normalization_params& params) const; + JitConstants GetJitConstants(const group_normalization_params& params, GroupNormalizationKernelBase::DispatchData dispatchData) const; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_base.cpp new file mode 100644 index 00000000000000..e3bea4caecc6dc --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_base.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "group_normalization_kernel_base.h" +#include + +namespace kernel_selector { + +bool GroupNormalizationKernelBase::Validate(const Params& params) const { + const group_normalization_params& orgParams = static_cast(params); + + for (auto& fused_op : orgParams.fused_ops) { + if (!IsFusedPrimitiveSupported(fused_op)) + return false; + } + + return true; +} + +JitConstants GroupNormalizationKernelBase::GetJitConstants(const group_normalization_params& params) const { + JitConstants jit = 
MakeBaseParamsJitConstants(params); + + jit.AddConstants({ + MakeJitConstant("EPSILON", static_cast(params.epsilon)), + MakeJitConstant("NUM_GROUPS", params.num_groups) + }); + + return jit; +} + +Datatype GroupNormalizationKernelBase::GetActivationType(const group_normalization_params& params) const { + if (params.inputs[0].GetDType() == Datatype::F16) + return Datatype::F16; + return Datatype::F32; +} + +Datatype GroupNormalizationKernelBase::GetAccumulatorType(const group_normalization_params& params) const { + const auto& input_dt = params.inputs[0].GetDType(); + + switch (input_dt) { + case Datatype::INT8: + case Datatype::UINT8: + return Datatype::INT32; + default: + return Datatype::F32; + } +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_base.h new file mode 100644 index 00000000000000..a2787dfea98f22 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_base.h @@ -0,0 +1,54 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// GroupNormalizationParams +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +struct group_normalization_params : public base_params { + group_normalization_params() : base_params(KernelType::GROUP_NORMALIZATION) {} + + std::int64_t num_groups = 1; + double epsilon = 0.0f; + + ParamsKey GetParamsKey() const override { + return base_params::GetParamsKey(); + } +}; + +class GroupNormalizationKernelBase : public KernelBaseOpenCL { +public: + using KernelBaseOpenCL::KernelBaseOpenCL; + virtual ~GroupNormalizationKernelBase() {} + + struct DispatchData : public CommonDispatchData { + size_t itemsNum; + size_t leftovers; + size_t dataSetsCount; + size_t dataSetSize; + size_t maxSlmSize; + + DispatchData() : itemsNum(0), leftovers(0), dataSetsCount(0), dataSetSize(0), maxSlmSize(0) {} + }; + + struct MultiDispatchData { + DispatchData stage_1; + DispatchData stage_2; + DispatchData stage_final; + + size_t item_groups; + }; + +protected: + bool Validate(const Params&) const override; + JitConstants GetJitConstants(const group_normalization_params& params) const; + std::string GetKernelName(const group_normalization_params&) const { return kernelName; } + Datatype GetActivationType(const group_normalization_params& params) const; + Datatype GetAccumulatorType(const group_normalization_params& params) const; +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp new file mode 100644 index 00000000000000..fc855bab7b0360 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp @@ -0,0 +1,297 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "group_normalization_kernel_bfyx_opt.h" +#include "kernel_selector_utils.h" + +namespace 
kernel_selector { +ParamsKey GroupNormalizationKernelBfyx::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableBatching(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); + return k; +} + +GroupNormalizationKernelBase::MultiDispatchData GroupNormalizationKernelBfyx::SetDefault(const group_normalization_params ¶ms) const { + MultiDispatchData dispatchData; + + if (!params.has_dynamic_tensors()) { + const auto& input = params.inputs[0]; + + dispatchData.stage_1.gws[0] = input.X().v; + dispatchData.stage_1.gws[1] = input.Y().v; + dispatchData.stage_1.gws[2] = input.Feature().v * input.Batch().v; + + dispatchData.stage_1.lws[0] = input.X().v; + dispatchData.stage_1.lws[1] = input.Y().v; + dispatchData.stage_1.lws[2] = 1; + + if ((input.X().v * input.Y().v) > params.engineInfo.maxWorkGroupSize) { + if (input.Y().v > params.engineInfo.maxWorkGroupSize) { + dispatchData.stage_1.lws[0] = 1; + for (size_t lws = 2; lws <= input.Y().v; ++lws) { + if (input.Y().v % lws == 0 && (input.Y().v / lws) <= params.engineInfo.maxWorkGroupSize) { + dispatchData.stage_1.lws[1] = input.Y().v / lws; + break; + } + } + } else { + for (size_t lws = 2; lws <= input.X().v; ++lws) { + if (input.X().v % lws == 0 && (input.X().v / lws * input.Y().v) <= params.engineInfo.maxWorkGroupSize) { + dispatchData.stage_1.lws[0] = input.X().v / lws; + break; + } + } + } + } + dispatchData.stage_1.gws[0] = dispatchData.stage_1.lws[0]; + dispatchData.stage_1.gws[1] = dispatchData.stage_1.lws[1]; + + dispatchData.stage_2.gws[0] = input.Feature().v; + dispatchData.stage_2.gws[1] = input.Batch().v; + dispatchData.stage_2.gws[2] = 1; + + dispatchData.stage_2.lws[0] = input.Feature().v / params.num_groups; + dispatchData.stage_2.lws[1] = 1; + dispatchData.stage_2.lws[2] = 1; + + dispatchData.stage_final.gws[0] = input.X().v * input.Y().v; + dispatchData.stage_final.gws[1] = input.Feature().v * input.Batch().v; + dispatchData.stage_final.gws[2] = 1; + + dispatchData.stage_final.lws[0] = 1; + dispatchData.stage_final.lws[1] = 1; + dispatchData.stage_final.lws[2] = 1; + + while ((dispatchData.stage_final.lws[0] * 2) <= params.engineInfo.maxWorkGroupSize && + (dispatchData.stage_final.lws[0] * 2) <= dispatchData.stage_final.gws[0]) { + if (dispatchData.stage_final.gws[0] % (dispatchData.stage_final.lws[0] * 2) == 0) { + dispatchData.stage_final.lws[0] *= 2; + } else { + break; + } + } + } + + return dispatchData; +} + +JitConstants GroupNormalizationKernelBfyx::GetJitConstants(const group_normalization_params ¶ms, + GroupNormalizationKernelBase::DispatchData dispatchData) const { + auto jit = GroupNormalizationKernelBase::GetJitConstants(params); + + if (params.has_dynamic_tensors()) { + jit.AddConstants({ + MakeJitConstant("SLM_SIZE", params.engineInfo.maxWorkGroupSize), + }); + } else { + jit.AddConstants({ + MakeJitConstant("SLM_SIZE", (dispatchData.lws[0] * dispatchData.lws[1])), + MakeJitConstant("Y_NUM_WORKERS", dispatchData.lws[1]), + MakeJitConstant("X_NUM_WORKERS", dispatchData.lws[0]), + }); + } + auto activation_dt = GetActivationType(params); + 
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + std::vector idx_order; + if (params.inputs[0].GetDims().size() <= 4) { + idx_order = { "(b)", "(f)", "(y)", "(x)" }; + } else { + OPENVINO_THROW("group_normalization_bfyx doesn't support 5D or higher dims."); + } + auto conf = FusedOpsConfiguration("", idx_order, "normalized", activation_dt, 1); + jit.Merge(MakeFusedOpsJitConstants(params, { conf })); + } + + return jit; +} + +void GroupNormalizationKernelBfyx::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + kd.kernels[0].params.workGroups.global = dispatchData.stage_1.gws; + kd.kernels[0].params.workGroups.local = dispatchData.stage_1.lws; + kd.kernels[0].skip_execution = KernelData::SkipKernelExecution(prim_params, 0); + + kd.kernels[1].params.workGroups.global = dispatchData.stage_2.gws; + kd.kernels[1].params.workGroups.local = dispatchData.stage_2.lws; + kd.kernels[1].skip_execution = KernelData::SkipKernelExecution(prim_params, 1); + + kd.kernels[2].params.workGroups.global = dispatchData.stage_1.gws; + kd.kernels[2].params.workGroups.local = dispatchData.stage_1.lws; + kd.kernels[2].skip_execution = KernelData::SkipKernelExecution(prim_params, 2); + + kd.kernels[3].params.workGroups.global = dispatchData.stage_2.gws; + kd.kernels[3].params.workGroups.local = dispatchData.stage_2.lws; + kd.kernels[3].skip_execution = KernelData::SkipKernelExecution(prim_params, 3); + + kd.kernels[4].params.workGroups.global = dispatchData.stage_final.gws; + kd.kernels[4].params.workGroups.local = dispatchData.stage_final.lws; + kd.kernels[4].skip_execution = KernelData::SkipKernelExecution(prim_params, 4); + + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4); + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4); + }; +} + +KernelsData GroupNormalizationKernelBfyx::GetKernelsData(const Params ¶ms) const { + assert(params.GetType() == KernelType::GROUP_NORMALIZATION); + + if (!Validate(params)) + return {}; + + const group_normalization_params& prim_params = static_cast(params); + + MultiDispatchData dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params, 5); + kd.internalBufferDataType = GetAccumulatorType(prim_params); + GetUpdateDispatchDataFunc(kd); + + auto finalKernelName = GetKernelName(prim_params); + size_t entry_part_id = 0; + + { + // Mean first stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_MEAN", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 1, + 0, + 0, + prim_params.is_shape_agnostic); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + if (!prim_params.has_dynamic_tensors()) { + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4); 
+ } + } + { + // Mean second stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_MEAN", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[1]; + FillCLKernelData(kernel, + dispatchData.stage_2, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + } + { + // Variance first stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_VAR", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[2]; + FillCLKernelData(kernel, + dispatchData.stage_1, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 1, + 0, + 0, + prim_params.is_shape_agnostic); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + if (!prim_params.has_dynamic_tensors()) { + kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4); + } + } + { + // Variance second stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_VAR", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[3]; + FillCLKernelData(kernel, + dispatchData.stage_2, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 0, + 0); + kernel.params.arguments.clear(); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + } + { + // final stage + auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_final); + cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FINAL", 1)); + auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++); + auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point); + auto& kernel = kd.kernels[4]; + FillCLKernelData(kernel, + dispatchData.stage_final, + params.engineInfo, + finalKernelName, + jit, + entry_point, + "", + false, + false, + 3, + GetFusedPrimitiveInputsCount(params), + 1, + prim_params.is_shape_agnostic); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + } + + return {kd}; +} + +KernelsPriority GroupNormalizationKernelBfyx::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_7; +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.h new file mode 100644 index 00000000000000..a0f778c10d1a1d --- /dev/null +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.h @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#pragma once + +#include "group_normalization_kernel_base.h" + +namespace kernel_selector { +class GroupNormalizationKernelBfyx : public GroupNormalizationKernelBase { +public: + using Parent = GroupNormalizationKernelBase; + + GroupNormalizationKernelBfyx() : GroupNormalizationKernelBase{"group_normalization_gpu_bfyx_opt"} {} + virtual ~GroupNormalizationKernelBfyx() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + std::vector GetSupportedFusedOps() const override { + return { + FusedOpType::ACTIVATION, + FusedOpType::QUANTIZE, + FusedOpType::ELTWISE + }; + } + MultiDispatchData SetDefault(const group_normalization_params& params) const; + JitConstants GetJitConstants(const group_normalization_params& params, GroupNormalizationKernelBase::DispatchData dispatchData) const; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp index ad531c5c47b55e..88447ae9447356 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -75,9 +75,8 @@ GroupNormalizationKernelRef::DispatchData GroupNormalizationKernelRef::SetDefaul JitConstants GroupNormalizationKernelRef::GetJitConstants(KernelId kernelId, const group_normalization_params ¶ms) const { - auto jit = MakeBaseParamsJitConstants(params); - jit.AddConstant(MakeJitConstant("EPSILON", static_cast(params.epsilon))); - jit.AddConstant(MakeJitConstant("NUM_GROUPS", params.num_groups)); + auto jit = GroupNormalizationKernelBase::GetJitConstants(params); + switch (kernelId) { case eCalcMeanKernel: jit.AddConstant(MakeJitConstant("MEAN_KERNEL_ENABLED", true)); @@ -167,4 +166,7 @@ KernelsData GroupNormalizationKernelRef::GetKernelsData(const Params ¶ms) co return {kd}; } +KernelsPriority GroupNormalizationKernelRef::GetKernelsPriority(const Params& /*params*/) const { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; +} } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h index 900496cc969187..3bbb1264553234 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_ref.h @@ -1,28 +1,14 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include "kernel_base_opencl.h" -#include "kernel_selector_params.h" -namespace kernel_selector { 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// GroupNormalizationParams -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct group_normalization_params : public base_params { - group_normalization_params() : base_params(KernelType::GROUP_NORMALIZATION) {} - - std::int64_t num_groups{}; - double epsilon{}; +#include "group_normalization_kernel_base.h" - ParamsKey GetParamsKey() const override { - return base_params::GetParamsKey(); - } -}; - -class GroupNormalizationKernelRef : public KernelBaseOpenCL { +namespace kernel_selector { +class GroupNormalizationKernelRef : public GroupNormalizationKernelBase { public: - using DispatchData = CommonDispatchData; + using Parent = GroupNormalizationKernelBase; enum KernelId { eCalcMeanKernel, eCalcStandardDeviationKernel, @@ -30,8 +16,11 @@ class GroupNormalizationKernelRef : public KernelBaseOpenCL { eKernelsNum }; - GroupNormalizationKernelRef() : KernelBaseOpenCL{"group_normalization_gpu_ref"} {} + GroupNormalizationKernelRef() : GroupNormalizationKernelBase{"group_normalization_gpu_ref"} {} + virtual ~GroupNormalizationKernelRef() {} + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; ParamsKey GetSupportedKey() const override; std::vector GetSupportedFusedOps() const override { return { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp index a7b0ced35e0092..7879f8570f5575 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_selector.cpp @@ -1,13 +1,17 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "group_normalization_kernel_selector.h" #include "group_normalization_kernel_ref.h" +#include "group_normalization_kernel_bfyx_opt.h" +#include "group_normalization_kernel_b_fs_yx_fsv16.h" namespace kernel_selector { group_normalization_kernel_selector::group_normalization_kernel_selector() { Attach(); + Attach(); + Attach(); } KernelsData group_normalization_kernel_selector::GetBestKernels(const Params ¶ms) const { diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp index 036e46c9a955a6..30f1ea45d151c3 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp @@ -29,6 +29,7 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share validate_inputs_count(op, {4, 5}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); + auto supports_immad = p.get_engine().get_device_info().supports_immad; const int INPUT_CNT_WITH_ZP = 5; auto input_name = inputs[0].pid; @@ -46,12 +47,14 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share zp_value = zp_const->cast_vector()[0]; } } + + // The decompression zp node should be kept for onednn FC. 
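+    // Without immad support a scalar zero point is not passed as a primitive input; the scalar value
+    // is carried through the primitive's decompression_zero_point_scalar field instead.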
auto fc = cldnn::fully_connected(primitive_name, cldnn::input_info(input_name), weights_name, bias_name, scale_name, - has_scalar_zp ? "" : zp_name, + has_scalar_zp && !supports_immad ? "" : zp_name, cldnn::element_type_to_data_type(op->get_output_element_type(0)), cldnn::padding(), op->get_input_partial_shape(0).size(), diff --git a/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp b/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp index a46f30c418f00a..38c59ba044d404 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp @@ -54,9 +54,9 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh auto boxesShape = op->get_input_partial_shape(0); size_t num_outputs = op->get_output_size(); if (p.use_new_shape_infer()) { - auto nonMaxSuppressionLayerName = layer_type_name_ID(op); + auto NMSLayerName = layer_type_name_ID(op); auto prim = cldnn::non_max_suppression( - nonMaxSuppressionLayerName, + NMSLayerName, reordered_inputs[0], reordered_inputs[1], 0, @@ -78,6 +78,24 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh } p.add_primitive(*op, prim); + + auto NMSGatherLayerName = layer_type_name_ID(op) + "_NMSGather"; + std::vector nms_gather_inputs; + const std::vector nms_gather_input_list = { + cldnn::input_info(NMSLayerName, 0), + cldnn::input_info(NMSLayerName, 1), + cldnn::input_info(NMSLayerName, 2) + }; + for (size_t i = 0; i < num_outputs; i++) { + nms_gather_inputs.push_back(nms_gather_input_list[i]); + } + + auto nms_gather_prim = cldnn::non_max_suppression_gather( + NMSGatherLayerName, + nms_gather_inputs, + num_outputs); + + p.add_primitive(*op, nms_gather_prim); } else { auto outputIndices = op->get_output_partial_shape(0)[0].get_length(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/group_norm_composition.cpp b/src/plugins/intel_gpu/src/plugin/transformations/group_norm_composition.cpp new file mode 100644 index 00000000000000..c298a2cb4bfb8e --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/group_norm_composition.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "group_norm_composition.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/mvn.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/group_normalization.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +GroupNormComposition::GroupNormComposition() { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + + // Detect Group-Normalization decomposition pattern + // y = scale * MVN(x) + bias + auto data_m = any_input(); + auto pre_reshape_const_m = wrap_type(); + auto pre_reshape_m = wrap_type({data_m, pre_reshape_const_m}); + auto axes_const_m = wrap_type(); + auto mvn_m = wrap_type({pre_reshape_m, axes_const_m}); + auto shapeof_m = wrap_type({data_m}); + auto post_reshape_m = wrap_type({mvn_m, shapeof_m}); + auto scale_const_m = wrap_type(); + auto convert_scale_const_m = wrap_type({scale_const_m}); + auto scale_m = std::make_shared(OutputVector{scale_const_m, 
convert_scale_const_m}); + auto mul_m = wrap_type({post_reshape_m, scale_m}); + auto bias_const_m = wrap_type(); + auto convert_bias_const_m = wrap_type({bias_const_m}); + auto bias_m = std::make_shared(OutputVector{bias_const_m, convert_bias_const_m}); + auto add_m = wrap_type({mul_m, bias_m}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto data = pattern_map.at(data_m); + auto data_pshape = data.get_partial_shape(); + // Feature dim should be static. + if (data_pshape[1].is_dynamic()) { + return false; + } + auto feature_dim = data_pshape[1].get_max_length(); + + auto scale = pattern_map.at(scale_const_m); + { + // The total number of elements in scale must be equal to feature_dim. + auto const_scale = std::dynamic_pointer_cast(scale.get_node_shared_ptr()); + auto const_scale_shape = const_scale->get_output_shape(0); + int64_t const_scale_size = 1; + for (auto& dim : const_scale_shape) { + const_scale_size *= dim; + } + if (const_scale_size != feature_dim) { + return false; + } + } + if (pattern_map.count(convert_scale_const_m) != 0) { + scale = pattern_map.at(convert_scale_const_m); + } + auto scale_1d = std::make_shared(scale); + auto bias = pattern_map.at(bias_const_m); + { + // The total number of elements in bias must be equal to feature_dim. + auto const_bias = std::dynamic_pointer_cast(bias.get_node_shared_ptr()); + auto const_bias_shape = const_bias->get_output_shape(0); + int64_t const_bias_size = 1; + for (auto& dim : const_bias_shape) { + const_bias_size *= dim; + } + if (const_bias_size != feature_dim) { + return false; + } + } + if (pattern_map.count(convert_bias_const_m) != 0) { + bias = pattern_map.at(convert_bias_const_m); + } + auto bias_1d = std::make_shared(bias); + + auto pre_reshape = std::dynamic_pointer_cast(pattern_map.at(pre_reshape_m).get_node_shared_ptr()); + auto pre_reshape_pshape = pre_reshape->get_output_partial_shape(0); + auto num_groups = pre_reshape_pshape[1].get_max_length(); + + auto mvn = std::dynamic_pointer_cast(pattern_map.at(mvn_m).get_node_shared_ptr()); + + auto group_norm = std::make_shared(data, scale_1d, bias_1d, num_groups, mvn->get_eps()); + + group_norm->set_friendly_name(m.get_match_root()->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), group_norm); + ov::replace_node(m.get_match_root(), group_norm); + + return true; + }; + + auto m = std::make_shared(add_m, "GroupNormComposition"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/group_norm_composition.hpp b/src/plugins/intel_gpu/src/plugin/transformations/group_norm_composition.hpp new file mode 100644 index 00000000000000..889d0e9ec57e56 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/group_norm_composition.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class GroupNormComposition : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("GroupNormComposition", "0"); + GroupNormComposition(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index c02b14fc5b894c..dbe7e858c1e6fe 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp 
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -70,6 +70,7 @@ #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" +#include "plugin/transformations/group_norm_composition.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" #include "transformations/common_optimizations/broadcast_transition.hpp" @@ -141,6 +142,7 @@ #include "transformations/op_conversions/softmax_decomposition.hpp" #include "transformations/op_conversions/softplus_decomposition.hpp" #include "transformations/op_conversions/scaled_dot_product_attention_decomposition.hpp" +#include "transformations/op_conversions/group_normalization_decomposition.hpp" #include "transformations/opset_conversions/convert_opset2_to_opset1.hpp" #include "transformations/opset_conversions/convert_opset3_to_opset2.hpp" #include "transformations/resolve_names_collisions.hpp" @@ -294,6 +296,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { return !is_type(next_node); }); + manager.register_pass(); + // Disable subtract folding only for the dGPUs to meet the requirements of oneDNN: // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression // types are not supported by oneDNN) @@ -624,6 +628,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); pass_config->disable(); + pass_config->disable(); pass_config->enable(); diff --git a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp index 5745e446ca7bb0..04a5afa459f7c2 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/fully_connected_fusion_test.cpp @@ -306,8 +306,15 @@ TEST_P(fc_compressed_int8_bias_dynamic, basic) { auto p = GetParam(); auto test_input_layout = get_input_layout(p); auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().rank()), test_input_layout.data_type, test_input_layout.format}; + auto supports_immad = this->engine.get_device_info().supports_immad; + + auto dcomp_zp_mem = engine.allocate_memory({ {1, 1, 1, 1}, data_types::f32, format::bfyx }); + set_values(dcomp_zp_mem, {8.0f}); + + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, padding(), get_output_dim_size(p), get_input_weights_rank(p)); - auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), get_output_dim_size(p), get_input_weights_rank(p)); fc_prim.decompression_zero_point_scalar = 8.0f; create_topologies( @@ -315,6 +322,7 @@ TEST_P(fc_compressed_int8_bias_dynamic, basic) { data("weights", get_mem(get_weights_layout(p))), data("scale", get_mem(get_scale_layout(p, 128))), data("bias", get_mem(get_bias_layout(p))), + data("dcomp_zp", dcomp_zp_mem), fc_prim, eltwise("bias_add", { input_info("fc_prim"), input_info("bias") }, eltwise_mode::sum), reorder("reorder_bfyx", input_info("bias_add"), p.default_format, data_types::f32) @@ -599,14 +607,21 @@ TEST_P(fc_compressed_int8_bias_dynamic_onednn, basic) { auto test_input_layout = get_input_layout(p); auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().rank()), test_input_layout.data_type, test_input_layout.format}; - auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), get_output_dim_size(p), get_input_weights_rank(p)); + auto supports_immad = engine.get_device_info().supports_immad; + auto dcomp_zp_name = supports_immad ? "dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, padding(), get_output_dim_size(p), get_input_weights_rank(p)); fc_prim.decompression_zero_point_scalar = 8.0f; + // onednn FC supports scalar ZP for int4 compressed weight. + auto dcomp_zp_layout = layout{ {1, 1, 1, 1}, data_types::u8, format::bfyx }; + create_topologies( input_layout("input", dynamic_input_layout), data("weights", get_mem(get_weights_layout(p))), data("scale", get_mem(get_scale_layout(p, 128))), data("bias", get_mem(get_bias_layout(p))), + data("dcomp_zp", get_mem(dcomp_zp_layout, 8.0f)), fc_prim, eltwise("bias_add", { input_info("fc_prim"), input_info("bias") }, eltwise_mode::sum), reorder("reorder_bfyx", input_info("bias_add"), p.default_format, data_types::f32) diff --git a/src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp b/src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp index b02418a06a05b5..f833fc259c39eb 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp @@ -51,7 +51,9 @@ TEST(fuse_primitives_with_layout, fuse_when_layout_format_of_input_and_output_ar node->set_output_layout(qt_layout, false); } } - program_wrapper::apply_opt_pass(*program); + + layout_optimizer lo(true); + program_wrapper::apply_opt_pass(*program, lo); ASSERT_TRUE(has_node(*program, "quantize")); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index cbe4eb25fef53c..35d65554408252 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1368,6 +1368,7 @@ class fully_connected_gpu_tests: public ::testing::Test { void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) { tests::random_generator rg(GET_SUITE_NAME); 
auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; long int ifm_num = 256; long int ofm_num = 256; @@ -1375,6 +1376,9 @@ class fully_connected_gpu_tests: public ::testing::Test { auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx }); auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx }); + auto dcomp_zp_mem = engine.allocate_memory({ {1, 1, 1, 1}, data_types::u8, format::bfyx }); + + set_values(dcomp_zp_mem, {8}); auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.0f, 2.0f); set_values(input_mem, input_data); @@ -1388,7 +1392,9 @@ class fully_connected_gpu_tests: public ::testing::Test { auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx } : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; - auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), 2, 2); + auto dcomp_zp_name = supports_immad ? "dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, padding(), 2, 2); fc_prim.decompression_zero_point_scalar = 8; @@ -1397,6 +1403,7 @@ class fully_connected_gpu_tests: public ::testing::Test { input_layout("input", in_layout), data("weights", weights_mem), data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), fc_prim ); @@ -1420,6 +1427,7 @@ class fully_connected_gpu_tests: public ::testing::Test { input_layout("input", in_layout), data("weights", weights_mem), data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), fc_prim ); @@ -1456,6 +1464,7 @@ class fully_connected_gpu_tests: public ::testing::Test { void test_compressed_int4_scale_reuse(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; long int ifm_num = 256; long int ofm_num = 256; @@ -1464,6 +1473,9 @@ class fully_connected_gpu_tests: public ::testing::Test { auto weights_mem1 = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); auto weights_mem2 = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx }); + auto dcomp_zp_mem = engine.allocate_memory({ {1, 1, 1, 1}, data_types::u8, format::bfyx }); + + set_values(dcomp_zp_mem, {8}); auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.0f, 2.0f); set_values(input_mem, input_data); @@ -1478,8 +1490,10 @@ class fully_connected_gpu_tests: public ::testing::Test { auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx } : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; - auto fc_prim1 = fully_connected("fc_prim1", input_info("input"), "weights1", "", "scale", "", data_types::f16, padding(), 2, 2); - auto fc_prim2 = fully_connected("fc_prim2", input_info("input"), "weights2", "", "scale", "", data_types::f16, padding(), 2, 2); + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; + + auto fc_prim1 = fully_connected("fc_prim1", input_info("input"), "weights1", "", "scale", dcomp_zp_name, data_types::f16, padding(), 2, 2); + auto fc_prim2 = fully_connected("fc_prim2", input_info("input"), "weights2", "", "scale", dcomp_zp_name, data_types::f16, padding(), 2, 2); fc_prim1.decompression_zero_point_scalar = 8; fc_prim2.decompression_zero_point_scalar = 8; @@ -1490,6 +1504,7 @@ class fully_connected_gpu_tests: public ::testing::Test { data("weights1", weights_mem1), data("weights2", weights_mem2), data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), fc_prim1, fc_prim2 ); @@ -1516,6 +1531,7 @@ class fully_connected_gpu_tests: public ::testing::Test { data("weights1", weights_mem1), data("weights2", weights_mem2), data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), fc_prim1, fc_prim2 ); @@ -1835,6 +1851,7 @@ class fully_connected_gpu_tests: public ::testing::Test { void test_compressed_int8_scale_zp_scalar(bool is_caching_test) { auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; long ifm_num = 6; long ofm_num = 8; @@ -1842,6 +1859,9 @@ class fully_connected_gpu_tests: public ::testing::Test { auto input_mem = engine.allocate_memory({ { 1, ifm_num }, data_types::f16, format::bfyx }); auto weights_mem = engine.allocate_memory({ { ofm_num, ifm_num }, data_types::u8, format::bfyx }); auto scale_mem = engine.allocate_memory({ { ofm_num, 1 }, data_types::f16, format::bfyx }); + auto dcomp_zp_mem = engine.allocate_memory({ {1, 1, 1, 1}, data_types::u8, format::bfyx }); + + set_values(dcomp_zp_mem, {8}); set_values(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f, 0.5f, 2.0f }); set_values(weights_mem, { 0, 1, 2, 3, 4, 5, @@ -1854,13 +1874,17 @@ class fully_connected_gpu_tests: public ::testing::Test { 0, 1, 2, 3, 4, 5 }); set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f }); - auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16); + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16); + fc_prim.decompression_zero_point_scalar = 8; topology topology( input_layout("input", input_mem->get_layout()), data("weights", weights_mem), data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), fc_prim ); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp index ed52f276fa5960..424740f21721c4 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp @@ -16,10 +16,11 @@ using namespace ::tests; namespace { typedef std::tuple< -std::vector, // Input shape -std::size_t, // Number of groups -double, // Epsilon -format // First input layout + std::vector, // Input shape + std::size_t, // Number of groups + double, // Epsilon + format, // First input layout + padding // Output padding > GroupNormalizationParams; @@ -30,7 +31,7 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam input_shape; const auto& params = GetParam(); - std::tie(input_shape, num_groups_, epsilon_, format_) = params; + std::tie(input_shape, num_groups_, epsilon_, format_, output_pad_) = params; std::copy(std::begin(input_shape), std::end(input_shape), std::back_inserter(data_shape_)); tests::random_generator rg{"GroupNormalizationGPUTest"}; data_ = rg.generate_random_1d(ov::shape_size(input_shape), -1, 1); @@ -58,6 +59,7 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam(num_groups_), epsilon_ }; + g.output_paddings = {output_pad_}; tp.add(g); tp.add(reorder{"output", input_info("group_normalization_output"), planar_format, data_types::f32}); @@ -96,6 +98,7 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam f_planar_4d_formats { + format::bfyx, +}; + const std::vector f_blocked_4d_formats { format::b_fs_yx_fsv2, format::b_fs_yx_fsv4, @@ -127,13 +134,23 @@ const std::vector f_blocked_5d_formats { format::b_fs_zyx_fsv32, }; +INSTANTIATE_TEST_SUITE_P( + GroupNormalizationGPUTest_planar_layouts_support_4d, GroupNormalizationGPUTest, + ::testing::Combine( + ::testing::ValuesIn({std::vector{3, 64, 32, 64}, std::vector{3, 124, 97, 61}}), + ::testing::Values(4), + ::testing::Values(0.0025), + ::testing::ValuesIn(f_planar_4d_formats), + ::testing::ValuesIn({padding(), padding({0, 0, 1, 1})}))); + INSTANTIATE_TEST_SUITE_P( GroupNormalizationGPUTest_blocked_layouts_support_4d, GroupNormalizationGPUTest, ::testing::Combine( - ::testing::Values(std::vector{3, 64, 32, 64}), + ::testing::ValuesIn({std::vector{3, 64, 32, 64}, std::vector{3, 124, 97, 61}}), ::testing::Values(4), ::testing::Values(0.0025), - ::testing::ValuesIn(f_blocked_4d_formats))); + ::testing::ValuesIn(f_blocked_4d_formats), + ::testing::Values(padding()))); INSTANTIATE_TEST_SUITE_P( GroupNormalizationGPUTest_blocked_layouts_support_5d, GroupNormalizationGPUTest, @@ -141,6 +158,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(std::vector{3, 64, 28, 32, 12}), ::testing::Values(4), ::testing::Values(0.0025), - ::testing::ValuesIn(f_blocked_5d_formats))); + ::testing::ValuesIn(f_blocked_5d_formats), + ::testing::Values(padding()))); } // anonymous namespace diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp index 
adbb0c029c8bb4..3d7647ee2f53f1 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp @@ -572,6 +572,125 @@ struct non_max_suppression_basic : public testing::Test { } } + void test_nms_gather_score_threshold(bool is_caching_test) { + auto& engine = tests::get_test_engine(); + + auto num_per_class_mem = engine.allocate_memory(layout(data_types::f32, format::bfyx, tensor(batch(1)))); + tests::set_values(num_per_class_mem, {3.f}); + auto iou_threshold_mem = engine.allocate_memory(layout(data_types::f32, format::bfyx, tensor(batch(1)))); + tests::set_values(iou_threshold_mem, {0.4f}); + auto score_threshold_mem = engine.allocate_memory(layout(data_types::f32, format::bfyx, tensor(batch(1)))); + tests::set_values(score_threshold_mem, {0.4f}); + + const auto l_boxes = this->boxes_layout; + const auto l_scores = this->scores_layout; + + topology topo; + topo.add(input_layout("boxes", layout{ov::PartialShape{l_boxes.batch(), l_boxes.feature(), l_boxes.spatial(1)}, l_boxes.data_type, l_boxes.format})); + topo.add(input_layout("scores", layout{ov::PartialShape{l_scores.batch(), l_scores.feature(), l_scores.spatial(1)}, l_scores.data_type, l_scores.format})); + topo.add(data("num_per_class", num_per_class_mem)); + topo.add(data("iou_threshold", iou_threshold_mem)); + topo.add(data("score_threshold", score_threshold_mem)); + topo.add(reorder("reformat_boxes", input_info("boxes"), this->layout_format, this->data_type)); + topo.add(reorder("reformat_scores", input_info("scores"), this->layout_format, this->data_type)); + + auto nms = non_max_suppression("nms", + input_info("reformat_boxes"), + input_info("reformat_scores"), + this->batch_size * this->classes_num * this->boxes_num, + false, + true, + "num_per_class", + "iou_threshold", + "score_threshold", + "", "", "", 3); + auto output_data_type = this->data_type; + nms.output_data_types = {optional_data_type{}, optional_data_type{output_data_type}, optional_data_type{}}; + nms.output_paddings = {padding(), padding(), padding()}; + + topo.add(nms); + topo.add(non_max_suppression_gather("nms_gather", + {input_info("nms", 0), + input_info("nms", 1), + input_info("nms", 2)}, + 3)); + topo.add(reorder("plane_nms0", input_info("nms_gather", 0), format::bfyx, cldnn::data_types::i32)); + topo.add(reorder("plane_nms1", input_info("nms_gather", 1), format::bfyx, this->data_type)); + topo.add(reorder("plane_nms2", input_info("nms_gather", 2), format::bfyx, cldnn::data_types::i32)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + cldnn::network::ptr net = get_network(engine, topo, config, get_test_stream_ptr(), is_caching_test); + + auto boxes_mem = this->get_boxes_memory(engine); + auto scores_mem = this->get_scores_memory(engine); + + net->set_input_data("boxes", boxes_mem); + net->set_input_data("scores", scores_mem); + + auto result = net->execute(); + + // output 0 + std::vector expected_out0 = { + 0, 0, 2, + 0, 1, 0, + 1, 0, 2, + 0, 0, 1, + 1, 0, 1 + }; + + auto out_mem0 = result.at("plane_nms0").get_memory(); + cldnn::mem_lock out0_ptr(out_mem0, get_test_stream()); + + ASSERT_EQ(expected_out0.size(), out0_ptr.size()); + for (size_t i = 0; i < out0_ptr.size(); ++i) { + ASSERT_EQ(expected_out0[i], out0_ptr[i]) << "at i = " << i; + } + + // output 1 + if (this->data_type == cldnn::data_types::f32) { + 
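+            // Each row of the second output is a (batch_index, class_index, score) triple for a kept box.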
std::vector expected_out1 = { + 0.0f, 0.0f, 0.9f, + 0.0f, 1.0f, 0.9f, + 1.0f, 0.0f, 0.8f, + 0.0f, 0.0f, 0.7f, + 1.0f, 0.0f, 0.5f + }; + auto out_mem1 = result.at("plane_nms1").get_memory(); + cldnn::mem_lock out1_ptr(out_mem1, get_test_stream()); + + ASSERT_EQ(expected_out1.size(), out1_ptr.size()); + for (size_t i = 0; i < out1_ptr.size(); ++i) { + ASSERT_EQ(expected_out1[i], out1_ptr[i]) << "at i = " << i; + } + } else if (this->data_type == cldnn::data_types::f16) { + std::vector expected_out1 = { + 0.0f, 0.0f, 0.899902f, + 0.0f, 1.0f, 0.899902f, + 1.0f, 0.0f, 0.799805f, + 0.0f, 0.0f, 0.700195f, + 1.0f, 0.0f, 0.5f + }; + auto out_mem1 = result.at("plane_nms1").get_memory(); + cldnn::mem_lock out1_ptr(out_mem1, get_test_stream()); + + ASSERT_EQ(expected_out1.size(), out1_ptr.size()); + for (size_t i = 0; i < out1_ptr.size(); ++i) { + ASSERT_EQ(expected_out1[i], out1_ptr[i]) << "at i = " << i; + } + } else { + GTEST_FAIL() << "Not supported data type."; + } + + // output 2 + auto out_mem2 = result.at("plane_nms2").get_memory(); + cldnn::mem_lock out2_ptr(out_mem2, get_test_stream()); + ASSERT_EQ(1, out2_ptr.size()); + ASSERT_EQ(5, out2_ptr[0]); + } + void test_soft_nms_sigma(bool is_caching_test) { auto& engine = tests::get_test_engine(); @@ -678,6 +797,10 @@ TYPED_TEST(non_max_suppression_basic, score_threshold) { this->test_score_threshold(false); } +TYPED_TEST(non_max_suppression_basic, nms_gather_score_threshold) { + this->test_nms_gather_score_threshold(false); +} + TYPED_TEST(non_max_suppression_basic, soft_nms_sigma) { this->test_soft_nms_sigma(false); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp index eebaac1683e3ec..376eeca959a370 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp @@ -518,6 +518,135 @@ TEST(quantize_gpu, quantize_levels_256_3d_unsigned) { } } +TEST(quantize_gpu, eltwise_quantize_fs_b_yx_fsv32) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + // conv to enable 'fs_b_yx_fsv32_network' + const int batch_num = 2; + const int input_xy = 5; + const int input_f = 32; + const int output_f = 32; + const int filter_xy = 1; + const int pad = filter_xy / 2; + + auto input_size = tensor(batch_num, input_f, input_xy, input_xy); + auto input_data = rg.generate_random_4d(batch_num, input_f, input_xy, input_xy, -1, 1); + auto input_data_bfyx = flatten_4d(format::bfyx, input_data); + auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size }); + set_values(input_mem, input_data_bfyx); + + auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy); + auto weights_data = rg.generate_random_4d(output_f, input_f, filter_xy, filter_xy, -1, 1); + auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data); + auto weights_mem = engine.allocate_memory({ data_types::f16, format::bfyx, weights_size }); + set_values(weights_mem, weights_data_bfyx); + + topology topology( + input_layout("input_conv", input_mem->get_layout()), + data("weights_fsv", weights_mem)); + + // Reorder input to fs_byx_fsv32 + topology.add(reorder("input_fsv", input_info("input_conv"), { data_types::f16, format::fs_b_yx_fsv32, input_size })); + + topology.add(convolution("conv0", input_info("input_fsv"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv1", input_info("conv0"), 
"weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv2", input_info("conv1"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv3", input_info("conv2"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv4", input_info("conv3"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv5", input_info("conv4"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv6", input_info("conv5"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv7", input_info("conv6"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv8", input_info("conv7"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv9", input_info("conv8"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv10", input_info("conv9"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + topology.add(convolution("conv11", input_info("conv10"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false)); + + topology.add(reorder("reorder_conv", input_info("conv11"), format::b_fs_yx_fsv16, data_types::f32)); + + // eltwise + quantize pattern + auto in_layout = layout{ ov::PartialShape{2, 16, 1, 2}, data_types::f16, format::b_fs_yx_fsv16 }; + auto input = engine.allocate_memory(in_layout); + auto input_low = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 16, 1, 1 } }); + auto input_high = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 16, 1, 1 } }); + auto output_low = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 1, 1 } }); + auto output_high = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 1, 1 } }); + + set_values(input, { -1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 2.0f, 2.0f, 3.0f, + 4.0f, 6.0f, 3.0f, 3.0f, + 3.0f, 5.0f, 1.0f, 1.0f, + + 1.0f, 1.0f, 1.0f, 1.0f, + 4.0f, 6.0f, 3.0f, 3.0f, + 3.0f, 5.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, + + -1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 2.0f, 2.0f, 3.0f, + 4.0f, 6.0f, 3.0f, 3.0f, + 3.0f, 5.0f, 1.0f, 1.0f, + + 1.0f, 1.0f, 1.0f, 1.0f, + 4.0f, 6.0f, 3.0f, 3.0f, + 3.0f, 5.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f }); + + set_values(input_low, { 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f, + 7.0f, 6.0f, 5.0f, 4.0f, + 3.0f, 2.0f, 1.0f, 0.0f }); + set_values(input_high, { 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f, + 7.0f, 6.0f, 5.0f, 4.0f, + 3.0f, 2.0f, 1.0f, 0.0f }); + set_values(output_low, { -1.0f }); + set_values(output_high, { 1.0f }); + + std::vector ref_data = { -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, + -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, + -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1, + -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 1, -1, 1 }; + + topology.add( + input_layout("input1", in_layout), + input_layout("input2", in_layout), + eltwise("multiply", input_info("input1"), input_info("input2"), eltwise_mode::prod), + data("input_low", input_low), + data("input_high", input_high), + data("output_low", output_low), + data("output_high", output_high), + quantize("quantize", input_info("multiply"), input_info("input_low"), input_info("input_high"), 
input_info("output_low"), input_info("output_high"), 2, data_types::f32), + reorder("reorder", input_info("quantize"), format::b_fs_yx_fsv16, data_types::f32) + ); + + ExecutionConfig config = get_test_default_config(engine); + ov::intel_gpu::ImplementationDesc quantize_impl = { format::b_fs_yx_fsv16, "quantize_gpu_ref" }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "quantize", quantize_impl } })); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network network(engine, topology, config); + network.set_input_data("input_conv", input_mem); + network.set_input_data("input1", input); + network.set_input_data("input2", input); + auto outputs = network.execute(); + + auto output = outputs.at("reorder").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + // Check that layout and memory contains logical size of tensor + ASSERT_EQ(output->count(), (size_t)64); + ASSERT_EQ(output->get_layout().count(), (size_t)64); + + ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint32_t)); + + for (size_t i = 0; i < ref_data.size(); ++i) { + ASSERT_EQ(output_ptr[i], ref_data[i]) << " index = " << i; + } +} + TEST(quantize_gpu, dynamic) { auto& engine = get_test_engine(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 596fc94bb7362d..fcee4edeb2e05b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -315,7 +315,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, m_compiled_submodels[real_id].devices_to_avoid.insert(std::move(d)); } } - m_compiled_submodels[real_id].device_it = m_dev_list.cbegin(); + + m_compiled_submodels[id].device_it = + id != real_id ? m_compiled_submodels[real_id].device_it : m_dev_list.cbegin(); if (forced_sub_devices.count(id)) { std::string forced_device = forced_sub_devices[id]; @@ -654,15 +656,94 @@ void ov::npuw::CompiledModel::log_device_dist() const { void ov::npuw::CompiledModel::implement_properties() { // This function fills the map: {`property name`: `getter for property value`}, // that can be used later to return requested properties by user. - // It does it in 4 steps: + // It does it in 3 steps: // - // 1. Create mappings for all NPUW-specific properties to getters of their - // values from config. - // 2. Create mappings for all copied from HETERO plugin properties, to - // their copied implementations. - // 3. Fill `m_all_supported_props` with all properties, mentioned above. + // 1. Create mappings for OV public properties and hints, exposed + // in ::intel_npu::CompiledModel. + // 2. Fill `m_all_supported_props` vector with property names from + // the 1st step. It will be returned as response to `ov::supported_properties` + // request. So the vector will define public properties. + // 3. Create mappings for all remaining (private) NPUW-specific properties + // to getters of their values from config. + +#define GET_PLUGIN_PROP(property) return get_plugin()->get_property(property.name(), ov::AnyMap()); // 1. 
+ // OV Public + // =============================================== + m_prop_to_opt = {{ov::supported_properties.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + return m_all_supported_props; + }}}, + {ov::device::id.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::device::id); + }}}, + {ov::enable_profiling.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::enable_profiling); + }}}, + {ov::model_name.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + return m_name; + }}}, + {ov::optimal_number_of_infer_requests.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + return 1u; + }}}, + {ov::execution_devices.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + return "NPU"; + }}}, + {ov::loaded_from_cache.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + return m_loaded_from_cache; + }}}, + // OV Public Hints + // ===================================================== + {ov::hint::performance_mode.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::hint::performance_mode); + }}}, + {ov::hint::execution_mode.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::hint::execution_mode); + }}}, + {ov::hint::num_requests.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::hint::num_requests); + }}}, + {ov::hint::inference_precision.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::hint::inference_precision); + }}}, + {ov::hint::enable_cpu_pinning.name(), + {ov::PropertyMutability::RO, + [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::hint::enable_cpu_pinning); + }}}, + {ov::hint::model_priority.name(), {ov::PropertyMutability::RO, [&](const ::intel_npu::Config&) { + GET_PLUGIN_PROP(ov::hint::model_priority); + }}}}; +#undef GET_PLUGIN_PROP + + // 2. + for (auto& p : m_prop_to_opt) { + m_all_supported_props.emplace_back(ov::PropertyName(p.first, std::get<0>(p.second))); + } + + // 3. 
#define BIND(N, T) \ { \ ov::intel_npu::N.name(), { \ @@ -672,101 +753,31 @@ void ov::npuw::CompiledModel::implement_properties() { } \ } - m_prop_to_opt = {BIND(use_npuw, NPU_USE_NPUW), - BIND(npuw::devices, NPUW_DEVICES), - BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE), - BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE), - BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE), - BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID), - BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN), - BIND(npuw::partitioning::plan, NPUW_PLAN), - BIND(npuw::partitioning::fold, NPUW_FOLD), - BIND(npuw::partitioning::cwai, NPUW_CWAI), - BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL), - BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE), - BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE), - BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE), - BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC), - BIND(npuw::accuracy::check, NPUW_ACC_CHECK), - BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH), - BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE), + m_prop_to_opt.insert({BIND(use_npuw, NPU_USE_NPUW), + BIND(npuw::devices, NPUW_DEVICES), + BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE), + BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE), + BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE), + BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID), + BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN), + BIND(npuw::partitioning::plan, NPUW_PLAN), + BIND(npuw::partitioning::fold, NPUW_FOLD), + BIND(npuw::partitioning::cwai, NPUW_CWAI), + BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL), + BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE), + BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE), + BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE), + BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC), + BIND(npuw::accuracy::check, NPUW_ACC_CHECK), + BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH), + BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE), #ifdef NPU_PLUGIN_DEVELOPER_BUILD - BIND(npuw::dump::full, NPUW_DUMP_FULL), - BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS), - BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL), - BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO), - BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS) + BIND(npuw::dump::full, NPUW_DUMP_FULL), + BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS), + BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL), + BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO), + BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS) #endif - }; + }); #undef BIND - // 2. 
- m_prop_to_opt.insert( - {{ov::supported_properties.name(), - {ov::PropertyMutability::RO, - [&](const ::intel_npu::Config&) -> const std::vector>& { - return m_all_supported_props; - }}}, - {ov::device::properties.name(), - {ov::PropertyMutability::RO, - [&](const ::intel_npu::Config&) { - ov::AnyMap all_devices = {}; - for (size_t i = 0; i < m_compiled_submodels.size(); ++i) { - const auto& comp_model_desc = m_compiled_submodels[i]; - if (!comp_model_desc.compiled_model) // Handle if optimized out - continue; - ov::AnyMap device_properties = {}; - if (all_devices.count(submodel_device(i)) == 0) { - auto device_supported_props = - comp_model_desc.compiled_model->get_property(ov::supported_properties.name()); - for (auto&& property_name : device_supported_props.as>()) - device_properties[property_name] = - comp_model_desc.compiled_model->get_property(property_name); - all_devices[submodel_device(i)] = device_properties; - } - } - return all_devices; - }}}, - {ov::model_name.name(), - {ov::PropertyMutability::RO, - [&](const ::intel_npu::Config&) -> const std::string& { - return m_name; - }}}, - {ov::optimal_number_of_infer_requests.name(), - {ov::PropertyMutability::RO, - [&](const ::intel_npu::Config&) { - unsigned int value = 0u; - for (const auto& comp_model_desc : m_compiled_submodels) { - if (comp_model_desc.compiled_model) { // Some models may be optimized out - value = std::max( - value, - comp_model_desc.compiled_model->get_property(ov::optimal_number_of_infer_requests.name()) - .as()); - } - } - return value; - }}}, - {ov::execution_devices.name(), - {ov::PropertyMutability::RO, - [&](const ::intel_npu::Config&) { - std::vector device_names; - std::set s; - for (size_t i = 0; i < m_compiled_submodels.size(); ++i) { - const auto& comp_model_desc = m_compiled_submodels[i]; - if (!comp_model_desc.compiled_model) // handle optimized out - continue; - if (s.count(submodel_device(i)) != 0) - continue; - s.insert(submodel_device(i)); - device_names.push_back(submodel_device(i)); - } - return decltype(ov::execution_devices)::value_type{std::move(device_names)}; - }}}, - {ov::loaded_from_cache.name(), {ov::PropertyMutability::RO, [&](const ::intel_npu::Config&) { - return m_loaded_from_cache; - }}}}); - - // 3. - for (auto& p : m_prop_to_opt) { - m_all_supported_props.emplace_back(ov::PropertyName(p.first, std::get<0>(p.second))); - } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp index be118934b59a2a..fdda0636072643 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp @@ -445,8 +445,16 @@ std::shared_ptr Snapshot::tryGrowRepeatingGroups(const detail::GPtrSet if (a.empty()) { return false; // doesn't matter for stability - no groups are fused } - return a.at(0).first->getId() < b.at(0).first->getId(); + // This std::sort allows to prioritize groups from the tail + // of the original model. It's possible due to preservation of + // group IDs in topological order throughout the whole partitioning process. + // In the networks we're looking at, ensuring the merge order from the bottom + // of the network gives a better structure of a repeated block which can be + // later optimized by the plugin. + return a.at(0).first->getId() > b.at(0).first->getId(); } + // Generally we prefer bigger blocks (in terms of number of layers) + // to be merged first. 
For other cases check the comment above return a.size() > b.size(); }); diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 3337fc2b70fe72..9883f1b067c93d 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -589,6 +589,10 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< // activate the NPUW path auto useNpuwKey = ov::intel_npu::use_npuw.name(); if (properties.count(useNpuwKey) && properties.at(useNpuwKey).as()) { + // CACHE_DIR isn't supported with NPU_USE_NPUW + if (properties.count(ov::cache_dir.name()) || !_globalConfig.get().empty()) { + OPENVINO_THROW("Option 'CACHE_DIR' is not supported with NPU_USE_NPUW"); + } return std::make_shared(model->clone(), shared_from_this(), properties); } diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp index 5e7c51a8146666..e57803f3add6a6 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/utils/ranges.hpp @@ -128,9 +128,11 @@ static std::map inputRanges = { {ov::op::v5::HSigmoid::get_type_info_static(), Range({{0, 15}}, {{-1, 2, 32768}})}, {ov::op::v5::Round::get_type_info_static(), Range({{0, 15}}, {{-10, 20, 4}})}, {ov::op::v7::Gelu::get_type_info_static(), Range({{0, 15}}, {{-1, 2, 32768}})}, + {ov::op::v14::MaxPool::get_type_info_static(), Range({{0, 10, 1, 1}}, {{0, 10, 1, 1}})}, {ov::op::v8::MaxPool::get_type_info_static(), Range({{0, 10, 1, 1}}, {{0, 10, 1, 1}})}, {ov::op::v1::MaxPool::get_type_info_static(), Range({{0, 10, 1, 1}}, {{0, 10, 1, 1}})}, {ov::op::v1::AvgPool::get_type_info_static(), Range({{0, 10, 1, 1}}, {{0, 10, 1, 1}})}, + {ov::op::v14::AvgPool::get_type_info_static(), Range({{0, 10, 1, 1}}, {{0, 10, 1, 1}})}, {ov::op::v9::SoftSign::get_type_info_static(), Range({{0, 15}}, {{-100, 200, 32768}})}, // new temp {ov::op::v1::Convolution::get_type_info_static(), Range({{0, 15}}, {{0, 8, 32}})},