diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.cpp
new file mode 100644
index 00000000000000..ac48f38d90fdcd
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.cpp
@@ -0,0 +1,211 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include
+#include
+#include
+
+// Tile size : 8x8
+#define HALF_TILE_SIZE 4
+#define DEFAULT_TILE_SIZE 8
+
+namespace kernel_selector {
+ParamsKey ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::UINT32);
+    k.EnableInputDataType(Datatype::INT32);
+
+    k.EnableAllOutputDataType();
+
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
+
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfzyx);
+    k.EnableOutputLayout(DataLayout::bfwzyx);
+
+    k.EnableBatching();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableDifferentTypes();
+
+    return k;
+}
+
+static inline std::string GetTiledOutputOrder(size_t size) {
+    std::string order_str = "";
+    switch (size) {
+    case 4:
+        order_str = "b, f, y, x";
+        break;
+    case 5:
+        order_str = "b, f, z, y, x";
+        break;
+    case 6:
+        order_str = "b, f, w, z, y, x";
+        break;
+    default: throw std::runtime_error("Unsupported size\n");
+    }
+    return order_str;
+}
+
+static inline size_t GetFsvAlignment(const reorder_params& params) {
+    const auto& in = params.inputs[0];
+    int fsv_alignment = -1;
+    switch (in.GetLayout()) {
+    case DataLayout::b_fs_yx_fsv16:
+    case DataLayout::b_fs_zyx_fsv16:
+        fsv_alignment = 16;
+        break;
+    case DataLayout::b_fs_yx_fsv32:
+    case DataLayout::b_fs_zyx_fsv32:
+        fsv_alignment = 32;
+        break;
+    default:
+        throw std::runtime_error("Unsupported input\n");
+    }
+    return fsv_alignment;
+}
+
+static inline size_t GetTileSize(const reorder_params& params) {
+    size_t tile_size = 0;
+
+    const auto& in = params.inputs[0];
+    switch (in.GetLayout()) {
+    case DataLayout::b_fs_yx_fsv16:
+    case DataLayout::b_fs_zyx_fsv16:
+        tile_size = DEFAULT_TILE_SIZE;
+        break;
+    case DataLayout::b_fs_yx_fsv32:
+    case DataLayout::b_fs_zyx_fsv32:
+        tile_size = HALF_TILE_SIZE;
+        break;
+    default:
+        throw std::runtime_error("Unsupported input\n");
+    }
+
+    return tile_size;
+}
+
+static inline std::vector<size_t> GetGWS(const reorder_params& params) {
+    const auto& in = params.inputs[0];
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    std::vector<size_t> gws = { CeilDiv(in.X().v, DEFAULT_TILE_SIZE) * fsv_alignment,
+        in.Y().v * in.Z().v,
+        in.Batch().v * CeilDiv(in.Feature().v, fsv_alignment) };
+
+    return gws;
+}
+
+CommonDispatchData ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::SetDefault(const reorder_params& params) const {
+    CommonDispatchData dispatchData;
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    dispatchData.gws = GetGWS(params);
+    dispatchData.lws = { fsv_alignment, 1, 1};
+    return dispatchData;
+}
+
+JitConstants ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetJitConstants(const reorder_params& params) const {
+    auto jit = ReorderKernelBase::GetJitConstants(params);
+
+    const size_t f = params.inputs[0].Feature().v;
+    const size_t x = params.inputs[0].X().v;
+    const size_t tile_size = GetTileSize(params);
+    const size_t output_ndims = params.output.GetDims().size();
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
+    jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
+    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
+    jit.AddConstant(MakeJitConstant("DEFAULT_TILE_SIZE", DEFAULT_TILE_SIZE));
+    jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
+    jit.AddConstant(MakeJitConstant("DEFAULT_STRIDE", 16));
+
+    // whether F is aligned
+    if (f % fsv_alignment != 0) {
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % fsv_alignment));
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_CONDITION", "(f >= (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE)) && (f < INPUT0_FEATURE_NUM)"));
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE))"));
+    } else {
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < INPUT0_FEATURE_NUM)"));
+    }
+
+    // whether x is tile_size-aligned
+    if (x % DEFAULT_TILE_SIZE != 0) {
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x % DEFAULT_TILE_SIZE));
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_CONDITION", "(x >= (INPUT0_SIZE_X - X_REMAINDER_SIZE)) && (x < INPUT0_SIZE_X)"));
+    }
+
+    return jit;
+}
+
+KernelsData ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetKernelsData(const Params& params, const optional_params& options) const {
+    assert(params.GetType() == KernelType::REORDER);
+
+    const reorder_params& orgParams = static_cast<const reorder_params&>(params);
+
+    return GetCommonKernelsData(orgParams, options);
+}
+
+bool ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::Validate(const Params& p, const optional_params& o) const {
+    if (!ReorderKernelBase::Validate(p, o)) {
+        return false;
+    }
+
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+    const auto& output = params.output;
+
+    // decreasing the number of dims is not supported
+    if (input.GetDims().size() > output.GetDims().size()) {
+        return false;
+    }
+
+    // padding is not supported
+    if (input.X().pad.before != 0 || input.X().pad.after != 0 ||
+        input.Y().pad.before != 0 || input.Y().pad.after != 0 ||
+        input.Z().pad.before != 0 || input.Z().pad.after != 0 ||
+        input.W().pad.before != 0 || input.W().pad.after != 0 ||
+        input.Feature().pad.before != 0 || input.Feature().pad.after != 0 ||
+        input.Batch().pad.before != 0 || input.Batch().pad.after != 0) {
+        return false;
+    }
+
+    if (output.X().pad.before != 0 || output.X().pad.after != 0 ||
+        output.Y().pad.before != 0 || output.Y().pad.after != 0 ||
+        output.Z().pad.before != 0 || output.Z().pad.after != 0 ||
+        output.W().pad.before != 0 || output.W().pad.after != 0 ||
+        output.Feature().pad.before != 0 || output.Feature().pad.after != 0 ||
+        output.Batch().pad.before != 0 || output.Batch().pad.after != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+KernelsPriority ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetKernelsPriority(const Params& p, const optional_params& /*options*/) const {
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+
+    const size_t f = input.Feature().v;
+    const size_t x = input.X().v;
+
+    const size_t tile_size = GetTileSize(params);
+    const size_t
fsv_alignment = GetFsvAlignment(params); + + if (f <= fsv_alignment && x < tile_size) { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; + } + + return FORCE_PRIORITY_3; +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h new file mode 100644 index 00000000000000..82f51daf2ca16f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "reorder_kernel_base.h" + +namespace kernel_selector { +class ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx : public ReorderKernelBase { +public: + ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx() : ReorderKernelBase("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx") {} + + bool Validate(const Params& p, const optional_params& o) const override; + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; +protected: + JitConstants GetJitConstants(const reorder_params& params) const; + CommonDispatchData SetDefault(const reorder_params& params) const; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.cpp new file mode 100644 index 00000000000000..5a256b641c26a2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "reorder_kernel_bfyx_to_blocked_format.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" +#include +#include +#include + +// Tile size : 4x4 or 8x8 +#define MIN_TILE_SIZE 4 +#define DEFAULT_TILE_SIZE 8 + +namespace kernel_selector { +ParamsKey ReorderKernel_bfyx_to_blocked_format::GetSupportedKey() const { + ParamsKey k; + + k.EnableAllInputDataType(); + k.EnableAllOutputDataType(); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableInputLayout(DataLayout::bfzyx); + + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); + k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32); + k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16); + k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32); + k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); + k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); + + k.EnableDifferentTypes(); + k.EnableBatching(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + + return k; +} + +static inline std::string GetTiledInputOrder(size_t size) { + std::string order_str = ""; + switch (size) { + case 4: + order_str = "b, f + lh, y, x"; + break; + case 5: + order_str = "b, f + lh, z, y, x"; + break; + default: throw std::runtime_error("Unsupported combination\n"); + } + return order_str; +} + +static inline size_t GetFsvAlignment(const reorder_params& params) { 
+    const auto& out = params.output;
+    int fsv_alignment = -1;
+    switch (out.GetLayout()) {
+    case DataLayout::b_fs_yx_fsv4:
+        fsv_alignment = 4;
+        break;
+    case DataLayout::b_fs_yx_fsv16:
+    case DataLayout::b_fs_zyx_fsv16:
+    case DataLayout::bs_fs_yx_bsv16_fsv16:
+    case DataLayout::bs_fs_zyx_bsv16_fsv16:
+        fsv_alignment = 16;
+        break;
+    case DataLayout::b_fs_yx_fsv32:
+    case DataLayout::fs_b_yx_fsv32:
+    case DataLayout::b_fs_zyx_fsv32:
+        fsv_alignment = 32;
+        break;
+    default:
+        throw std::runtime_error("Unsupported combination\n");
+    }
+    return fsv_alignment;
+}
+
+static inline size_t GetTileSize(const reorder_params& params) {
+    const Datatype input_type = params.inputs[0].GetDType();
+    const Datatype output_type = params.output.GetDType();
+
+    // i64 only supports tile size 4
+    if ((input_type == Datatype::INT64) || (output_type == Datatype::INT64)) {
+        return MIN_TILE_SIZE;
+    }
+
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv4) {
+        return MIN_TILE_SIZE;
+    }
+
+    if (params.inputs[0].Feature().v < DEFAULT_TILE_SIZE) {
+        return MIN_TILE_SIZE;
+    }
+
+    return DEFAULT_TILE_SIZE;
+}
+
+static inline std::vector<size_t> GetGWS(const reorder_params& params) {
+    const auto& in = params.inputs[0];
+    const size_t tile_size = GetTileSize(params);
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    std::vector<size_t> gws = { (fsv_alignment / tile_size),
+        CeilDiv(in.X().v, tile_size) * in.Y().v * in.Z().v,
+        in.Batch().v * CeilDiv(in.Feature().v, fsv_alignment) };
+
+    return gws;
+}
+
+static std::vector<size_t> GetBestLwsFromGws(const reorder_params& params, const std::vector<size_t>& gws, const size_t tile_width, const size_t tile_size) {
+    std::vector<size_t> lws{ 1, 1, 1 };
+    std::vector<size_t> dims{ 0, 1, 2 };
+
+    // SLM usage: elem_size * tile_width * tile_size * work_items <= max local mem size
+    const size_t elem_size = params.inputs[0].ElementSize();
+    const size_t max_local_mem_size = params.engineInfo.maxLocalMemSize;
+    const size_t max_work_group_size = params.engineInfo.maxWorkGroupSize;
+    size_t max_num_work_items = std::min(max_work_group_size, max_local_mem_size / (elem_size * tile_width * tile_size));
+
+    for (size_t i = 0; i < dims.size(); ++i) {
+        size_t dim = dims[i];
+        size_t max_divider = static_cast<size_t>(std::sqrt(gws[dim]) + 1);
+        for (size_t divider = 1; divider <= max_divider; ++divider) {
+            if (gws[dim] % divider == 0) {
+                const size_t lws0 = gws[dim] / divider;
+                if (lws0 <= max_num_work_items) {
+                    lws[dim] = std::max(lws[dim], lws0);
+                }
+                if (divider <= max_num_work_items) {
+                    lws[dim] = std::max(lws[dim], divider);
+                }
+            }
+        }
+        max_num_work_items /= lws[dim];
+    }
+    return lws;
+}
+
+CommonDispatchData ReorderKernel_bfyx_to_blocked_format::SetDefault(const reorder_params& params) const {
+    CommonDispatchData dispatchData;
+    const size_t tile_size = GetTileSize(params);
+    dispatchData.gws = GetGWS(params);
+    dispatchData.lws = GetBestLwsFromGws(params, dispatchData.gws, tile_size, tile_size);
+    return dispatchData;
+}
+
+JitConstants ReorderKernel_bfyx_to_blocked_format::GetJitConstants(const reorder_params& params) const {
+    auto jit = ReorderKernelBase::GetJitConstants(params);
+
+    const size_t b = params.inputs[0].Batch().v;
+    const size_t f = params.inputs[0].Feature().v;
+    const size_t x = params.inputs[0].X().v;
+    const size_t tile_size = GetTileSize(params);
+    const size_t input_ndims = params.inputs[0].GetDims().size();
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    const auto& gws = GetGWS(params);
+    const auto& lws = GetBestLwsFromGws(params, gws, tile_size, tile_size);
+    const uint64_t total_lws = lws[0] * lws[1] * lws[2];
+
+    jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(input_ndims)));
+    jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
+    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
+    jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
+    jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", tile_size * total_lws));
+
+    if (params.output.GetLayout() == DataLayout::fs_b_yx_fsv32) {
+        jit.AddConstant(MakeJitConstant("FS_B_YX_FSV", 1));
+    }
+
+    if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
+        params.output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
+        jit.AddConstant(MakeJitConstant("DOUBLE_BLOCKED_FORMAT", 1));
+        jit.AddConstant(MakeJitConstant("INPUT0_BATCH_SLICE_NUM", CeilDiv(b, fsv_alignment)));
+    }
+
+    // whether F is tile_size-aligned
+    if (f % tile_size == 0) {
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < INPUT0_FEATURE_NUM)"));
+    } else {
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % tile_size));
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_CONDITION", "(f >= (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE)) && (f < INPUT0_FEATURE_NUM)"));
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE))"));
+    }
+
+    // whether x is tile_size-aligned
+    if (x % tile_size != 0) {
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x % tile_size));
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_CONDITION", "(x >= (INPUT0_SIZE_X - X_REMAINDER_SIZE)) && (x < INPUT0_SIZE_X)"));
+        jit.AddConstant(MakeJitConstant("X_NO_REMAINDER_CONDITION", "(x < (INPUT0_SIZE_X - X_REMAINDER_SIZE))"));
+    }
+
+    return jit;
+}
+
+KernelsData ReorderKernel_bfyx_to_blocked_format::GetKernelsData(const Params& params, const optional_params& options) const {
+    assert(params.GetType() == KernelType::REORDER);
+
+    const reorder_params& orgParams = static_cast<const reorder_params&>(params);
+
+    return GetCommonKernelsData(orgParams, options);
+}
+
+bool ReorderKernel_bfyx_to_blocked_format::Validate(const Params& p, const optional_params& o) const {
+    if (!ReorderKernelBase::Validate(p, o)) {
+        return false;
+    }
+
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+    const auto& output = params.output;
+
+    if (input.GetDims().size() != output.GetDims().size()) {
+        return false;
+    }
+
+    // padding is not supported
+    if (input.X().pad.before != 0 || input.X().pad.after != 0 ||
+        input.Y().pad.before != 0 || input.Y().pad.after != 0 ||
+        input.Z().pad.before != 0 || input.Z().pad.after != 0 ||
+        input.W().pad.before != 0 || input.W().pad.after != 0 ||
+        input.Feature().pad.before != 0 || input.Feature().pad.after != 0 ||
+        input.Batch().pad.before != 0 || input.Batch().pad.after != 0) {
+        return false;
+    }
+
+    if (output.X().pad.before != 0 || output.X().pad.after != 0 ||
+        output.Y().pad.before != 0 || output.Y().pad.after != 0 ||
+        output.Z().pad.before != 0 || output.Z().pad.after != 0 ||
+        output.W().pad.before != 0 || output.W().pad.after != 0 ||
+        output.Feature().pad.before != 0 || output.Feature().pad.after != 0 ||
+        output.Batch().pad.before != 0 || output.Batch().pad.after != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+KernelsPriority ReorderKernel_bfyx_to_blocked_format::GetKernelsPriority(const Params& p, const optional_params& /*options*/) const {
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+    const auto& output = params.output;
+
+    const size_t b = input.Batch().v;
+    const size_t f = input.Feature().v;
+    const size_t x = input.X().v;
+    const size_t y = input.Y().v;
+    const size_t z = input.Z().v;
+
+    const size_t elem_size = input.ElementSize();
+    const size_t total_data_byte = b * f * x * y * z * elem_size;
+
+    const size_t tile_size = GetTileSize(params);
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    if ((f < fsv_alignment && x < tile_size) || total_data_byte < 32000) {
+        return DONT_USE_IF_HAVE_SOMETHING_ELSE;
+    }
+
+    // In this case, reorder_data_fast_b1 is faster
+    if (b == 1 && output.Batch().v == 1 && params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && f < 256) {
+        return FORCE_PRIORITY_8;
+    }
+
+    return FORCE_PRIORITY_5;
+}
+} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.h
new file mode 100644
index 00000000000000..1e542fec39d228
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.h
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "reorder_kernel_base.h"
+
+namespace kernel_selector {
+class ReorderKernel_bfyx_to_blocked_format : public ReorderKernelBase {
+public:
+    ReorderKernel_bfyx_to_blocked_format() : ReorderKernelBase("reorder_data_bfyx_to_blocked_format") {}
+
+    bool Validate(const Params& p, const optional_params& o) const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+protected:
+    JitConstants GetJitConstants(const reorder_params& params) const;
+    CommonDispatchData SetDefault(const reorder_params& params) const;
+};
+} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
index 1076176a639529..e68b6cfa4dbea1 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
@@ -11,6 +11,8 @@
 #include "reorder_kernel_binary.h"
 #include "reorder_biplanar_nv12.h"
 #include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h"
+#include "reorder_kernel_bfyx_to_blocked_format.h"
+#include "reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h"
 
 namespace kernel_selector {
 
@@ -23,6 +25,8 @@ reorder_kernel_selector::reorder_kernel_selector() {
     Attach();
     Attach();
     Attach();
+    Attach<ReorderKernel_bfyx_to_blocked_format>();
+    Attach<ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx>();
 }
 
 KernelsData reorder_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl
new file mode 100644
index 00000000000000..fe6d79f8014458
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl
@@ -0,0 +1,160 @@
+//
Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/include_all.cl" + +#define unroll_for __attribute__((opencl_unroll_hint)) for +#define CEIL_DIV(A, B) (((A) + (B) - 1) / (B)) +#define INPUT0_GET_TILED_INDEX(ORDER) INPUT0_GET_INDEX(ORDER) +#define OUTPUT_GET_TILED_INDEX(ORDER) OUTPUT_GET_INDEX(ORDER) + +#define INPUTVTYPE CAT(INPUT0_TYPE, DEFAULT_TILE_SIZE) +#define INPUTVTYPE_HALF CAT(INPUT0_TYPE, TILE_SIZE) +#define OUTPUTVTYPE CAT(OUTPUT_TYPE, TILE_SIZE) +#define VSTORE CAT(vstore, TILE_SIZE) +#define AS_INPUTVTYPE CAT(as_, INPUTVTYPE) +#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE) + +#define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX)) +#define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX)) +#define GET_LOCAL_SIZE(IDX) ((uint)get_local_size(IDX)) + +__attribute__((intel_reqd_sub_group_size(DEFAULT_STRIDE))) + +KERNEL (reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output + ) +{ + const uint sub_group_id = get_sub_group_id(); + const uint sub_group_local_id = get_sub_group_local_id(); + +#if INPUT0_DIMS == 4 + #if OUTPUT_DIMS > 4 + const uint z = 0; + const uint w = 0; + #endif + const uint y = GET_GLOBAL_ID(1); +#elif INPUT0_DIMS == 5 + #if OUTPUT_DIMS > 5 + const uint w = 0; + #endif + const uint y = GET_GLOBAL_ID(1) % INPUT0_SIZE_Y; + const uint z = GET_GLOBAL_ID(1) / INPUT0_SIZE_Y; +#else +#error reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl: input format - not supported +#endif + + const uint x = GET_GLOBAL_ID(0) / FSV_ALIGNMENT * DEFAULT_TILE_SIZE; + + const uint fs = GET_GLOBAL_ID(2) % INPUT0_FEATURE_SLICE_NUM; + const uint b = GET_GLOBAL_ID(2) / INPUT0_FEATURE_SLICE_NUM; + const uint f = fs * FSV_ALIGNMENT + sub_group_local_id; + + //read + const uint x_pitch = FSV_ALIGNMENT; + const uint y_pitch = x_pitch * INPUT0_SIZE_X; +#if INPUT0_DIMS == 4 + const uint fs_pitch = y_pitch * INPUT0_SIZE_Y; + const uint b_pitch = fs_pitch * INPUT0_FEATURE_SLICE_NUM; + const uint input_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch); +#else + const uint z_pitch = y_pitch * INPUT0_SIZE_Y; + const uint fs_pitch = z_pitch * INPUT0_SIZE_Z; + const uint b_pitch = fs_pitch * INPUT0_FEATURE_SLICE_NUM; + const uint input_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (z * z_pitch) + (y * y_pitch) + (x * x_pitch); +#endif + + +#if (TILE_SIZE == DEFAULT_TILE_SIZE) + // read + INPUTVTYPE read_data = AS_INPUTVTYPE(intel_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); + + // write + const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER); + + if (F_NO_REMAINDER_CONDITION +#ifdef F_REMAINDER_SIZE + || (F_REMAINDER_CONDITION && ((f % FSV_ALIGNMENT) < F_REMAINDER_SIZE)) +#endif + ) { + #ifdef X_REMAINDER_SIZE + if (X_REMAINDER_CONDITION) { + for (int i = 0 ; i < X_REMAINDER_SIZE; i++) { + output[output_idx + i] = TO_OUTPUT_TYPE(read_data[i]); + } + } else { + VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); + } + #else + VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); + #endif + } +#else + const uint sgid_remainder = sub_group_id % 2; + + // read + const uint input_idx_final = input_idx_tile + sgid_remainder * (DEFAULT_STRIDE * DEFAULT_TILE_SIZE); + INPUTVTYPE read_data = AS_INPUTVTYPE(intel_sub_group_block_read8((const __global uint*)(input) + input_idx_final)); + INPUTVTYPE_HALF read_half1 = {read_data[0], read_data[2], read_data[4], read_data[6]}; + INPUTVTYPE_HALF read_half2 = {read_data[1], read_data[3], 
read_data[5], read_data[7]}; + + // write + const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER); + const uint output_idx_final = output_idx + (sgid_remainder * TILE_SIZE); + + if (F_NO_REMAINDER_CONDITION +#ifdef F_REMAINDER_SIZE + || (F_REMAINDER_CONDITION && ((f % FSV_ALIGNMENT) < F_REMAINDER_SIZE)) +#endif + ) { + #ifdef X_REMAINDER_SIZE + if (X_REMAINDER_CONDITION) { + const int nloop = X_REMAINDER_SIZE - (TILE_SIZE * sgid_remainder); + for (int i = 0 ; i < min(nloop, TILE_SIZE); i++) { + output[output_idx_final + i] = TO_OUTPUT_TYPE(read_half1[i]); + #ifdef F_REMAINDER_SIZE + if ((f + DEFAULT_STRIDE) < OUTPUT_FEATURE_NUM) + #endif + { + output[output_idx_final + i + (OUTPUT_FEATURE_PITCH * DEFAULT_STRIDE)] = TO_OUTPUT_TYPE(read_half2[i]); + } + } + } else { + VSTORE(TO_OUTPUTVTYPE(read_half1), 0, output + output_idx_final); + #ifdef F_REMAINDER_SIZE + if ((f + DEFAULT_STRIDE) < OUTPUT_FEATURE_NUM) + #endif + { + VSTORE(TO_OUTPUTVTYPE(read_half2), 0, output + output_idx_final + (OUTPUT_FEATURE_PITCH * DEFAULT_STRIDE)); + } + } + #else + VSTORE(TO_OUTPUTVTYPE(read_half1), 0, output + output_idx_final); + #ifdef F_REMAINDER_SIZE + if((f + DEFAULT_STRIDE) < OUTPUT_FEATURE_NUM) + #endif + { + VSTORE(TO_OUTPUTVTYPE(read_half2), 0, output + output_idx_final + (OUTPUT_FEATURE_PITCH * DEFAULT_STRIDE)); + } + #endif + } +#endif +} + +#undef GET_LOCAL_SIZE +#undef GET_LOCAL_ID +#undef GET_GLOBAL_ID + +#undef TO_OUTPUTVTYPE +#undef AS_INPUTVTYPE +#undef VSTORE +#undef OUTPUTVTYPE +#undef INPUTVTYPE_HALF +#undef INPUTVTYPE + +#undef OUTPUT_GET_TILED_INDEX +#undef INPUT0_GET_TILED_INDEX +#undef CEIL_DIV +#undef unroll_for diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_bfyx_to_blocked_format.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_bfyx_to_blocked_format.cl new file mode 100644 index 00000000000000..ae03ed32d98d1c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_bfyx_to_blocked_format.cl @@ -0,0 +1,165 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/fetch.cl" +#include "include/common.cl" +#include "include/data_types.cl" + +#define unroll_for __attribute__((opencl_unroll_hint)) for +#define CEIL_DIV(A, B) (((A) + (B) - 1) / (B)) +#define INPUT0_GET_TILED_INDEX(ORDER) INPUT0_GET_INDEX(ORDER) + +#define INPUTVTYPE CAT(INPUT0_TYPE, TILE_SIZE) +#define OUTPUTVTYPE CAT(OUTPUT_TYPE, TILE_SIZE) +#define VLOAD CAT(vload, TILE_SIZE) +#define VSTORE CAT(vstore, TILE_SIZE) +#define AS_INPUTVTYPE CAT(as_, INPUTVTYPE) +#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE) + +#define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX)) +#define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX)) +#define GET_LOCAL_SIZE(IDX) ((uint)get_local_size(IDX)) + +#define FUNC_VLOAD(inner, outer) unroll_for (uint lh = 0; lh < outer; ++lh) { \ + const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER); \ + INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx)); \ + unroll_for (uint lw = 0; lw < inner; ++lw) { \ + const uint dst = local_buf_offset + lw; \ + transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS); \ + } \ + } + +#define FUNC_VSTORE(loop) unroll_for (uint lw = 0; lw < loop; ++lw) { \ + const uint output_idx = output_idx_tile + (lw * x_pitch); \ + VSTORE(TO_OUTPUTVTYPE(transpose_buf[local_buf_offset + lw]), 0, output + output_idx); \ + } + +#define FUNC_WRITE(inner, outer) 
unroll_for (uint lw = 0; lw < outer; ++lw) { \ + const uint output_idx = output_idx_tile + (lw * x_pitch); \ + unroll_for (uint i = 0; i < inner; ++i) { \ + output[output_idx + i] = TO_OUTPUT_TYPE(transpose_buf[local_buf_offset + lw][i]); \ + } \ + } + +KERNEL (reorder_data_bfyx_to_blocked_format)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output + ) +{ +#if INPUT0_DIMS == 4 + const uint y = GET_GLOBAL_ID(1) % INPUT0_SIZE_Y; + const uint x = (GET_GLOBAL_ID(1) / INPUT0_SIZE_Y) * TILE_SIZE; +#elif INPUT0_DIMS == 5 + const uint z = GET_GLOBAL_ID(1) % INPUT0_SIZE_Z; + const uint yx = GET_GLOBAL_ID(1) / INPUT0_SIZE_Z; + const uint y = yx % INPUT0_SIZE_Y; + const uint x = (yx / INPUT0_SIZE_Y) * TILE_SIZE; +#else +#error reorder_data_bfyx_to_blocked_format.cl: input format - not supported +#endif + + const uint fsv = GET_GLOBAL_ID(0) * TILE_SIZE; + const uint fs = GET_GLOBAL_ID(2) % INPUT0_FEATURE_SLICE_NUM; + const uint b = GET_GLOBAL_ID(2) / INPUT0_FEATURE_SLICE_NUM; + const uint f = fsv + fs * FSV_ALIGNMENT; + +#if DOUBLE_BLOCKED_FORMAT + const uint bs = b / FSV_ALIGNMENT; + const uint bsv = b % FSV_ALIGNMENT; + const uint x_pitch = FSV_ALIGNMENT * FSV_ALIGNMENT; +#else + const uint x_pitch = FSV_ALIGNMENT; +#endif + const uint y_pitch = x_pitch * (OUTPUT_SIZE_X); + +#if INPUT0_DIMS == 4 + #if DOUBLE_BLOCKED_FORMAT + const uint bsv_pitch = FSV_ALIGNMENT; + const uint fs_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint bs_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + const uint output_idx_tile = (bs * bs_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch) + (bsv * bsv_pitch) + (fsv); + #else + #if FS_B_YX_FSV + const uint b_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint fs_pitch = b_pitch * (INPUT0_BATCH_NUM); + #else + const uint fs_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint b_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + #endif + const uint output_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch) + (fsv); + #endif +#elif INPUT0_DIMS == 5 + #if DOUBLE_BLOCKED_FORMAT + const uint bsv_pitch = FSV_ALIGNMENT; + const uint z_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint fs_pitch = z_pitch * (OUTPUT_SIZE_Z); + const uint bs_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + const uint output_idx_tile = (bs * bs_pitch) + (fs * fs_pitch) + (z * z_pitch) + (y * y_pitch) + (x * x_pitch) + (bsv * bsv_pitch) + (fsv); + #else + const uint z_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint fs_pitch = z_pitch * (OUTPUT_SIZE_Z); + const uint b_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + const uint output_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (z * z_pitch) + (y * y_pitch) + (x * x_pitch) + (fsv); + #endif +#endif + + // get local buf offset + __local OUTPUTVTYPE transpose_buf[TRANS_BUF_SIZE]; + const uint local_id = GET_LOCAL_ID(0) * GET_LOCAL_SIZE(2) * GET_LOCAL_SIZE(1) + + GET_LOCAL_ID(1) * GET_LOCAL_SIZE(2) + + GET_LOCAL_ID(2); + const uint local_buf_offset = local_id * TILE_SIZE; + + if (F_NO_REMAINDER_CONDITION) { + // read and transpose + FUNC_VLOAD(TILE_SIZE, TILE_SIZE) + + // write to ddr +#ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_VSTORE(TILE_SIZE) + } else { + FUNC_VSTORE(X_REMAINDER_SIZE) + } +#else + FUNC_VSTORE(TILE_SIZE) +#endif + } +#ifdef F_REMAINDER_CONDITION + else if (F_REMAINDER_CONDITION) { + // read and transpose + FUNC_VLOAD(TILE_SIZE, F_REMAINDER_SIZE) + + // write to ddr + #ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_WRITE(F_REMAINDER_SIZE, TILE_SIZE) + } else { + 
FUNC_WRITE(F_REMAINDER_SIZE, X_REMAINDER_SIZE)
+        }
+    #else
+        FUNC_WRITE(F_REMAINDER_SIZE, TILE_SIZE)
+    #endif
+    }
+#endif
+}
+
+#undef FUNC_WRITE
+#undef FUNC_VSTORE
+#undef FUNC_VLOAD
+
+#undef GET_LOCAL_SIZE
+#undef GET_LOCAL_ID
+#undef GET_GLOBAL_ID
+
+#undef TO_OUTPUTVTYPE
+#undef AS_INPUTVTYPE
+#undef VSTORE
+#undef VLOAD
+#undef OUTPUTVTYPE
+#undef INPUTVTYPE
+
+#undef INPUT0_GET_TILED_INDEX
+#undef CEIL_DIV
+#undef unroll_for
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
index 59a4fd0af2328b..f2c80c3b06130c 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
@@ -23,8 +23,251 @@
 using namespace cldnn;
 using namespace tests;
 using namespace testing;
 
-TEST(reorder_gpu_f32, basic)
-{
+static void compare_bfyx2blocked_with_ref(const std::string& kernel_name,
+    const data_types input_data_type, const data_types output_data_type,
+    cldnn::format input_format, cldnn::format output_format,
+    int32_t b_in, int32_t f_in, int32_t x_in, int32_t y_in, int32_t z_in = 0, int32_t w_in = 0) {
+    const auto& engine = get_test_engine();
+
+    tensor ts;
+    if (input_format.dimension() == 4) {
+        ts = { b_in, f_in, x_in, y_in };
+    }
+    else if (input_format.dimension() == 5) {
+        ts = { b_in, f_in, x_in, y_in, z_in };
+    }
+    else {
+        ts = { b_in, f_in, x_in, y_in, z_in, w_in };
+    }
+
+    auto input = memory::allocate(engine, { input_data_type, input_format, ts });
+    layout output_layout(output_data_type, output_format, ts);
+
+    if (input_data_type == data_types::i8) {
+        auto input_ptr = input.pointer<char>();
+        unsigned char i = 1;
+        for (auto it = input_ptr.begin(); it != input_ptr.end(); ++it)
+        {
+            *it = (i++);
+            if (i > 100) {
+                i = 1;
+            }
+        }
+    } else {
+        auto input_ptr = input.pointer<float>();
+        float i = 1.f;
+        for (auto it = input_ptr.begin(); it != input_ptr.end(); ++it)
+        {
+            *it = (i);
+            i += 1.f;
+        }
+    }
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    // run on the reference (reorder_data) kernel
+    cldnn::build_options options_ref;
+    cldnn::implementation_desc reorder_ref = { output_format, "reorder_data" };
+    options_ref.set_option(cldnn::build_option::force_implementations({ {"reorder", reorder_ref} }));
+
+    network network_ref(engine, topology, options_ref);
+    network_ref.set_input_data("input", input);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> outputs_ref;
+
+    outputs_ref = network_ref.execute();
+    cldnn::event e1 = outputs_ref.at("reorder").get_event();
+    e1.wait();
+
+    auto output_ref = outputs_ref.begin()->second.get_memory();
+    auto output_ref_ptr = output_ref.pointer<char>();
+
+    // run on the optimized kernel
+    cldnn::build_options options;
+    cldnn::implementation_desc reorder_optimized = { output_format, kernel_name };
+    options.set_option(cldnn::build_option::force_implementations({ {"reorder", reorder_optimized} }));
+
+    network network(engine, topology, options);
+    network.set_input_data("input", input);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> outputs;
+
+    outputs = network.execute();
+    cldnn::event e2 = outputs.at("reorder").get_event();
+    e2.wait();
+
+    auto output = outputs.begin()->second.get_memory();
+    auto output_ptr = output.pointer<char>();
+
+    // compare results
+    const size_t output_size = output_ref_ptr.size();
+    for (size_t i = 0; i < output_size; i++)
+    {
+        EXPECT_EQ(output_ref_ptr[i], output_ptr[i]);
+    }
+}
+
+TEST(reorder_gpu_optimization,
compare_with_ref__b_fs_yx_fsv32_to_bfyx_f32) { + // b_fs_yx_fsv32 -> bfyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfyx, 3, 64 + 5, 16 + 11, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfyx, 3, 96 - 12, 16 + 4, 3); + // b_fs_zyx_fsv32 -> bfzyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfzyx, 3, 64 + 9, 16 - 1, 2, 8); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfzyx, 2, 64 + 30, 16 + 1, 3, 4); + // incremental dims + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfzyx, 2, 64 + 4, 24 - 1, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfwzyx, 2, 64 + 2, 32 - 3, 4); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv32, format::bfwzyx, 1, 96 + 10, 32 - 3, 4, 3); +} + +TEST(reorder_gpu_optimization, compare_with_ref__b_fs_yx_fsv32_to_bfyx_different_datatype) { + // f32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::u8, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 8 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i64, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 16 + 2, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f16, format::b_fs_yx_fsv32, format::bfyx, 1, 64, 16 + 1, 2); + // i32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i8, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 8 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i64, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 16 + 2, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::f16, format::b_fs_yx_fsv32, format::bfyx, 1, 64, 16 + 1, 2); +} + +TEST(reorder_gpu_optimization, compare_with_ref__b_fs_yx_fsv16_to_bfyx_f32) { + // u-net + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 1, 64, 388, 388); + // b_fs_yx_fsv16 -> bfyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 3, 48 + 1, 16, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 2, 32 - 1, 24 - 1, 3); + // b_fs_zyx_fsv16 -> bfzyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx, 5, 48 - 1, 16, 3, 8); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx, 2, 32 + 1, 24 - 1, 3, 17); + // incremental dims + 
compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx, 3, 32 - 1, 24 - 1, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfwzyx, 4, 16 + 1, 32 - 3, 4); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx, 3, 16 + 2, 32 - 3, 4, 9); +} + +TEST(reorder_gpu_optimization, compare_with_ref__b_fs_yx_fsv16_to_bfyx_different_datatype) { + // f32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::u8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i32, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i64, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f16, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + // i32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::u8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i64, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::f16, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); +} + +TEST(reorder_gpu_optimization, compare_with_ref__bfyx_to_blocked_f32) { + // bfyx_to_b_fs_yx_fsv4 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv4, 4, 32, 16, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv4, 3, 32 + 2, 32 + 3, 4); + // bfyx_to_b_fs_yx_fsv16 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 2, 48, 8, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); + // bfyx to b_fs_yx_fsv32 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv32, 2, 64, 64, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv32, 4, 32 + 6, 96 - 4, 2); + // bfyx to fs_b_yx_fsv32 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", 
data_types::f32, data_types::f32, format::bfyx, format::fs_b_yx_fsv32, 2, 64, 8, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::fs_b_yx_fsv32, 3, 64 + 5, 8 + 7, 2); + // bfzyx to b_fs_zyx_fsv16 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv16, 2, 48, 8, 4, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv16, 3, 32 + 5, 16 + 7, 2, 2); + // bfzyx to b_fs_zyx_fsv32 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv32, 2, 64, 8, 4, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv32, 3, 64 + 5, 8 + 7, 2, 2); +} + +TEST(reorder_gpu_optimization, compare_with_ref__bfyx_to_double_blocked_f32) { + // bfyx to double blocked format (bs_fs_yx_bsv16_fsv16) + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32, 48, 8, 4); // no + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32 + 2, 48, 16, 4); // b + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32, 48 + 5, 16, 4); // f + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32, 48, 48 + 3, 4); // x + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32 + 2, 48 + 3, 16 + 1, 4); // b-f-x + // bfzyx to double blocked format (bs_fs_zyx_bsv16_fsv16) + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32, 48, 8, 4, 16); // no + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32 + 2, 48, 16, 4, 2); // b + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32, 48 + 5, 16, 4, 3); // f + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32, 48, 48 + 3, 4, 4); // x + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32 + 2, 48 + 3, 16 + 1, 4, 2); // b-f-x +} + +TEST(reorder_gpu_optimization, compare_with_ref__bfyx_to_blocked_format_different_datatype) { + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f16, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::i8, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::i64, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); +} + 
+TEST(reorder_gpu_optimization, bfyx_to_fsv16_without_f_remainder) {
+    const auto& engine = get_test_engine();
+    const int32_t b_in = 1;
+    const int32_t f_in = 8 * 4;
+    const int32_t y_in = 4;
+    const int32_t x_in = 8 * 2;
+
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { b_in,f_in,x_in,y_in } });
+    layout output_layout(data_types::f32, format::b_fs_yx_fsv16, { b_in,f_in,x_in,y_in });
+
+    // Set incremental input value
+    auto input_ptr = input.pointer<float>();
+    float i = 0.f;
+    for (auto it = input_ptr.begin(); it != input_ptr.end(); ++it)
+    {
+        *it = (i++);
+    }
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "reorder");
+
+    auto output = outputs.begin()->second.get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    auto get_fsv16_index = [](int32_t /* b_size */, int32_t f_size, int32_t y_size, int32_t x_size,
+        int32_t b, int32_t f, int32_t y, int32_t x) {
+            const int32_t alignment = 16;
+            const int32_t fs = f / alignment;
+            const int32_t fsv = f % alignment;
+
+            const int32_t x_pitch = alignment;
+            const int32_t y_pitch = x_pitch * (x_size);
+            const int32_t fs_pitch = y_pitch * (y_size);
+            const int32_t b_pitch = fs_pitch * ((f_size)/alignment);
+
+            const int32_t output_offset = (b * b_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch) + (fsv);
+
+            return output_offset;
+    };
+
+    int32_t linear_index = 0;
+    for (int32_t b = 0; b < b_in; b++) {
+        for (int32_t f = 0; f < f_in; f++) {
+            for (int32_t y = 0; y < y_in; y++) {
+                for (int32_t x = 0; x < x_in; x++) {
+                    int32_t b_fs_yx_fsv16_index = get_fsv16_index(b_in, f_in, y_in, x_in, b, f, y, x);
+                    EXPECT_FLOAT_EQ(input_ptr[linear_index++], output_ptr[b_fs_yx_fsv16_index]);
+                }
+            }
+        }
+    }
+
+}
+
+TEST(reorder_gpu_f32, basic) {
     // Input : yxfb:2x2x2x2
     // Output : bfyx:2x2x2x2
     //