diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.cpp
new file mode 100644
index 00000000000000..ac48f38d90fdcd
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.cpp
@@ -0,0 +1,211 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include
+#include
+#include
+
+// Tile size : 8x8
+#define HALF_TILE_SIZE 4
+#define DEFAULT_TILE_SIZE 8
+
+namespace kernel_selector {
+ParamsKey ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::UINT32);
+    k.EnableInputDataType(Datatype::INT32);
+
+    k.EnableAllOutputDataType();
+
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
+
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfzyx);
+    k.EnableOutputLayout(DataLayout::bfwzyx);
+
+    k.EnableBatching();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableDifferentTypes();
+
+    return k;
+}
+
+static inline std::string GetTiledOutputOrder(size_t size) {
+    std::string order_str = "";
+    switch (size) {
+    case 4:
+        order_str = "b, f, y, x";
+        break;
+    case 5:
+        order_str = "b, f, z, y, x";
+        break;
+    case 6:
+        order_str = "b, f, w, z, y, x";
+        break;
+    default: throw std::runtime_error("Unsupported size\n");
+    }
+    return order_str;
+}
+
+static inline size_t GetFsvAlignment(const reorder_params& params) {
+    const auto& in = params.inputs[0];
+    int fsv_alignment = -1;
+    switch (in.GetLayout()) {
+    case DataLayout::b_fs_yx_fsv16:
+    case DataLayout::b_fs_zyx_fsv16:
+        fsv_alignment = 16;
+        break;
+    case DataLayout::b_fs_yx_fsv32:
+    case DataLayout::b_fs_zyx_fsv32:
+        fsv_alignment = 32;
+        break;
+    default:
+        throw std::runtime_error("Unsupported input\n");
+    }
+    return fsv_alignment;
+}
+
+static inline size_t GetTileSize(const reorder_params& params) {
+    size_t tile_size = 0;
+
+    const auto& in = params.inputs[0];
+    switch (in.GetLayout()) {
+    case DataLayout::b_fs_yx_fsv16:
+    case DataLayout::b_fs_zyx_fsv16:
+        tile_size = DEFAULT_TILE_SIZE;
+        break;
+    case DataLayout::b_fs_yx_fsv32:
+    case DataLayout::b_fs_zyx_fsv32:
+        tile_size = HALF_TILE_SIZE;
+        break;
+    default:
+        throw std::runtime_error("Unsupported input\n");
+    }
+
+    return tile_size;
+}
+
+static inline std::vector<size_t> GetGWS(const reorder_params& params) {
+    const auto& in = params.inputs[0];
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    std::vector<size_t> gws = { CeilDiv(in.X().v, DEFAULT_TILE_SIZE) * fsv_alignment,
+        in.Y().v * in.Z().v,
+        in.Batch().v * CeilDiv(in.Feature().v, fsv_alignment) };
+
+    return gws;
+}
+
+CommonDispatchData ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::SetDefault(const reorder_params& params) const {
+    CommonDispatchData dispatchData;
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    dispatchData.gws = GetGWS(params);
+    dispatchData.lws = { fsv_alignment, 1, 1};
+    return dispatchData;
+}
+
+JitConstants ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetJitConstants(const reorder_params& params) const {
+    auto jit = ReorderKernelBase::GetJitConstants(params);
+
+    const size_t f = params.inputs[0].Feature().v;
+    const size_t x = params.inputs[0].X().v;
+    const size_t tile_size = GetTileSize(params);
+    const size_t output_ndims = params.output.GetDims().size();
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    jit.AddConstant(MakeJitConstant("OUTPUT_TILED_ORDER", GetTiledOutputOrder(output_ndims)));
+    jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
+    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
+    jit.AddConstant(MakeJitConstant("DEFAULT_TILE_SIZE", DEFAULT_TILE_SIZE));
+    jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
+    jit.AddConstant(MakeJitConstant("DEFAULT_STRIDE", 16));
+
+    // whether F is aligned
+    if (f % fsv_alignment != 0) {
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % fsv_alignment));
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_CONDITION", "(f >= (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE)) && (f < INPUT0_FEATURE_NUM)"));
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE))"));
+    } else {
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < INPUT0_FEATURE_NUM)"));
+    }
+
+    // whether x is tile_size-aligned
+    if (x % DEFAULT_TILE_SIZE != 0) {
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x % DEFAULT_TILE_SIZE));
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_CONDITION", "(x >= (INPUT0_SIZE_X - X_REMAINDER_SIZE)) && (x < INPUT0_SIZE_X)"));
+    }
+
+    return jit;
+}
+
+KernelsData ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetKernelsData(const Params& params, const optional_params& options) const {
+    assert(params.GetType() == KernelType::REORDER);
+
+    const reorder_params& orgParams = static_cast<const reorder_params&>(params);
+
+    return GetCommonKernelsData(orgParams, options);
+}
+
+bool ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::Validate(const Params& p, const optional_params& o) const {
+    if (!ReorderKernelBase::Validate(p, o)) {
+        return false;
+    }
+
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+    const auto& output = params.output;
+
+    // decreasing the number of dims is not supported
+    if (input.GetDims().size() > output.GetDims().size()) {
+        return false;
+    }
+
+    // padding is not supported
+    if (input.X().pad.before != 0 || input.X().pad.after != 0 ||
+        input.Y().pad.before != 0 || input.Y().pad.after != 0 ||
+        input.Z().pad.before != 0 || input.Z().pad.after != 0 ||
+        input.W().pad.before != 0 || input.W().pad.after != 0 ||
+        input.Feature().pad.before != 0 || input.Feature().pad.after != 0 ||
+        input.Batch().pad.before != 0 || input.Batch().pad.after != 0) {
+        return false;
+    }
+
+    if (output.X().pad.before != 0 || output.X().pad.after != 0 ||
+        output.Y().pad.before != 0 || output.Y().pad.after != 0 ||
+        output.Z().pad.before != 0 || output.Z().pad.after != 0 ||
+        output.W().pad.before != 0 || output.W().pad.after != 0 ||
+        output.Feature().pad.before != 0 || output.Feature().pad.after != 0 ||
+        output.Batch().pad.before != 0 || output.Batch().pad.after != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+KernelsPriority ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx::GetKernelsPriority(const Params& p, const optional_params& /*options*/) const {
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+
+    const size_t f = input.Feature().v;
+    const size_t x = input.X().v;
+
+    const size_t tile_size = GetTileSize(params);
+    const size_t
fsv_alignment = GetFsvAlignment(params); + + if (f <= fsv_alignment && x < tile_size) { + return DONT_USE_IF_HAVE_SOMETHING_ELSE; + } + + return FORCE_PRIORITY_3; +} +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h new file mode 100644 index 00000000000000..82f51daf2ca16f --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "reorder_kernel_base.h" + +namespace kernel_selector { +class ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx : public ReorderKernelBase { +public: + ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx() : ReorderKernelBase("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx") {} + + bool Validate(const Params& p, const optional_params& o) const override; + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; +protected: + JitConstants GetJitConstants(const reorder_params& params) const; + CommonDispatchData SetDefault(const reorder_params& params) const; +}; +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.cpp new file mode 100644 index 00000000000000..5a256b641c26a2 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "reorder_kernel_bfyx_to_blocked_format.h" +#include "kernel_selector_utils.h" +#include "common_tools.h" +#include +#include +#include + +// Tile size : 4x4 or 8x8 +#define MIN_TILE_SIZE 4 +#define DEFAULT_TILE_SIZE 8 + +namespace kernel_selector { +ParamsKey ReorderKernel_bfyx_to_blocked_format::GetSupportedKey() const { + ParamsKey k; + + k.EnableAllInputDataType(); + k.EnableAllOutputDataType(); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableInputLayout(DataLayout::bfzyx); + + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); + k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32); + k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32); + k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16); + k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32); + k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); + k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); + + k.EnableDifferentTypes(); + k.EnableBatching(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + + return k; +} + +static inline std::string GetTiledInputOrder(size_t size) { + std::string order_str = ""; + switch (size) { + case 4: + order_str = "b, f + lh, y, x"; + break; + case 5: + order_str = "b, f + lh, z, y, x"; + break; + default: throw std::runtime_error("Unsupported combination\n"); + } + return order_str; +} + +static inline size_t GetFsvAlignment(const reorder_params& params) { 
+    const auto& out = params.output;
+    int fsv_alignment = -1;
+    switch (out.GetLayout()) {
+    case DataLayout::b_fs_yx_fsv4:
+        fsv_alignment = 4;
+        break;
+    case DataLayout::b_fs_yx_fsv16:
+    case DataLayout::b_fs_zyx_fsv16:
+    case DataLayout::bs_fs_yx_bsv16_fsv16:
+    case DataLayout::bs_fs_zyx_bsv16_fsv16:
+        fsv_alignment = 16;
+        break;
+    case DataLayout::b_fs_yx_fsv32:
+    case DataLayout::fs_b_yx_fsv32:
+    case DataLayout::b_fs_zyx_fsv32:
+        fsv_alignment = 32;
+        break;
+    default:
+        throw std::runtime_error("Unsupported combination\n");
+    }
+    return fsv_alignment;
+}
+
+static inline size_t GetTileSize(const reorder_params& params) {
+    const Datatype input_type = params.inputs[0].GetDType();
+    const Datatype output_type = params.output.GetDType();
+
+    // i64 only supports tile size 4
+    if ((input_type == Datatype::INT64) || (output_type == Datatype::INT64)) {
+        return MIN_TILE_SIZE;
+    }
+
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv4) {
+        return MIN_TILE_SIZE;
+    }
+
+    if (params.inputs[0].Feature().v < DEFAULT_TILE_SIZE) {
+        return MIN_TILE_SIZE;
+    }
+
+    return DEFAULT_TILE_SIZE;
+}
+
+static inline std::vector<size_t> GetGWS(const reorder_params& params) {
+    const auto& in = params.inputs[0];
+    const size_t tile_size = GetTileSize(params);
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    std::vector<size_t> gws = { (fsv_alignment / tile_size),
+        CeilDiv(in.X().v, tile_size) * in.Y().v * in.Z().v,
+        in.Batch().v * CeilDiv(in.Feature().v, fsv_alignment) };
+
+    return gws;
+}
+
+static std::vector<size_t> GetBestLwsFromGws(const reorder_params& params, const std::vector<size_t>& gws, const size_t tile_width, const size_t tile_size) {
+    std::vector<size_t> lws{ 1, 1, 1 };
+    std::vector<size_t> dims{ 0, 1, 2 };
+
+    // SLM usage: elem_size * tile_width * tile_size * work_items <= max local mem size
+    const size_t elem_size = params.inputs[0].ElementSize();
+    const size_t max_local_mem_size = params.engineInfo.maxLocalMemSize;
+    const size_t max_work_group_size = params.engineInfo.maxWorkGroupSize;
+    size_t max_num_work_items = std::min(max_work_group_size, max_local_mem_size / (elem_size * tile_width * tile_size));
+
+    for (size_t i = 0; i < dims.size(); ++i) {
+        size_t dim = dims[i];
+        size_t max_divider = static_cast<size_t>(std::sqrt(gws[dim]) + 1);
+        for (size_t divider = 1; divider <= max_divider; ++divider) {
+            if (gws[dim] % divider == 0) {
+                const size_t lws0 = gws[dim] / divider;
+                if (lws0 <= max_num_work_items) {
+                    lws[dim] = std::max(lws[dim], lws0);
+                }
+                if (divider <= max_num_work_items) {
+                    lws[dim] = std::max(lws[dim], divider);
+                }
+            }
+        }
+        max_num_work_items /= lws[dim];
+    }
+    return lws;
+}
+
+CommonDispatchData ReorderKernel_bfyx_to_blocked_format::SetDefault(const reorder_params& params) const {
+    CommonDispatchData dispatchData;
+    const size_t tile_size = GetTileSize(params);
+    dispatchData.gws = GetGWS(params);
+    dispatchData.lws = GetBestLwsFromGws(params, dispatchData.gws, tile_size, tile_size);
+    return dispatchData;
+}
+
+JitConstants ReorderKernel_bfyx_to_blocked_format::GetJitConstants(const reorder_params& params) const {
+    auto jit = ReorderKernelBase::GetJitConstants(params);
+
+    const size_t b = params.inputs[0].Batch().v;
+    const size_t f = params.inputs[0].Feature().v;
+    const size_t x = params.inputs[0].X().v;
+    const size_t tile_size = GetTileSize(params);
+    const size_t input_ndims = params.inputs[0].GetDims().size();
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    const auto& gws = GetGWS(params);
+    const auto& lws = GetBestLwsFromGws(params, gws, tile_size, tile_size);
+    const uint64_t total_lws = lws[0] * lws[1] * lws[2];
+
+    jit.AddConstant(MakeJitConstant("INPUT0_TILED_ORDER", GetTiledInputOrder(input_ndims)));
+    jit.AddConstant(MakeJitConstant("INPUT0_FEATURE_SLICE_NUM", CeilDiv(f, fsv_alignment)));
+    jit.AddConstant(MakeJitConstant("TILE_SIZE", tile_size));
+    jit.AddConstant(MakeJitConstant("FSV_ALIGNMENT", fsv_alignment));
+    jit.AddConstant(MakeJitConstant("TRANS_BUF_SIZE", tile_size * total_lws));
+
+    if (params.output.GetLayout() == DataLayout::fs_b_yx_fsv32) {
+        jit.AddConstant(MakeJitConstant("FS_B_YX_FSV", 1));
+    }
+
+    if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
+        params.output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
+        jit.AddConstant(MakeJitConstant("DOUBLE_BLOCKED_FORMAT", 1));
+        jit.AddConstant(MakeJitConstant("INPUT0_BATCH_SLICE_NUM", CeilDiv(b, fsv_alignment)));
+    }
+
+    // whether F is tile_size-aligned
+    if (f % tile_size == 0) {
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < INPUT0_FEATURE_NUM)"));
+    } else {
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_SIZE", f % tile_size));
+        jit.AddConstant(MakeJitConstant("F_REMAINDER_CONDITION", "(f >= (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE)) && (f < INPUT0_FEATURE_NUM)"));
+        jit.AddConstant(MakeJitConstant("F_NO_REMAINDER_CONDITION", "(f < (INPUT0_FEATURE_NUM - F_REMAINDER_SIZE))"));
+    }
+
+    // whether x is tile_size-aligned
+    if (x % tile_size != 0) {
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_SIZE", x % tile_size));
+        jit.AddConstant(MakeJitConstant("X_REMAINDER_CONDITION", "(x >= (INPUT0_SIZE_X - X_REMAINDER_SIZE)) && (x < INPUT0_SIZE_X)"));
+        jit.AddConstant(MakeJitConstant("X_NO_REMAINDER_CONDITION", "(x < (INPUT0_SIZE_X - X_REMAINDER_SIZE))"));
+    }
+
+    return jit;
+}
+
+KernelsData ReorderKernel_bfyx_to_blocked_format::GetKernelsData(const Params& params, const optional_params& options) const {
+    assert(params.GetType() == KernelType::REORDER);
+
+    const reorder_params& orgParams = static_cast<const reorder_params&>(params);
+
+    return GetCommonKernelsData(orgParams, options);
+}
+
+bool ReorderKernel_bfyx_to_blocked_format::Validate(const Params& p, const optional_params& o) const {
+    if (!ReorderKernelBase::Validate(p, o)) {
+        return false;
+    }
+
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+    const auto& output = params.output;
+
+    if (input.GetDims().size() != output.GetDims().size()) {
+        return false;
+    }
+
+    // padding is not supported
+    if (input.X().pad.before != 0 || input.X().pad.after != 0 ||
+        input.Y().pad.before != 0 || input.Y().pad.after != 0 ||
+        input.Z().pad.before != 0 || input.Z().pad.after != 0 ||
+        input.W().pad.before != 0 || input.W().pad.after != 0 ||
+        input.Feature().pad.before != 0 || input.Feature().pad.after != 0 ||
+        input.Batch().pad.before != 0 || input.Batch().pad.after != 0) {
+        return false;
+    }
+
+    if (output.X().pad.before != 0 || output.X().pad.after != 0 ||
+        output.Y().pad.before != 0 || output.Y().pad.after != 0 ||
+        output.Z().pad.before != 0 || output.Z().pad.after != 0 ||
+        output.W().pad.before != 0 || output.W().pad.after != 0 ||
+        output.Feature().pad.before != 0 || output.Feature().pad.after != 0 ||
+        output.Batch().pad.before != 0 || output.Batch().pad.after != 0) {
+        return false;
+    }
+
+    return true;
+}
+
+KernelsPriority ReorderKernel_bfyx_to_blocked_format::GetKernelsPriority(const Params& p, const optional_params& /*options*/) const {
+    const reorder_params& params = static_cast<const reorder_params&>(p);
+    const auto& input = params.inputs[0];
+    const auto& output = params.output;
+
+    const size_t b = input.Batch().v;
+    const size_t f = input.Feature().v;
+    const size_t x = input.X().v;
+    const size_t y = input.Y().v;
+    const size_t z = input.Z().v;
+
+    const size_t elem_size = input.ElementSize();
+    const size_t total_data_byte = b * f * x * y * z * elem_size;
+
+    const size_t tile_size = GetTileSize(params);
+    const size_t fsv_alignment = GetFsvAlignment(params);
+
+    if ((f < fsv_alignment && x < tile_size) || total_data_byte < 32000) {
+        return DONT_USE_IF_HAVE_SOMETHING_ELSE;
+    }
+
+    // In this case, reorder_data_fast_b1 is faster
+    if (b == 1 && output.Batch().v == 1 && params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && f < 256) {
+        return FORCE_PRIORITY_8;
+    }
+
+    return FORCE_PRIORITY_5;
+}
+} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.h
new file mode 100644
index 00000000000000..1e542fec39d228
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_bfyx_to_blocked_format.h
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "reorder_kernel_base.h"
+
+namespace kernel_selector {
+class ReorderKernel_bfyx_to_blocked_format : public ReorderKernelBase {
+public:
+    ReorderKernel_bfyx_to_blocked_format() : ReorderKernelBase("reorder_data_bfyx_to_blocked_format") {}
+
+    bool Validate(const Params& p, const optional_params& o) const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+protected:
+    JitConstants GetJitConstants(const reorder_params& params) const;
+    CommonDispatchData SetDefault(const reorder_params& params) const;
+};
+} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
index 1076176a639529..e68b6cfa4dbea1 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_selector.cpp
@@ -11,6 +11,8 @@
 #include "reorder_kernel_binary.h"
 #include "reorder_biplanar_nv12.h"
 #include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h"
+#include "reorder_kernel_bfyx_to_blocked_format.h"
+#include "reorder_kernel_b_fs_yx_fsv16_fsv32_to_bfyx.h"
 
 namespace kernel_selector {
 
@@ -23,6 +25,8 @@ reorder_kernel_selector::reorder_kernel_selector() {
     Attach();
     Attach();
     Attach();
+    Attach<ReorderKernel_bfyx_to_blocked_format>();
+    Attach<ReorderKernel_b_fs_yx_fsv16_fsv32_to_bfyx>();
 }
 
 KernelsData reorder_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl
new file mode 100644
index 00000000000000..fe6d79f8014458
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl
@@ -0,0 +1,160 @@
+//
Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/include_all.cl" + +#define unroll_for __attribute__((opencl_unroll_hint)) for +#define CEIL_DIV(A, B) (((A) + (B) - 1) / (B)) +#define INPUT0_GET_TILED_INDEX(ORDER) INPUT0_GET_INDEX(ORDER) +#define OUTPUT_GET_TILED_INDEX(ORDER) OUTPUT_GET_INDEX(ORDER) + +#define INPUTVTYPE CAT(INPUT0_TYPE, DEFAULT_TILE_SIZE) +#define INPUTVTYPE_HALF CAT(INPUT0_TYPE, TILE_SIZE) +#define OUTPUTVTYPE CAT(OUTPUT_TYPE, TILE_SIZE) +#define VSTORE CAT(vstore, TILE_SIZE) +#define AS_INPUTVTYPE CAT(as_, INPUTVTYPE) +#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE) + +#define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX)) +#define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX)) +#define GET_LOCAL_SIZE(IDX) ((uint)get_local_size(IDX)) + +__attribute__((intel_reqd_sub_group_size(DEFAULT_STRIDE))) + +KERNEL (reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output + ) +{ + const uint sub_group_id = get_sub_group_id(); + const uint sub_group_local_id = get_sub_group_local_id(); + +#if INPUT0_DIMS == 4 + #if OUTPUT_DIMS > 4 + const uint z = 0; + const uint w = 0; + #endif + const uint y = GET_GLOBAL_ID(1); +#elif INPUT0_DIMS == 5 + #if OUTPUT_DIMS > 5 + const uint w = 0; + #endif + const uint y = GET_GLOBAL_ID(1) % INPUT0_SIZE_Y; + const uint z = GET_GLOBAL_ID(1) / INPUT0_SIZE_Y; +#else +#error reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx.cl: input format - not supported +#endif + + const uint x = GET_GLOBAL_ID(0) / FSV_ALIGNMENT * DEFAULT_TILE_SIZE; + + const uint fs = GET_GLOBAL_ID(2) % INPUT0_FEATURE_SLICE_NUM; + const uint b = GET_GLOBAL_ID(2) / INPUT0_FEATURE_SLICE_NUM; + const uint f = fs * FSV_ALIGNMENT + sub_group_local_id; + + //read + const uint x_pitch = FSV_ALIGNMENT; + const uint y_pitch = x_pitch * INPUT0_SIZE_X; +#if INPUT0_DIMS == 4 + const uint fs_pitch = y_pitch * INPUT0_SIZE_Y; + const uint b_pitch = fs_pitch * INPUT0_FEATURE_SLICE_NUM; + const uint input_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch); +#else + const uint z_pitch = y_pitch * INPUT0_SIZE_Y; + const uint fs_pitch = z_pitch * INPUT0_SIZE_Z; + const uint b_pitch = fs_pitch * INPUT0_FEATURE_SLICE_NUM; + const uint input_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (z * z_pitch) + (y * y_pitch) + (x * x_pitch); +#endif + + +#if (TILE_SIZE == DEFAULT_TILE_SIZE) + // read + INPUTVTYPE read_data = AS_INPUTVTYPE(intel_sub_group_block_read8((const __global uint*)(input) + input_idx_tile)); + + // write + const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER); + + if (F_NO_REMAINDER_CONDITION +#ifdef F_REMAINDER_SIZE + || (F_REMAINDER_CONDITION && ((f % FSV_ALIGNMENT) < F_REMAINDER_SIZE)) +#endif + ) { + #ifdef X_REMAINDER_SIZE + if (X_REMAINDER_CONDITION) { + for (int i = 0 ; i < X_REMAINDER_SIZE; i++) { + output[output_idx + i] = TO_OUTPUT_TYPE(read_data[i]); + } + } else { + VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); + } + #else + VSTORE(TO_OUTPUTVTYPE(read_data), 0, output + output_idx); + #endif + } +#else + const uint sgid_remainder = sub_group_id % 2; + + // read + const uint input_idx_final = input_idx_tile + sgid_remainder * (DEFAULT_STRIDE * DEFAULT_TILE_SIZE); + INPUTVTYPE read_data = AS_INPUTVTYPE(intel_sub_group_block_read8((const __global uint*)(input) + input_idx_final)); + INPUTVTYPE_HALF read_half1 = {read_data[0], read_data[2], read_data[4], read_data[6]}; + INPUTVTYPE_HALF read_half2 = {read_data[1], read_data[3], 
read_data[5], read_data[7]}; + + // write + const uint output_idx = OUTPUT_GET_TILED_INDEX(OUTPUT_TILED_ORDER); + const uint output_idx_final = output_idx + (sgid_remainder * TILE_SIZE); + + if (F_NO_REMAINDER_CONDITION +#ifdef F_REMAINDER_SIZE + || (F_REMAINDER_CONDITION && ((f % FSV_ALIGNMENT) < F_REMAINDER_SIZE)) +#endif + ) { + #ifdef X_REMAINDER_SIZE + if (X_REMAINDER_CONDITION) { + const int nloop = X_REMAINDER_SIZE - (TILE_SIZE * sgid_remainder); + for (int i = 0 ; i < min(nloop, TILE_SIZE); i++) { + output[output_idx_final + i] = TO_OUTPUT_TYPE(read_half1[i]); + #ifdef F_REMAINDER_SIZE + if ((f + DEFAULT_STRIDE) < OUTPUT_FEATURE_NUM) + #endif + { + output[output_idx_final + i + (OUTPUT_FEATURE_PITCH * DEFAULT_STRIDE)] = TO_OUTPUT_TYPE(read_half2[i]); + } + } + } else { + VSTORE(TO_OUTPUTVTYPE(read_half1), 0, output + output_idx_final); + #ifdef F_REMAINDER_SIZE + if ((f + DEFAULT_STRIDE) < OUTPUT_FEATURE_NUM) + #endif + { + VSTORE(TO_OUTPUTVTYPE(read_half2), 0, output + output_idx_final + (OUTPUT_FEATURE_PITCH * DEFAULT_STRIDE)); + } + } + #else + VSTORE(TO_OUTPUTVTYPE(read_half1), 0, output + output_idx_final); + #ifdef F_REMAINDER_SIZE + if((f + DEFAULT_STRIDE) < OUTPUT_FEATURE_NUM) + #endif + { + VSTORE(TO_OUTPUTVTYPE(read_half2), 0, output + output_idx_final + (OUTPUT_FEATURE_PITCH * DEFAULT_STRIDE)); + } + #endif + } +#endif +} + +#undef GET_LOCAL_SIZE +#undef GET_LOCAL_ID +#undef GET_GLOBAL_ID + +#undef TO_OUTPUTVTYPE +#undef AS_INPUTVTYPE +#undef VSTORE +#undef OUTPUTVTYPE +#undef INPUTVTYPE_HALF +#undef INPUTVTYPE + +#undef OUTPUT_GET_TILED_INDEX +#undef INPUT0_GET_TILED_INDEX +#undef CEIL_DIV +#undef unroll_for diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_bfyx_to_blocked_format.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_bfyx_to_blocked_format.cl new file mode 100644 index 00000000000000..ae03ed32d98d1c --- /dev/null +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data_bfyx_to_blocked_format.cl @@ -0,0 +1,165 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/fetch.cl" +#include "include/common.cl" +#include "include/data_types.cl" + +#define unroll_for __attribute__((opencl_unroll_hint)) for +#define CEIL_DIV(A, B) (((A) + (B) - 1) / (B)) +#define INPUT0_GET_TILED_INDEX(ORDER) INPUT0_GET_INDEX(ORDER) + +#define INPUTVTYPE CAT(INPUT0_TYPE, TILE_SIZE) +#define OUTPUTVTYPE CAT(OUTPUT_TYPE, TILE_SIZE) +#define VLOAD CAT(vload, TILE_SIZE) +#define VSTORE CAT(vstore, TILE_SIZE) +#define AS_INPUTVTYPE CAT(as_, INPUTVTYPE) +#define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE) + +#define GET_GLOBAL_ID(IDX) ((uint)get_global_id(IDX)) +#define GET_LOCAL_ID(IDX) ((uint)get_local_id(IDX)) +#define GET_LOCAL_SIZE(IDX) ((uint)get_local_size(IDX)) + +#define FUNC_VLOAD(inner, outer) unroll_for (uint lh = 0; lh < outer; ++lh) { \ + const uint input_idx = INPUT0_GET_TILED_INDEX(INPUT0_TILED_ORDER); \ + INPUTVTYPE read_data = AS_INPUTVTYPE(VLOAD(0, input + input_idx)); \ + unroll_for (uint lw = 0; lw < inner; ++lw) { \ + const uint dst = local_buf_offset + lw; \ + transpose_buf[dst][lh] = ACTIVATION(read_data[lw], ACTIVATION_PARAMS); \ + } \ + } + +#define FUNC_VSTORE(loop) unroll_for (uint lw = 0; lw < loop; ++lw) { \ + const uint output_idx = output_idx_tile + (lw * x_pitch); \ + VSTORE(TO_OUTPUTVTYPE(transpose_buf[local_buf_offset + lw]), 0, output + output_idx); \ + } + +#define FUNC_WRITE(inner, outer) 
unroll_for (uint lw = 0; lw < outer; ++lw) { \ + const uint output_idx = output_idx_tile + (lw * x_pitch); \ + unroll_for (uint i = 0; i < inner; ++i) { \ + output[output_idx + i] = TO_OUTPUT_TYPE(transpose_buf[local_buf_offset + lw][i]); \ + } \ + } + +KERNEL (reorder_data_bfyx_to_blocked_format)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output + ) +{ +#if INPUT0_DIMS == 4 + const uint y = GET_GLOBAL_ID(1) % INPUT0_SIZE_Y; + const uint x = (GET_GLOBAL_ID(1) / INPUT0_SIZE_Y) * TILE_SIZE; +#elif INPUT0_DIMS == 5 + const uint z = GET_GLOBAL_ID(1) % INPUT0_SIZE_Z; + const uint yx = GET_GLOBAL_ID(1) / INPUT0_SIZE_Z; + const uint y = yx % INPUT0_SIZE_Y; + const uint x = (yx / INPUT0_SIZE_Y) * TILE_SIZE; +#else +#error reorder_data_bfyx_to_blocked_format.cl: input format - not supported +#endif + + const uint fsv = GET_GLOBAL_ID(0) * TILE_SIZE; + const uint fs = GET_GLOBAL_ID(2) % INPUT0_FEATURE_SLICE_NUM; + const uint b = GET_GLOBAL_ID(2) / INPUT0_FEATURE_SLICE_NUM; + const uint f = fsv + fs * FSV_ALIGNMENT; + +#if DOUBLE_BLOCKED_FORMAT + const uint bs = b / FSV_ALIGNMENT; + const uint bsv = b % FSV_ALIGNMENT; + const uint x_pitch = FSV_ALIGNMENT * FSV_ALIGNMENT; +#else + const uint x_pitch = FSV_ALIGNMENT; +#endif + const uint y_pitch = x_pitch * (OUTPUT_SIZE_X); + +#if INPUT0_DIMS == 4 + #if DOUBLE_BLOCKED_FORMAT + const uint bsv_pitch = FSV_ALIGNMENT; + const uint fs_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint bs_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + const uint output_idx_tile = (bs * bs_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch) + (bsv * bsv_pitch) + (fsv); + #else + #if FS_B_YX_FSV + const uint b_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint fs_pitch = b_pitch * (INPUT0_BATCH_NUM); + #else + const uint fs_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint b_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + #endif + const uint output_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch) + (fsv); + #endif +#elif INPUT0_DIMS == 5 + #if DOUBLE_BLOCKED_FORMAT + const uint bsv_pitch = FSV_ALIGNMENT; + const uint z_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint fs_pitch = z_pitch * (OUTPUT_SIZE_Z); + const uint bs_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + const uint output_idx_tile = (bs * bs_pitch) + (fs * fs_pitch) + (z * z_pitch) + (y * y_pitch) + (x * x_pitch) + (bsv * bsv_pitch) + (fsv); + #else + const uint z_pitch = y_pitch * (OUTPUT_SIZE_Y); + const uint fs_pitch = z_pitch * (OUTPUT_SIZE_Z); + const uint b_pitch = fs_pitch * (INPUT0_FEATURE_SLICE_NUM); + const uint output_idx_tile = (b * b_pitch) + (fs * fs_pitch) + (z * z_pitch) + (y * y_pitch) + (x * x_pitch) + (fsv); + #endif +#endif + + // get local buf offset + __local OUTPUTVTYPE transpose_buf[TRANS_BUF_SIZE]; + const uint local_id = GET_LOCAL_ID(0) * GET_LOCAL_SIZE(2) * GET_LOCAL_SIZE(1) + + GET_LOCAL_ID(1) * GET_LOCAL_SIZE(2) + + GET_LOCAL_ID(2); + const uint local_buf_offset = local_id * TILE_SIZE; + + if (F_NO_REMAINDER_CONDITION) { + // read and transpose + FUNC_VLOAD(TILE_SIZE, TILE_SIZE) + + // write to ddr +#ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_VSTORE(TILE_SIZE) + } else { + FUNC_VSTORE(X_REMAINDER_SIZE) + } +#else + FUNC_VSTORE(TILE_SIZE) +#endif + } +#ifdef F_REMAINDER_CONDITION + else if (F_REMAINDER_CONDITION) { + // read and transpose + FUNC_VLOAD(TILE_SIZE, F_REMAINDER_SIZE) + + // write to ddr + #ifdef X_REMAINDER_CONDITION + if (X_NO_REMAINDER_CONDITION) { + FUNC_WRITE(F_REMAINDER_SIZE, TILE_SIZE) + } else { + 
FUNC_WRITE(F_REMAINDER_SIZE, X_REMAINDER_SIZE)
+        }
+    #else
+        FUNC_WRITE(F_REMAINDER_SIZE, TILE_SIZE)
+    #endif
+    }
+#endif
+}
+
+#undef FUNC_WRITE
+#undef FUNC_VSTORE
+#undef FUNC_VLOAD
+
+#undef GET_LOCAL_SIZE
+#undef GET_LOCAL_ID
+#undef GET_GLOBAL_ID
+
+#undef TO_OUTPUTVTYPE
+#undef AS_INPUTVTYPE
+#undef VSTORE
+#undef VLOAD
+#undef OUTPUTVTYPE
+#undef INPUTVTYPE
+
+#undef INPUT0_GET_TILED_INDEX
+#undef CEIL_DIV
+#undef unroll_for
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
index 59a4fd0af2328b..f2c80c3b06130c 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
@@ -23,8 +23,251 @@
 using namespace cldnn;
 using namespace tests;
 using namespace testing;
 
-TEST(reorder_gpu_f32, basic)
-{
+static void compare_bfyx2blocked_with_ref(const std::string& kernel_name,
+    const data_types input_data_type, const data_types output_data_type,
+    cldnn::format input_format, cldnn::format output_format,
+    int32_t b_in, int32_t f_in, int32_t x_in, int32_t y_in, int32_t z_in = 0, int32_t w_in = 0) {
+    const auto& engine = get_test_engine();
+
+    tensor ts;
+    if (input_format.dimension() == 4) {
+        ts = { b_in, f_in, x_in, y_in };
+    }
+    else if (input_format.dimension() == 5) {
+        ts = { b_in, f_in, x_in, y_in, z_in };
+    }
+    else {
+        ts = { b_in, f_in, x_in, y_in, z_in, w_in };
+    }
+
+    auto input = memory::allocate(engine, { input_data_type, input_format, ts });
+    layout output_layout(output_data_type, output_format, ts);
+
+    if (input_data_type == data_types::i8) {
+        auto input_ptr = input.pointer<char>();
+        unsigned char i = 1;
+        for (auto it = input_ptr.begin(); it != input_ptr.end(); ++it)
+        {
+            *it = (i++);
+            if (i > 100) {
+                i = 1;
+            }
+        }
+    } else {
+        auto input_ptr = input.pointer<float>();
+        float i = 1.f;
+        for (auto it = input_ptr.begin(); it != input_ptr.end(); ++it)
+        {
+            *it = (i);
+            i += 1.f;
+        }
+    }
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    // run on the reference (reorder_data) kernel
+    cldnn::build_options options_ref;
+    cldnn::implementation_desc reorder_ref = { output_format, "reorder_data" };
+    options_ref.set_option(cldnn::build_option::force_implementations({ {"reorder", reorder_ref} }));
+
+    network network_ref(engine, topology, options_ref);
+    network_ref.set_input_data("input", input);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> outputs_ref;
+
+    outputs_ref = network_ref.execute();
+    cldnn::event e1 = outputs_ref.at("reorder").get_event();
+    e1.wait();
+
+    auto output_ref = outputs_ref.begin()->second.get_memory();
+    auto output_ref_ptr = output_ref.pointer<char>();
+
+    // run on the optimized kernel
+    cldnn::build_options options;
+    cldnn::implementation_desc reorder_optimized = { output_format, kernel_name };
+    options.set_option(cldnn::build_option::force_implementations({ {"reorder", reorder_optimized} }));
+
+    network network(engine, topology, options);
+    network.set_input_data("input", input);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> outputs;
+
+    outputs = network.execute();
+    cldnn::event e2 = outputs.at("reorder").get_event();
+    e2.wait();
+
+    auto output = outputs.begin()->second.get_memory();
+    auto output_ptr = output.pointer<char>();
+
+    // compare results
+    const size_t output_size = output_ref_ptr.size();
+    for (size_t i = 0; i < output_size; i++)
+    {
+        EXPECT_EQ(output_ref_ptr[i], output_ptr[i]);
+    }
+}
+
+TEST(reorder_gpu_optimization,
compare_with_ref__b_fs_yx_fsv32_to_bfyx_f32) { + // b_fs_yx_fsv32 -> bfyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfyx, 3, 64 + 5, 16 + 11, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfyx, 3, 96 - 12, 16 + 4, 3); + // b_fs_zyx_fsv32 -> bfzyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfzyx, 3, 64 + 9, 16 - 1, 2, 8); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfzyx, 2, 64 + 30, 16 + 1, 3, 4); + // incremental dims + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfzyx, 2, 64 + 4, 24 - 1, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv32, format::bfwzyx, 2, 64 + 2, 32 - 3, 4); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv32, format::bfwzyx, 1, 96 + 10, 32 - 3, 4, 3); +} + +TEST(reorder_gpu_optimization, compare_with_ref__b_fs_yx_fsv32_to_bfyx_different_datatype) { + // f32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::u8, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 8 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i64, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 16 + 2, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f16, format::b_fs_yx_fsv32, format::bfyx, 1, 64, 16 + 1, 2); + // i32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i8, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 8 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i64, format::b_fs_yx_fsv32, format::bfyx, 2, 64, 16 + 2, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::f16, format::b_fs_yx_fsv32, format::bfyx, 1, 64, 16 + 1, 2); +} + +TEST(reorder_gpu_optimization, compare_with_ref__b_fs_yx_fsv16_to_bfyx_f32) { + // u-net + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 1, 64, 388, 388); + // b_fs_yx_fsv16 -> bfyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 3, 48 + 1, 16, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 2, 32 - 1, 24 - 1, 3); + // b_fs_zyx_fsv16 -> bfzyx + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx, 5, 48 - 1, 16, 3, 8); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx, 2, 32 + 1, 24 - 1, 3, 17); + // incremental dims + 
compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx, 3, 32 - 1, 24 - 1, 3); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfwzyx, 4, 16 + 1, 32 - 3, 4); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx, 3, 16 + 2, 32 - 3, 4, 9); +} + +TEST(reorder_gpu_optimization, compare_with_ref__b_fs_yx_fsv16_to_bfyx_different_datatype) { + // f32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::u8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i32, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::i64, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::f32, data_types::f16, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + // i32 -> other types + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::u8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i8, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::i64, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::f16, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_b_fs_yx_fsv16_fsv32_to_bfyx", data_types::i32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx, 2, 32, 16 + 7, 2); +} + +TEST(reorder_gpu_optimization, compare_with_ref__bfyx_to_blocked_f32) { + // bfyx_to_b_fs_yx_fsv4 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv4, 4, 32, 16, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv4, 3, 32 + 2, 32 + 3, 4); + // bfyx_to_b_fs_yx_fsv16 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 2, 48, 8, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); + // bfyx to b_fs_yx_fsv32 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv32, 2, 64, 64, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::b_fs_yx_fsv32, 4, 32 + 6, 96 - 4, 2); + // bfyx to fs_b_yx_fsv32 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", 
data_types::f32, data_types::f32, format::bfyx, format::fs_b_yx_fsv32, 2, 64, 8, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::fs_b_yx_fsv32, 3, 64 + 5, 8 + 7, 2); + // bfzyx to b_fs_zyx_fsv16 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv16, 2, 48, 8, 4, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv16, 3, 32 + 5, 16 + 7, 2, 2); + // bfzyx to b_fs_zyx_fsv32 + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv32, 2, 64, 8, 4, 4); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::b_fs_zyx_fsv32, 3, 64 + 5, 8 + 7, 2, 2); +} + +TEST(reorder_gpu_optimization, compare_with_ref__bfyx_to_double_blocked_f32) { + // bfyx to double blocked format (bs_fs_yx_bsv16_fsv16) + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32, 48, 8, 4); // no + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32 + 2, 48, 16, 4); // b + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32, 48 + 5, 16, 4); // f + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32, 48, 48 + 3, 4); // x + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfyx, format::bs_fs_yx_bsv16_fsv16, 32 + 2, 48 + 3, 16 + 1, 4); // b-f-x + // bfzyx to double blocked format (bs_fs_zyx_bsv16_fsv16) + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32, 48, 8, 4, 16); // no + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32 + 2, 48, 16, 4, 2); // b + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32, 48 + 5, 16, 4, 3); // f + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32, 48, 48 + 3, 4, 4); // x + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f32, format::bfzyx, format::bs_fs_zyx_bsv16_fsv16, 32 + 2, 48 + 3, 16 + 1, 4, 2); // b-f-x +} + +TEST(reorder_gpu_optimization, compare_with_ref__bfyx_to_blocked_format_different_datatype) { + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::f32, data_types::f16, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::i8, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); + compare_bfyx2blocked_with_ref("reorder_data_bfyx_to_blocked_format", data_types::i64, data_types::f32, format::bfyx, format::b_fs_yx_fsv16, 3, 32 + 4, 16 + 7, 2); +} + 
+TEST(reorder_gpu_optimization, bfyx_to_fsv16_without_f_remainder) {
+    const auto& engine = get_test_engine();
+    const int32_t b_in = 1;
+    const int32_t f_in = 8 * 4;
+    const int32_t y_in = 4;
+    const int32_t x_in = 8 * 2;
+
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { b_in,f_in,x_in,y_in } });
+    layout output_layout(data_types::f32, format::b_fs_yx_fsv16, { b_in,f_in,x_in,y_in });
+
+    // Set incremental input value
+    auto input_ptr = input.pointer<float>();
+    float i = 0.f;
+    for (auto it = input_ptr.begin(); it != input_ptr.end(); ++it)
+    {
+        *it = (i++);
+    }
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "reorder");
+
+    auto output = outputs.begin()->second.get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    auto get_fsv16_index = [](int32_t /* b_size */, int32_t f_size, int32_t y_size, int32_t x_size,
+        int32_t b, int32_t f, int32_t y, int32_t x) {
+            const int32_t alignment = 16;
+            const int32_t fs = f / alignment;
+            const int32_t fsv = f % alignment;
+
+            const int32_t x_pitch = alignment;
+            const int32_t y_pitch = x_pitch * (x_size);
+            const int32_t fs_pitch = y_pitch * (y_size);
+            const int32_t b_pitch = fs_pitch * ((f_size)/alignment);
+
+            const int32_t output_offset = (b * b_pitch) + (fs * fs_pitch) + (y * y_pitch) + (x * x_pitch) + (fsv);
+
+            return output_offset;
+    };
+
+    int32_t linear_index = 0;
+    for (int32_t b = 0; b < b_in; b++) {
+        for (int32_t f = 0; f < f_in; f++) {
+            for (int32_t y = 0; y < y_in; y++) {
+                for (int32_t x = 0; x < x_in; x++) {
+                    int32_t b_fs_yx_fsv16_index = get_fsv16_index(b_in, f_in, y_in, x_in, b, f, y, x);
+                    EXPECT_FLOAT_EQ(input_ptr[linear_index++], output_ptr[b_fs_yx_fsv16_index]);
+                }
+            }
+        }
+    }
+
+}
+
+TEST(reorder_gpu_f32, basic) {
     // Input : yxfb:2x2x2x2
     // Output : bfyx:2x2x2x2
     //