diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_b_fs_zyx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_b_fs_zyx_fsv16.cpp index 16f83ac34da4e2..746ac5c33f53b5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_b_fs_zyx_fsv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_b_fs_zyx_fsv16.cpp @@ -30,6 +30,8 @@ ParamsKey DeconvolutionKernel_b_fs_zyx_fsv16::GetSupportedKey() const { k.EnableInputWeightsType(WeightsType::F32); k.EnableInputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputWeightsType(WeightsType::F16); k.EnableInputLayout(DataLayout::b_fs_yx_fsv16); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); @@ -44,6 +46,7 @@ ParamsKey DeconvolutionKernel_b_fs_zyx_fsv16::GetSupportedKey() const { k.EnableBatching(); k.EnableSubGroup(); k.EnableSubGroupShort(); + k.EnableDifferentTypes(); return k; } @@ -155,10 +158,11 @@ JitConstants DeconvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const deconvolu } jit.AddConstant(MakeJitConstant("OC_BLOCK", 16)); - if (output.GetDType() == Datatype::F32) + if (input.GetDType() == Datatype::F32) { jit.AddConstant(MakeJitConstant("DT_F32", 1)); - else + } else { jit.AddConstant(MakeJitConstant("DT_F16", 1)); + } auto mb_block = 1; auto ic_block = 16; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp index ac89b0b5167460..d44e11f311848c 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp @@ -26,6 +26,8 @@ ParamsKey DeconvolutionKernel_bfyx_opt::GetSupportedKey() const { k.EnableInputWeightsType(WeightsType::F32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); k.EnableTensorOffset(); @@ -36,6 +38,7 @@ ParamsKey DeconvolutionKernel_bfyx_opt::GetSupportedKey() const { k.EnableSplitSupport(); k.EnableDepthwiseSeparableOpt(); k.EnableGroupedConvolution(); + k.EnableDifferentTypes(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_bwd_data.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_bwd_data.cl index 12935b052f8d52..f1535269e31470 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_bwd_data.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_bwd_data.cl @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019 Intel Corporation +* Copyright 2019-2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,21 @@ * limitations under the License. 
*******************************************************************************/ -#include "ocl_types.h" #include "include/fetch.cl" #include "include/data_types.cl" +#define INPUT_TYPE8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) +#define OUTPUT_TYPE8 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) +#define FILTER_TYPE8 MAKE_VECTOR_TYPE(FILTER_TYPE, 8) + +#if DT_F16 == 1 +#define FMA_ARG_TYPE half +#define FMA_ARG_TYPE8 half8 +#else +#define FMA_ARG_TYPE INPUT0_TYPE +#define FMA_ARG_TYPE8 INPUT_TYPE8 +#endif + #if ID > 1 #define CASE_3D 1 #else @@ -31,11 +42,11 @@ __attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) // attr:no-format __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) // attr:no-format #endif KERNEL(gen9_common_conv_bwd_data_kernel)( - const __global DATA_T *diff_dst, - __global DATA_T * restrict diff_src, - const __global DATA_T *wei, + const __global INPUT0_TYPE *diff_dst, + __global OUTPUT_TYPE * restrict diff_src, + const __global FILTER_TYPE *wei, #if WITH_BIAS - const __global DATA_T *bias, + const __global BIAS_TYPE *bias, #endif #if HAS_FUSED_OPS_DECLS FUSED_OPS_DECLS, @@ -76,11 +87,11 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( diff_dst += input_offset + mb * OC_FULL * G * OD_FULL * OH_FULL * OW_FULL + g * OC * OD_FULL * OH_FULL * OW_FULL * MB_BLOCK; #if WITH_BIAS - DATA8_T blockC00 = (DATA8_T)bias[g * IC + gic * IC_BLOCK + local_id]; - DATA8_T blockC01 = (DATA8_T)bias[g * IC + gic * IC_BLOCK + local_id]; + INPUT_TYPE8 blockC00 = (INPUT_TYPE8)bias[g * IC + gic * IC_BLOCK + local_id]; + INPUT_TYPE8 blockC01 = (INPUT_TYPE8)bias[g * IC + gic * IC_BLOCK + local_id]; #else - DATA8_T blockC00 = 0.0f; - DATA8_T blockC01 = 0.0f; + INPUT_TYPE8 blockC00 = INPUT0_VAL_ZERO; + INPUT_TYPE8 blockC01 = INPUT0_VAL_ZERO; #endif wei += gic * KD * KH * KW * OC_BLOCK * IC_BLOCK @@ -111,13 +122,13 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( #endif if (oh >= OH || ow >= OW) continue; - const __global DATA_T *diff_dst1 = diff_dst + const __global INPUT0_TYPE *diff_dst1 = 
diff_dst + ow * OC_BLOCK * MB_BLOCK + oh * OW_FULL * OC_BLOCK * MB_BLOCK; #if CASE_3D diff_dst1 += od * OH_FULL * OW_FULL * OC_BLOCK * MB_BLOCK; #endif - const __global DATA_T *wei1 = wei + const __global FILTER_TYPE *wei1 = wei #if CASE_3D + kd * KH * KW * OC_BLOCK * IC_BLOCK #endif @@ -148,44 +159,30 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( #if SW != 1 || SH != 1 || SD != 1 || PH != 0 || PW != 0 || PD != 0 if (do_ker) { #endif - const __global DATA_T *diff_dst1 = diff_dst + const __global INPUT0_TYPE *diff_dst1 = diff_dst + ow * OC_BLOCK * MB_BLOCK + oh * OW_FULL * OC_BLOCK * MB_BLOCK; #if CASE_3D diff_dst1 += od * OH_FULL * OW_FULL * OC_BLOCK * MB_BLOCK; #endif - const __global DATA_T *wei1 = wei; + const __global FILTER_TYPE *wei1 = wei; #endif -#define LOAD_DIFF_DST(_block, _diff_dst, mb_chunk) \ - { \ - (_block) = AS_DATA8_T( \ - BLOCK_READ8((const __global BLOCK_DATA_T *)((_diff_dst) \ - + (mb_chunk)*OC_BLOCK))); \ - } - -#define SAVE_SRC_DIFF(_block, _diff_src, mb_chunk) \ - { \ - BLOCK_WRITE8((const __global BLOCK_DATA_T *)(&( \ - _diff_src)[(mb_chunk)*IC_BLOCK]), \ - AS_BLOCK_DATA8_T((_block))); \ - } - #if DT_F32 #define TRANSPOSE_8(_block, _col) \ - (DATA8_T)(intel_sub_group_shuffle(_block, _col)) + (intel_sub_group_shuffle(_block, _col)) #else #define TRANSPOSE_8(_block, _col) \ - (DATA8_T)(intel_sub_group_shuffle(_block[0], _col), \ - intel_sub_group_shuffle(_block[1], _col), \ - intel_sub_group_shuffle(_block[2], _col), \ - intel_sub_group_shuffle(_block[3], _col), \ - intel_sub_group_shuffle(_block[4], _col), \ - intel_sub_group_shuffle(_block[5], _col), \ - intel_sub_group_shuffle(_block[6], _col), \ - intel_sub_group_shuffle(_block[7], _col)) + (intel_sub_group_shuffle(_block[0], _col), \ + intel_sub_group_shuffle(_block[1], _col), \ + intel_sub_group_shuffle(_block[2], _col), \ + intel_sub_group_shuffle(_block[3], _col), \ + intel_sub_group_shuffle(_block[4], _col), \ + intel_sub_group_shuffle(_block[5], _col), \ + 
intel_sub_group_shuffle(_block[6], _col), \ + intel_sub_group_shuffle(_block[7], _col)) #endif -#define FMA8(a, b, c) fma((DATA8_T)(a), (DATA8_T)b, (DATA8_T)c) +#define FMA8(a, b, c) fma((FMA_ARG_TYPE8)(a), (FMA_ARG_TYPE8)b, (FMA_ARG_TYPE8)c) #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB, _blockB1) \ { \ @@ -207,14 +204,10 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( _result = FMA8(_blockB1.s7, TRANSPOSE_8(_blockA, 15), _result); \ } - DATA8_T blockA0, blockA1; - LOAD_DIFF_DST(blockA0, diff_dst1, 0); - LOAD_DIFF_DST(blockA1, diff_dst1, 8); - DATA8_T blockB00 = AS_DATA8_T( - BLOCK_READ8((const __global BLOCK_DATA_T *)wei1)); - DATA8_T blockB01 = AS_DATA8_T( - BLOCK_READ8((const __global BLOCK_DATA_T *)(wei1 - + 8 * IC_BLOCK))); + INPUT_TYPE8 blockA0 = DT_INPUT_BLOCK_READ8(diff_dst1, 0); + INPUT_TYPE8 blockA1 = DT_INPUT_BLOCK_READ8(diff_dst1, 8 * OC_BLOCK); + FILTER_TYPE8 blockB00 = DT_FILTER_BLOCK_READ8(wei1, 0); + FILTER_TYPE8 blockB01 = DT_FILTER_BLOCK_READ8(wei1, 8 * IC_BLOCK); MULTIPLY_BLOCKS_8x8(blockC00, blockA0, blockB00, blockB01); MULTIPLY_BLOCKS_8x8(blockC01, blockA1, blockB00, blockB01); @@ -232,7 +225,7 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( ocb += OC_BLOCK; } while (ocb < OC); - __global DATA_T *src_write0 = diff_src + OUTPUT_OFFSET + mb * IC_FULL * G * ID_FULL * IH_FULL * IW_FULL + __global OUTPUT_TYPE *src_write0 = diff_src + OUTPUT_OFFSET + mb * IC_FULL * G * ID_FULL * IH_FULL * IW_FULL + gic * ID_FULL * IH_FULL * IW_FULL * IC_BLOCK * MB_BLOCK + g * IC * ID_FULL * IH_FULL * IW_FULL * MB_BLOCK + id * IH_FULL * IW_FULL * IC_BLOCK * MB_BLOCK + ih * IW_FULL * IC_BLOCK * MB_BLOCK @@ -240,20 +233,24 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( blockC00 = ACTIVATION(blockC00, ACTIVATION_PARAMS); blockC01 = ACTIVATION(blockC01, ACTIVATION_PARAMS); + OUTPUT_TYPE8 res0, res1; #if HAS_FUSED_OPS { FUSED_OPS_BLOCK_C00; - blockC00 = FUSED_OPS_RESULT_BLOCK_C00; + res0 = FUSED_OPS_RESULT_BLOCK_C00; } { FUSED_OPS_BLOCK_C01; - blockC01 = 
FUSED_OPS_RESULT_BLOCK_C01; + res1 = FUSED_OPS_RESULT_BLOCK_C01; } +#else + res0 = blockC00; + res1 = blockC01; #endif - SAVE_SRC_DIFF(blockC00, src_write0, 0); - SAVE_SRC_DIFF(blockC01, src_write0, 8); + DT_OUTPUT_BLOCK_WRITE8(src_write0, 0, res0); + DT_OUTPUT_BLOCK_WRITE8(src_write0, 8 * IC_BLOCK, res1); #endif #if VER_8OW16C == 1 @@ -278,7 +275,7 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( const int iw = (ihw % IWB) * IW_BLOCK; diff_dst += input_offset + mb * OC_FULL * G * OD_FULL * OH_FULL * OW_FULL + g * OC * OD_FULL * OH_FULL * OW_FULL * MB_BLOCK; - DATA_T blockC00[IW_BLOCK] = {0.0f}; + INPUT0_TYPE blockC00[IW_BLOCK] = {INPUT0_VAL_ZERO}; #if WITH_BIAS for (int i = 0; i < IW_BLOCK; i++) @@ -307,12 +304,12 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( oh /= SH; if (oh >= OH) continue; - const __global DATA_T *diff_dst1 + const __global INPUT0_TYPE *diff_dst1 = diff_dst + oh * OW_FULL * OC_BLOCK * MB_BLOCK; #if CASE_3D diff_dst1 += od * OH_FULL * OW_FULL * OC_BLOCK * MB_BLOCK; #endif - const __global DATA_T *wei1 = wei + const __global FILTER_TYPE *wei1 = wei #if CASE_3D + kd * KH * KW * OC_BLOCK * IC_BLOCK #endif @@ -341,21 +338,21 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( #if SW != 1 || SH != 1 || SD != 1 || PH != 0 || PW != 0 || PD != 0 if (do_ker) { #endif - const __global DATA_T *diff_dst1 + const __global INPUT0_TYPE *diff_dst1 = diff_dst + oh * OW_FULL * OC_BLOCK * MB_BLOCK; #if CASE_3D diff_dst1 += od * OH_FULL * OW_FULL * OC_BLOCK * MB_BLOCK; #endif - const __global DATA_T *wei1 = wei; + const __global FILTER_TYPE *wei1 = wei; #endif int ocb = 0; do { #define TRANSPOSE_1(_block, _col) \ - (DATA_T)(intel_sub_group_shuffle(_block, _col)) + (intel_sub_group_shuffle(_block, _col)) -#define FMA1(a, b, c) fma((DATA_T)(a), (DATA_T)b, (DATA_T)c) +#define FMA1(a, b, c) fma((FMA_ARG_TYPE)(a), (FMA_ARG_TYPE)b, (FMA_ARG_TYPE)c) #define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB, _blockB1) \ { \ @@ -377,12 +374,9 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( 
_result = FMA1(_blockB1.s7, TRANSPOSE_1(_blockA, 15), _result); \ } - DATA8_T blockB00 = AS_DATA8_T( - BLOCK_READ8((const __global BLOCK_DATA_T *)wei1)); - DATA8_T blockB01 = AS_DATA8_T( - BLOCK_READ8((const __global BLOCK_DATA_T *)(wei1 - + 8 * IC_BLOCK))); - DATA_T blockA[IW_BLOCK]; + FILTER_TYPE8 blockB00 = DT_FILTER_BLOCK_READ8(wei1, 0); + FILTER_TYPE8 blockB01 = DT_FILTER_BLOCK_READ8(wei1, 8 * IC_BLOCK); + INPUT0_TYPE blockA[IW_BLOCK]; __attribute__(( opencl_unroll_hint(IW_BLOCK))) // attr:no-format @@ -407,9 +401,7 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( blockA[i] = 0.0; continue; } - blockA[i] = AS_DATA_T( - BLOCK_READ((const __global BLOCK_DATA_T *)(&( - diff_dst1)[ow * OC_BLOCK]))); + blockA[i] = DT_INPUT_BLOCK_READ(diff_dst1, ow * OC_BLOCK); } __attribute__(( @@ -434,7 +426,7 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( #endif #endif - __global DATA_T *src_write0 = diff_src + output_offset + mb * IC_FULL * G * ID_FULL * IH_FULL * IW_FULL + __global OUTPUT_TYPE *src_write0 = diff_src + output_offset + mb * IC_FULL * G * ID_FULL * IH_FULL * IW_FULL + gic * ID_FULL * IH_FULL * IW_FULL * IC_BLOCK * MB_BLOCK + g * IC * ID_FULL * IH_FULL * IW_FULL * MB_BLOCK + id * IH_FULL * IW_FULL * IC_BLOCK * MB_BLOCK + ih * IW_FULL * IC_BLOCK * MB_BLOCK @@ -443,12 +435,14 @@ KERNEL(gen9_common_conv_bwd_data_kernel)( for (int i = 0; i < IW_BLOCK; i++) { blockC00[i] = ACTIVATION(blockC00[i], ACTIVATION_PARAMS); if (iw + i >= IW) continue; + OUTPUT_TYPE res; #if HAS_FUSED_OPS FUSED_OPS_BLOCK_CI; - blockC00[i] = FUSED_OPS_RESULT_BLOCK_CI; + res = FUSED_OPS_RESULT_BLOCK_CI; +#else + res = blockC00[i]; #endif - BLOCK_WRITE((__global BLOCK_DATA_T *)(&(src_write0)[i * IC_BLOCK]), - AS_BLOCK_DATA_T(blockC00[i])); + DT_OUTPUT_BLOCK_WRITE(src_write0, i * IC_BLOCK, res); } #endif } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/ocl_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/ocl_types.h deleted file mode 100644 index 
332e9551cee0f6..00000000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/ocl_types.h +++ /dev/null @@ -1,444 +0,0 @@ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -// #include "ocl_math_utils.h" - -#define for_ for - -#define CONCAt2(a, b) a##b -#define CONCAT2(a, b) CONCAt2(a, b) - -#if DT_F32 == 1 -#define DATA_T float -#define DATA8_T float8 -#define DATA_MAX FLT_MAX -#define DATA_MIN -DATA_MAX -#define DATA_ZERO 0.0f -#define DATA_ONE 1.0f -#define DEF_ACC_DATA_T float -#define DEF_ACC_DATA8_T float8 -#define POST_OP_DATA_T float -#define TO_DATA_T(v) static_cast(v) -#define TO_DEF_ACC_DATA_T(v) static_cast(v) -#define DATA_TO_REF convert_float -#define CONVERT_DATA_T convert_float -#define CONVERT_DATA8_T convert_float8 -#define CONVERT_FLOAT_T convert_float -#define CONVERT_FLOAT8_T convert_float8 -#define ROUND - -#define BLOCK_READ intel_sub_group_block_read -#define BLOCK_WRITE intel_sub_group_block_write -#define BLOCK_READ8 intel_sub_group_block_read8 -#define BLOCK_WRITE8 intel_sub_group_block_write8 - -#define AS_DATA_T as_float -#define AS_DATA8_T as_float8 - -#define AS_UINT_T as_uint -#define AS_UINT8_T as_uint8 - -#define BLOCK_DATA_T uint -#define BLOCK_DATA8_T uint8 -#define AS_BLOCK_DATA_T as_uint -#define 
AS_BLOCK_DATA8_T as_uint8 -#elif DT_F16 == 1 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define DATA_T half -#define DATA8_T half8 -#define DATA_MAX HALF_MAX -#define DATA_MIN -DATA_MAX -#define DATA_ZERO 0.0h -#define DATA_ONE 1.0h -#define DEF_ACC_DATA_T half -#define DEF_ACC_DATA8_T half8 -#define POST_OP_DATA_T half -#define TO_DATA_T(v) (half)(v) -#define TO_DEF_ACC_DATA_T(v) (half)(v) -#define DATA_TO_REF convert_half -#define CONVERT_DATA_T convert_half -#define CONVERT_DATA8_T convert_half8 -#define CONVERT_FLOAT_T convert_float -#define CONVERT_FLOAT8_T convert_float8 -#define ROUND - -#define BLOCK_READ intel_sub_group_block_read_us -#define BLOCK_WRITE intel_sub_group_block_write_us -#define BLOCK_READ8 intel_sub_group_block_read_us8 -#define BLOCK_WRITE8 intel_sub_group_block_write_us8 -#define AS_DATA_T as_half -#define AS_DATA8_T as_half8 - -#define AS_UINT_T as_ushort -#define AS_UINT8_T as_ushort8 - -#define BLOCK_DATA_T ushort -#define BLOCK_DATA8_T ushort8 -#define AS_BLOCK_DATA_T as_ushort -#define AS_BLOCK_DATA8_T as_ushort8 -#elif DT_BF16 == 1 -#define DATA_T ushort -#define POST_OP_DATA_T float -#define DATA8_T ushort8 -#define DATA_MAX 3.38953138925153547590470800371487866880e+38F -#define DATA_MIN (-DATA_MAX) -#define DATA_ZERO 0.0f -#define DATA_ONE 1.0f -#define DEF_ACC_DATA_T float -#define DEF_ACC_DATA8_T float8 -#define TO_DATA_T(v) convert_f32_to_bf16(v) -#define TO_DEF_ACC_DATA_T(v) convert_bf16_to_f32(v) -#define DATA_TO_REF convert_bf16_to_f32 -#define CONVERT_DATA_T convert_f32_to_bf16 -#define CONVERT_DATA8_T convert_f32_to_bf16_vec8 -#define CONVERT_FLOAT_T convert_bf16_to_f32 -#define CONVERT_FLOAT8_T convert_bf16_to_f32_vec8 -#define ROUND - -#define BLOCK_READ intel_sub_group_block_read_us -#define BLOCK_WRITE intel_sub_group_block_write_us -#define BLOCK_READ8 intel_sub_group_block_read_us8 -#define BLOCK_WRITE8 intel_sub_group_block_write_us8 -#define AS_DATA_T as_ushort -#define AS_DATA8_T as_ushort8 - -#define 
AS_UINT_T as_ushort -#define AS_UINT8_T as_ushort8 - -#define BLOCK_DATA_T ushort -#define BLOCK_DATA8_T ushort8 -#define AS_BLOCK_DATA_T as_ushort -#define AS_BLOCK_DATA8_T as_ushort8 -#elif DT_S8 == 1 -#define DATA_T char -#define DATA8_T char8 -#define DATA_MAX CHAR_MAX -#define DATA_MIN CHAR_MIN -#define DATA_ZERO 0 -#define DATA_ONE 1 -#define DEF_ACC_DATA_T int -#define DEF_ACC_DATA8_T int8 -#define POST_OP_DATA_T float -#define TO_DATA_T(v) static_cast(v) -#define DATA_TO_REF convert_char -#define CONVERT_DATA_T convert_char -#define CONVERT_DATA8_T convert_char8 -#define ROUND rint - -#define BLOCK_READ intel_sub_group_block_read_uc -#define BLOCK_WRITE intel_sub_group_block_write_uc -#define BLOCK_READ8 intel_sub_group_block_read_uc8 -#define BLOCK_WRITE8 intel_sub_group_block_write_uc8 -#define AS_DATA_T as_char -#define AS_DATA8_T as_char8 - -#define AS_UINT_T as_uchar -#define AS_UINT8_T as_uchar8 - -#define BLOCK_DATA_T uchar -#define BLOCK_DATA8_T uchar8 -#define AS_BLOCK_DATA_T as_uchar -#define AS_BLOCK_DATA8_T as_uchar8 -#elif DT_U8 == 1 -#define DATA_T uchar -#define DATA8_T uchar8 -#define DATA_MAX UCHAR_MAX -#define DATA_MIN 0 -#define DATA_ZERO 0 -#define DATA_ONE 1 -#define DEF_ACC_DATA_T int -#define DEF_ACC_DATA8_T int8 -#define POST_OP_DATA_T float -#define TO_DATA_T(v) (uchar)(v) -#define DATA_TO_REF convert_uchar -#define CONVERT_DATA_T convert_uchar -#define CONVERT_DATA8_T convert_uchar8 -#define ROUND rint - -#define BLOCK_READ intel_sub_group_block_read_uc -#define BLOCK_WRITE intel_sub_group_block_write_uc -#define BLOCK_READ8 intel_sub_group_block_read_uc8 -#define BLOCK_WRITE8 intel_sub_group_block_write_uc8 -#define AS_DATA_T as_uchar -#define AS_DATA8_T as_uchar8 - -#define AS_UINT_T as_uchar -#define AS_UINT8_T as_uchar8 - -#define BLOCK_DATA_T uchar -#define BLOCK_DATA8_T uchar8 -#define AS_BLOCK_DATA_T as_uchar -#define AS_BLOCK_DATA8_T as_uchar8 -#elif DT_S32 == 1 -#define DATA_T int -#define CONVERT_DATA_T 
convert_int_sat_rte -#define POST_OP_DATA_T float -#elif !defined(DT_UNDEF) -#error "Unexpected data type" -#endif - -#if VECT_DT_N == 1 -#define VECT_DATA_T DATA_T -#define VECT_DEF_ACC_DATA_T DEF_ACC_DATA_T -#define AS_VECT_DATA_T AS_DATA_T -#define VECT_BLOCK_READ BLOCK_READ -#define VECT_BLOCK_WRITE BLOCK_WRITE -#define VECT_UINT_READ intel_sub_group_block_read -#define VECT_UINT_WRITE intel_sub_group_block_write -#define VECT_BLOCK_DATA_T BLOCK_DATA_T -#define AS_VECT_BLOCK_DATA_T AS_BLOCK_DATA_T -#define CONVERT_VECT_FLOAT_T CONVERT_FLOAT_T -#define CONVERT_VECTOR_DATA_T CONVERT_DATA_T -#define VECT_INT_T int -#define VECT_UINT_T uint -#define VECT_FLOAT_T float -#define AS_VECT_INT_T as_int -#define AS_VECT_UINT_T as_uint -#elif VECT_DT_N == 8 -#define VECT_DATA_T DATA8_T -#define VECT_DEF_ACC_DATA_T DEF_ACC_DATA8_T -#define AS_VECT_DATA_T AS_DATA8_T -#define VECT_BLOCK_READ BLOCK_READ8 -#define VECT_BLOCK_WRITE BLOCK_WRITE8 -#define VECT_UINT_READ intel_sub_group_block_read8 -#define VECT_UINT_WRITE intel_sub_group_block_write8 -#define VECT_BLOCK_DATA_T BLOCK_DATA8_T -#define AS_VECT_BLOCK_DATA_T AS_BLOCK_DATA8_T -#define CONVERT_VECT_FLOAT_T CONVERT_FLOAT8_T -#define CONVERT_VECTOR_DATA_T CONVERT_DATA8_T -#define VECT_INT_T int8 -#define VECT_UINT_T uint8 -#define VECT_FLOAT_T float8 -#define AS_VECT_INT_T as_int8 -#define AS_VECT_UINT_T as_uint8 -#endif - -#ifdef SRC_DATA_T -#define SRC_DATA8_T CONCAT2(SRC_DATA_T, 8) -#if SRC_DT_BF16 -#define SRC_TO_REF(x) convert_bf16_to_f32(x) -#define SRC_TO_REF8(x) convert_bf16_to_f32_vec8(x) -#else -#define SRC_TO_REF(x) (x) -#define SRC_TO_REF8(x) (x) -#endif -#if SRC_DT_BF16 -#define TO_SRC(x) convert_f32_to_bf16(x) -#elif SRC_DT_U8 -#define TO_SRC(x) convert_uchar_sat_rte(x) -#elif SRC_DT_S8 -#define TO_SRC(x) convert_char_sat_rte(x) -#elif SRC_DT_S32 -#define TO_SRC(x) convert_int_sat_rte(x) -#else -#define TO_SRC(x) (x) -#endif -#endif - -#ifdef WEI_DATA_T -#if WEI_DT_BF16 -#define WEI_TO_REF(x) 
convert_bf16_to_f32(x) -#define REF_TO_WEI(x) convert_f32_to_bf16(x) -#else -#define WEI_TO_REF(x) (x) -#define REF_TO_WEI(x) (x) -#endif -#if WEI_DT_BF16 -#define TO_WEI(x) convert_f32_to_bf16(x) -#elif WEI_DT_U8 -#define TO_WEI(x) convert_uchar_sat_rte(x) -#elif WEI_DT_S8 -#define TO_WEI(x) convert_char_sat_rte(x) -#elif WEI_DT_S32 -#define TO_WEI(x) convert_int_sat_rte(x) -#else -#define TO_WEI(x) (x) -#endif -#endif - -#ifdef BIA_DATA_T -#if BIA_DT_BF16 -#define BIA_TO_REF(x) convert_bf16_to_f32(x) -#define REF_TO_BIA(x) convert_f32_to_bf16(x) -#else -#define BIA_TO_REF(x) (x) -#define REF_TO_BIA(x) (x) -#endif -#if BIA_DT_BF16 -#define TO_BIA(x) convert_f32_to_bf16(x) -#elif BIA_DT_U8 -#define TO_BIA(x) convert_uchar_sat_rte(x) -#elif BIA_DT_S8 -#define TO_BIA(x) convert_char_sat_rte(x) -#elif BIA_DT_S32 -#define TO_BIA(x) convert_int_sat_rte(x) -#else -#define TO_BIA(x) (x) -#endif -#endif - -#ifdef DST_DATA_T -#define DST_DATA8_T CONCAT2(DST_DATA_T, 8) -#if DST_DT_BF16 -#define DST_TO_REF(x) convert_bf16_to_f32(x) -#define DST_TO_REF8(x) convert_bf16_to_f32_vec8(x) -#define REF_TO_DST(x) convert_f32_to_bf16(x) -#define REF_TO_DST8(x) convert_f32_to_bf16_vec8(convert_float8(x)) -#else -#define DST_TO_REF(x) (x) -#define DST_TO_REF8(x) (x) -#define REF_TO_DST(x) (x) -#define REF_TO_DST8(x) (x) -#endif -#if DST_DT_BF16 -#define TO_DST(x) convert_f32_to_bf16(x) -#define TO_DST8(x) convert_f32_to_bf16_vec8(convert_float8(x)) -#elif DST_DT_F16 -#define TO_DST(x) convert_half(x) -#define TO_DST8(x) convert_half8(x) -#elif DST_DT_U8 -#define TO_DST(x) convert_uchar_sat_rte(x) -#define TO_DST8(x) convert_uchar8_sat_rte(x) -#elif DST_DT_S8 -#define TO_DST(x) convert_char_sat_rte(x) -#define TO_DST8(x) convert_char8_sat_rte(x) -#elif DST_DT_S32 -#define TO_DST(x) convert_int_sat_rte(x) -#define TO_DST8(x) convert_int8_sat_rte(x) -#elif DST_DT_F32 -#define TO_DST(x) convert_float(x) -#define TO_DST8(x) convert_float8(x) -#else -#error "Not expected" -#endif -#endif - 
-#ifdef ACC_DATA_T -#if ACC_DT_F16 -#define TO_ACC(x) convert_half(x) -#elif ACC_DT_F32 -#define TO_ACC(x) convert_float(x) -#elif ACC_DT_S32 -#define TO_ACC(x) convert_int(x) -#else -#error "Unexpected accumulation data type" -#endif -#endif - -#define OFF_MD(prefix, x0, x1, x2, x3, x4, x5) \ - ((x0 / prefix##_B0_2) / prefix##_B0_1 * prefix##_S0_0) \ - + ((x0 / prefix##_B0_2) % prefix##_B0_1 * prefix##_S0_1) \ - + ((x0 % prefix##_B0_2) * prefix##_S0_2) \ - + ((x1 / prefix##_B1_2) / prefix##_B1_1 * prefix##_S1_0) \ - + ((x1 / prefix##_B1_2) % prefix##_B1_1 * prefix##_S1_1) \ - + ((x1 % prefix##_B1_2) * prefix##_S1_2) \ - + ((x2 / prefix##_B2_2) / prefix##_B2_1 * prefix##_S2_0) \ - + ((x2 / prefix##_B2_2) % prefix##_B2_1 * prefix##_S2_1) \ - + ((x2 % prefix##_B2_2) * prefix##_S2_2) \ - + ((x3 / prefix##_B3_2) / prefix##_B3_1 * prefix##_S3_0) \ - + ((x3 / prefix##_B3_2) % prefix##_B3_1 * prefix##_S3_1) \ - + ((x3 % prefix##_B3_2) * prefix##_S3_2) \ - + ((x4 / prefix##_B4_2) / prefix##_B4_1 * prefix##_S4_0) \ - + ((x4 / prefix##_B4_2) % prefix##_B4_1 * prefix##_S4_1) \ - + ((x4 % prefix##_B4_2) * prefix##_S4_2) \ - + ((x5 / prefix##_B5_2) / prefix##_B5_1 * prefix##_S5_0) \ - + ((x5 / prefix##_B5_2) % prefix##_B5_1 * prefix##_S5_1) \ - + ((x5 % prefix##_B5_2) * prefix##_S5_2) - -#if NDIMS == 3 -#define SRC_OFF(x0, x1, d, h, x2) \ - (((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ - + ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ - + ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2) - -#if WITH_GROUPS == 1 -#define WHT_OFF(x0, x1, x2, d, h, x3) \ - (((x0) % WHT_B0) * WHT_SB0 + ((x0) / WHT_B0) * WHT_S0 \ - + ((x1) % WHT_B1) * WHT_SB1 + ((x1) / WHT_B1) * WHT_S1 \ - + ((x2) % WHT_B2) * WHT_SB2 + ((x2) / WHT_B2) * WHT_S2 \ - + ((x3) % WHT_B3) * WHT_SB3 + ((x3) / WHT_B3) * WHT_S3) -#else -#define WHT_OFF(g, x0, x1, d, h, x2) \ - (((x0) % WHT_B0) * WHT_SB0 + ((x0) / WHT_B0) * WHT_S0 \ - + ((x1) % WHT_B1) * WHT_SB1 + ((x1) / WHT_B1) * WHT_S1 \ - + ((x2) % 
WHT_B2) * WHT_SB2 + ((x2) / WHT_B2) * WHT_S2) -#endif - -#define DST_OFF(x0, x1, d, h, x2) \ - (((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ - + ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ - + ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2) -#elif NDIMS == 4 -#define SRC_OFF(x0, x1, d, x2, x3) \ - (((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ - + ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ - + ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2 \ - + ((x3) % SRC_B3) * SRC_SB3 + ((x3) / SRC_B3) * SRC_S3) - -#if WITH_GROUPS == 1 -#define WHT_OFF(x0, x1, x2, d, x3, x4) \ - (((x0) % WHT_B0) * WHT_SB0 + ((x0) / WHT_B0) * WHT_S0 \ - + ((x1) % WHT_B1) * WHT_SB1 + ((x1) / WHT_B1) * WHT_S1 \ - + ((x2) % WHT_B2) * WHT_SB2 + ((x2) / WHT_B2) * WHT_S2 \ - + ((x3) % WHT_B3) * WHT_SB3 + ((x3) / WHT_B3) * WHT_S3 \ - + ((x4) % WHT_B4) * WHT_SB4 + ((x4) / WHT_B4) * WHT_S4) -#else -#define WHT_OFF(g, x1, x2, d, x3, x4) \ - (((x1) % WHT_B0) * WHT_SB0 + ((x1) / WHT_B0) * WHT_S0 \ - + ((x2) % WHT_B1) * WHT_SB1 + ((x2) / WHT_B1) * WHT_S1 \ - + ((x3) % WHT_B2) * WHT_SB2 + ((x3) / WHT_B2) * WHT_S2 \ - + ((x4) % WHT_B3) * WHT_SB3 + ((x4) / WHT_B3) * WHT_S3) -#endif - -#define DST_OFF(x0, x1, d, x2, x3) \ - (((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ - + ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ - + ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2 \ - + ((x3) % DST_B3) * DST_SB3 + ((x3) / DST_B3) * DST_S3) -#elif NDIMS == 5 -#define SRC_OFF(x0, x1, x2, x3, x4) \ - (((x0) % SRC_B0) * SRC_SB0 + ((x0) / SRC_B0) * SRC_S0 \ - + ((x1) % SRC_B1) * SRC_SB1 + ((x1) / SRC_B1) * SRC_S1 \ - + ((x2) % SRC_B2) * SRC_SB2 + ((x2) / SRC_B2) * SRC_S2 \ - + ((x3) % SRC_B3) * SRC_SB3 + ((x3) / SRC_B3) * SRC_S3 \ - + ((x4) % SRC_B4) * SRC_SB4 + ((x4) / SRC_B4) * SRC_S4) - -#if WITH_GROUPS == 1 -#define WHT_OFF(x0, x1, x2, x3, x4, x5) \ - (((x0) % WHT_B0) * WHT_SB0 + ((x0) / WHT_B0) * WHT_S0 \ - + ((x1) % WHT_B1) * WHT_SB1 + ((x1) / 
WHT_B1) * WHT_S1 \ - + ((x2) % WHT_B2) * WHT_SB2 + ((x2) / WHT_B2) * WHT_S2 \ - + ((x3) % WHT_B3) * WHT_SB3 + ((x3) / WHT_B3) * WHT_S3 \ - + ((x4) % WHT_B4) * WHT_SB4 + ((x4) / WHT_B4) * WHT_S4 \ - + ((x5) % WHT_B5) * WHT_SB5 + ((x5) / WHT_B5) * WHT_S5) -#else -#define WHT_OFF(g, x1, x2, x3, x4, x5) \ - (((x1) % WHT_B0) * WHT_SB0 + ((x1) / WHT_B0) * WHT_S0 \ - + ((x2) % WHT_B1) * WHT_SB1 + ((x2) / WHT_B1) * WHT_S1 \ - + ((x3) % WHT_B2) * WHT_SB2 + ((x3) / WHT_B2) * WHT_S2 \ - + ((x4) % WHT_B3) * WHT_SB3 + ((x4) / WHT_B3) * WHT_S3 \ - + ((x5) % WHT_B4) * WHT_SB4 + ((x5) / WHT_B4) * WHT_S4) -#endif - -#define DST_OFF(x0, x1, x2, x3, x4) \ - (((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ - + ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1 \ - + ((x2) % DST_B2) * DST_SB2 + ((x2) / DST_B2) * DST_S2 \ - + ((x3) % DST_B3) * DST_SB3 + ((x3) / DST_B3) * DST_S3 \ - + ((x4) % DST_B4) * DST_SB4 + ((x4) / DST_B4) * DST_S4) -#endif - diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index 32efb50d630c7f..85b6d03382fa57 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -676,11 +676,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { (input_data.get_dependency(0).get_output_layout().data_type == data_types::u8 || input_data.get_dependency(0).get_output_layout().data_type == data_types::i8); - should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt() && - // fp16/fp32 optimized kernels don't support chaning data type - (input_data.get_dependency(0).get_output_layout().data_type == data_types::u8 || - input_data.get_dependency(0).get_output_layout().data_type == data_types::i8 || - input_data.get_output_layout().data_type == out_layout.data_type); + should_fuse 
|= input_data.is_type() && quantize_node.get_scale_shift_opt(); should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt(); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index 566bf119e4612c..6d043cf5d5a6ed 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -445,6 +445,17 @@ class ConvEltwTest : public ::BaseFusingTest { network_not_fused.set_input_data("input", input_prim); compare(network_not_fused, network_fused, p); + auto find_prim = [](primitive_info& p) -> bool { + // Add more ids when needed + if (p.original_id == "deconv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_prim); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; } layout get_input_layout(conv_eltw_test_params& p) { @@ -4333,23 +4344,23 @@ TEST_P(deconv_scale_actv_quant_i8, basic) { INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ - deconv_test_params{ CASE_DECONV_FP32_1, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_2, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_4, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_5, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_6, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_7, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_8, 3, 5 }, + deconv_test_params{ CASE_DECONV_FP32_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_6, 2, 5 }, + 
deconv_test_params{ CASE_DECONV_FP32_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_8, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_1, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_2, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_4, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_5, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_6, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_7, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_8, 3, 5 }, + deconv_test_params{ CASE_DECONV_FP16_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_8, 2, 5 }, deconv_test_params{ CASE_DECONV_U8S8_1, 2, 5 }, deconv_test_params{ CASE_DECONV_U8S8_2, 2, 5 }, @@ -4369,26 +4380,26 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_i8, deconv_test_params{ CASE_DECONV_S8S8_7, 2, 5 }, deconv_test_params{ CASE_DECONV_S8S8_8, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_1, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_2, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_4, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_5, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_6, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_7, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_8, 3, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 5 }, + deconv_test_params{ 
CASE_DECONV_FP32_3D_8, 2, 5 }, // FIXME no quantize implementation for bs_fs_yx_bsv16_fsv16 format AND add_required_reorders pass completely ruins data types // add_required_reorders pass tries to reorder everything to output type if no format exists, this ruins fp32 -> int8 quantize //deconv_test_params{ CASE_DECONV_FP32_3D_9, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_1, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_4, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_5, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_6, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_7, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_8, 3, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 5 }, //deconv_test_params{ CASE_DECONV_FP16_3D_9, 3, 5 }, deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 5 }, @@ -4444,23 +4455,23 @@ TEST_P(deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, basic) { INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ - deconv_test_params{ CASE_DECONV_FP32_1, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_2, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_4, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_5, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_6, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_7, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_8, 4, 9 }, - - deconv_test_params{ CASE_DECONV_FP16_1, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_2, 
4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_4, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_5, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_6, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_7, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_8, 4, 9 }, + deconv_test_params{ CASE_DECONV_FP32_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_8, 2, 9 }, + + deconv_test_params{ CASE_DECONV_FP16_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_8, 2, 9 }, deconv_test_params{ CASE_DECONV_U8S8_1, 2, 9 }, deconv_test_params{ CASE_DECONV_U8S8_2, 2, 9 }, @@ -4480,24 +4491,24 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_ deconv_test_params{ CASE_DECONV_S8S8_7, 2, 9 }, deconv_test_params{ CASE_DECONV_S8S8_8, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_1, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_2, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_4, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_5, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_6, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_7, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_8, 4, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 9 }, + 
deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 9 }, // deconv_test_params{ CASE_DECONV_FP32_3D_9, 6, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_1, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_4, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_5, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_6, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_7, 4, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_8, 4, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 9 }, // deconv_test_params{ CASE_DECONV_FP16_3D_9, 6, 9 }, deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 9 }, @@ -4548,14 +4559,14 @@ TEST_P(deconv_scale_activation_quantize_i8_eltwise_quantize_u8, basic) { INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_activation_quantize_i8_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_1, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_2, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_3, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_4, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_5, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_6, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_7, 4, 7}, - conv_eltw_test_params{CASE_DECONV_ELTW_FP32_8, 4, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_1, 2, 7}, + 
conv_eltw_test_params{CASE_DECONV_ELTW_FP32_2, 2, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_3, 2, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_4, 2, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_5, 2, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_6, 2, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_7, 2, 7}, + conv_eltw_test_params{CASE_DECONV_ELTW_FP32_8, 2, 7}, conv_eltw_test_params{CASE_DECONV_ELTW_i8_1, 2, 7}, conv_eltw_test_params{CASE_DECONV_ELTW_i8_2, 2, 7},