From 20ef9a94236be4f38f21cc84818d38fc04348891 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Wed, 3 Jun 2020 13:42:15 +0300 Subject: [PATCH 01/12] [IE CLDNN] Improve kernel selection for b_fs_yx_fsv16 layout and optimize Convolution kernels (#730) --- .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 12 ++++++-- .../cl_kernels/convolution_gpu_bfyx_f16.cl | 29 ++++++++++--------- .../convolution_gpu_bfyx_f16_1x1.cl | 18 ++---------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index b0284dbaf56d55..bcb6a1d45b5137 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -78,6 +78,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa auto autoTune = GetAutoTuneOptions(params, autoTuneIndex); kd.cldnnStyle.blockWidth = autoTune.blockWidth; + const auto& input = params.inputs[0]; const auto& out = params.output; auto x = out.X().v; auto y = out.Y().v; @@ -92,11 +93,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa kd.lws1 = sub_group_size; kd.lws2 = 1; + auto bBlockSizeX = x % autoTune.blockWidth == 0; + auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0; + auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0; + if (b == 1) { - if (x <= 8) + if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) { kd.efficiency = FORCE_PRIORITY_1; - else - kd.efficiency = FORCE_PRIORITY_2; + } else { + kd.efficiency = FORCE_PRIORITY_3; + } } else { kd.efficiency = FORCE_PRIORITY_7; } diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl index 0adfb299c4f779..6af3b271d8b38f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -161,9 +161,14 @@ KERNEL(convolution_bfyx_f16)( vec_t dst = INPUT0_VAL_ZERO; #endif // BIAS_TERM -#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD - for (uint g = group; g < group + groups_per_sub_group; g++) { +#if MULTIPLE_GROUPS_INPUT_PRELOAD + const uint in_split_offset = f_block * input_fs_pitch; + const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group); + const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group); + const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH; +#else #if GROUPED + for (uint g = group; g < group + groups_per_sub_group; g++) { const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE); const uint filter_split_offset = g * FILTER_GROUPS_PITCH; const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch; @@ -173,11 +178,6 @@ KERNEL(convolution_bfyx_f16)( const uint filter_offset = f_block * filter_os_pitch; #endif // GROUPED const uint grouped_filter_offset = filter_offset + filter_split_offset; -#else - const uint in_split_offset = f_block * input_fs_pitch; - const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group); - const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group); - const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH; 
#endif // MULTIPLE_GROUPS_INPUT_PRELOAD const uint grouped_input_offset = input_offset + in_split_offset; @@ -248,7 +248,11 @@ KERNEL(convolution_bfyx_f16)( vec_t src; __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE))) for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) { +#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1 + src[i] = line_cache[i]; +#else src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i]; +#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1 } #if MULTIPLE_GROUPS_INPUT_PRELOAD typedef MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_IFM_NUM) ifm_vec_t; @@ -345,9 +349,9 @@ KERNEL(convolution_bfyx_f16)( } } } -#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD +#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD } -#endif // MULTIPLE_GROUPS_INPUT_PRELOAD +#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD dst = ACTIVATION(dst, ACTIVATION_PARAMS); typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t; @@ -370,7 +374,7 @@ KERNEL(convolution_bfyx_f16)( else #endif // OUTPUT_LEFTOVERS { - if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) { + if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) { #if HAS_FUSED_OPS FUSED_OPS_VEC; res = FUSED_OPS_RESULT_VEC; @@ -390,8 +394,7 @@ KERNEL(convolution_bfyx_f16)( # error convolution_gpu_bfyx_f16.cl: Unsupported output x block size. 
#endif } else { - const int x_tail = OUTPUT_SIZE_X - x; - for (int i = 0; i < x_tail; i++) { + for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) { #if HAS_FUSED_OPS FUSED_OPS_SCALAR; res[i] = FUSED_OPS_RESULT_SCALAR; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl index 25a2b36197a912..155ed590e73113 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl @@ -208,21 +208,10 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( #endif { #if !PADDED_OUTPUT - if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) { -#if HAS_FUSED_OPS - FUSED_OPS_VEC; - dst = FUSED_OPS_RESULT_VEC; -#endif -#if X_BLOCK_SIZE == 8 - UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); -#elif X_BLOCK_SIZE == 4 - UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); -#elif X_BLOCK_SIZE == 2 - UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); -#endif - } else { + if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) { #else - if (x * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X) { + if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) { +#endif #if HAS_FUSED_OPS FUSED_OPS_VEC; dst = FUSED_OPS_RESULT_VEC; @@ -235,7 +224,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)( UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst); #endif } else { -#endif for (int i = 0; i < X_BLOCK_SIZE; i++) { if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) return; From 72d9a9fae7bddf758032599724f8220eb28d20d0 Mon Sep 17 00:00:00 2001 From: 
Vitaliy Urusovskij Date: Wed, 3 Jun 2020 13:54:38 +0300 Subject: [PATCH 02/12] Use pre-defined DB collection names in memcheck_upload.py CLI (#651) Use argparses `choices` for `--db_collection` option. Also removed unnecessary redefinition of `db_collection` in memcheck_upload.py --- tests/stress_tests/scripts/memcheck_upload.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/stress_tests/scripts/memcheck_upload.py b/tests/stress_tests/scripts/memcheck_upload.py index d23bc4d5277c79..0f6474b737036d 100644 --- a/tests/stress_tests/scripts/memcheck_upload.py +++ b/tests/stress_tests/scripts/memcheck_upload.py @@ -165,8 +165,6 @@ def query_timeline(records, db_url, db_collection, max_items=20, similarity=TIME def create_memcheck_report(records, db_url, db_collection, output_path): """ Create memcheck timeline HTML report for records. """ - if db_collection == 'pre_commit': - db_collection = 'commit' # pre-commit jobs building report from past commits records.sort( key=lambda item: f"{item['status']}{item['device']}{item['model']}{item['test_name']}") timelines = query_timeline(records, db_url, db_collection) @@ -203,7 +201,8 @@ def main(): parser.add_argument('--db_url', required=not is_dryrun, help='MongoDB URL in a for "mongodb://server:port".') parser.add_argument('--db_collection', required=not is_dryrun, - help=f'Collection name in {DATABASE} database to upload') + help=f'Collection name in {DATABASE} database to upload.', + choices=["commit", "nightly", "weekly"]) parser.add_argument('--artifact_root', required=True, help=f'A root directory to strip from log path before upload.') parser.add_argument('--append', help='JSON to append to each item.') From da230131d0958a12db6eddb585748bb7ed7da449 Mon Sep 17 00:00:00 2001 From: Evgenya Stepyreva Date: Wed, 3 Jun 2020 14:14:59 +0300 Subject: [PATCH 03/12] [ nGraph ] FP16 for evaluate (#722) --- ngraph/src/ngraph/op/acosh.cpp | 4 + ngraph/src/ngraph/op/add.cpp | 4 + 
ngraph/src/ngraph/op/and.cpp | 4 + ngraph/src/ngraph/op/asinh.cpp | 4 + ngraph/src/ngraph/op/atanh.cpp | 4 + ngraph/src/ngraph/op/convert.cpp | 4 + ngraph/src/ngraph/op/divide.cpp | 4 + ngraph/src/ngraph/op/equal.cpp | 4 + ngraph/src/ngraph/op/fused/matmul.cpp | 2 + ngraph/src/ngraph/op/fused/squeeze.cpp | 4 + ngraph/src/ngraph/op/fused/unsqueeze.cpp | 4 + ngraph/src/ngraph/op/gather.cpp | 4 + ngraph/src/ngraph/op/greater.cpp | 4 + ngraph/src/ngraph/op/greater_eq.cpp | 4 + ngraph/src/ngraph/op/less.cpp | 4 + ngraph/src/ngraph/op/less_eq.cpp | 4 + ngraph/src/ngraph/op/max.cpp | 4 + ngraph/src/ngraph/op/max_pool.cpp | 4 + ngraph/src/ngraph/op/maximum.cpp | 4 + ngraph/src/ngraph/op/min.cpp | 4 + ngraph/src/ngraph/op/minimum.cpp | 4 + ngraph/src/ngraph/op/multiply.cpp | 4 + ngraph/src/ngraph/op/non_zero.cpp | 2 + ngraph/src/ngraph/op/not_equal.cpp | 4 + ngraph/src/ngraph/op/or.cpp | 4 + ngraph/src/ngraph/op/power.cpp | 4 + ngraph/src/ngraph/op/product.cpp | 4 + ngraph/src/ngraph/op/reduce_mean.cpp | 4 + ngraph/src/ngraph/op/reduce_prod.cpp | 4 + ngraph/src/ngraph/op/reduce_sum.cpp | 4 + .../src/ngraph/op/scatter_elements_update.cpp | 2 + ngraph/src/ngraph/op/softmax.cpp | 3 +- ngraph/src/ngraph/op/strided_slice.cpp | 2 + ngraph/src/ngraph/op/subtract.cpp | 4 + ngraph/src/ngraph/op/sum.cpp | 4 + ngraph/src/ngraph/op/xor.cpp | 4 + ngraph/src/ngraph/type/float16.cpp | 32 ------ ngraph/src/ngraph/type/float16.hpp | 105 +++++++++++++++++- 38 files changed, 233 insertions(+), 39 deletions(-) diff --git a/ngraph/src/ngraph/op/acosh.cpp b/ngraph/src/ngraph/op/acosh.cpp index a6fea9542ef6c2..514bc2c2f0c06d 100644 --- a/ngraph/src/ngraph/op/acosh.cpp +++ b/ngraph/src/ngraph/op/acosh.cpp @@ -71,6 +71,10 @@ namespace break; TYPE_CASE(u64)(arg0, out); break; + TYPE_CASE(bf16)(arg0, out); + break; + TYPE_CASE(f16)(arg0, out); + break; TYPE_CASE(f32)(arg0, out); break; TYPE_CASE(f64)(arg0, out); diff --git a/ngraph/src/ngraph/op/add.cpp b/ngraph/src/ngraph/op/add.cpp index 
a39d4213352163..9e65778db867c1 100644 --- a/ngraph/src/ngraph/op/add.cpp +++ b/ngraph/src/ngraph/op/add.cpp @@ -108,6 +108,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/and.cpp b/ngraph/src/ngraph/op/and.cpp index e309cb80a43a3a..4e00373450a680 100644 --- a/ngraph/src/ngraph/op/and.cpp +++ b/ngraph/src/ngraph/op/and.cpp @@ -87,6 +87,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/asinh.cpp b/ngraph/src/ngraph/op/asinh.cpp index b9ae4c16f659ab..2efc0a341b2b0d 100644 --- a/ngraph/src/ngraph/op/asinh.cpp +++ b/ngraph/src/ngraph/op/asinh.cpp @@ -71,6 +71,10 @@ namespace break; TYPE_CASE(u64)(arg0, out); break; + TYPE_CASE(bf16)(arg0, out); + break; + TYPE_CASE(f16)(arg0, out); + break; TYPE_CASE(f32)(arg0, out); break; TYPE_CASE(f64)(arg0, out); diff --git a/ngraph/src/ngraph/op/atanh.cpp b/ngraph/src/ngraph/op/atanh.cpp index ed33af21f93a38..ee0ba9539a805b 100644 --- a/ngraph/src/ngraph/op/atanh.cpp +++ b/ngraph/src/ngraph/op/atanh.cpp @@ -71,6 +71,10 @@ namespace break; TYPE_CASE(u64)(arg0, out); break; + TYPE_CASE(bf16)(arg0, out); + break; + TYPE_CASE(f16)(arg0, out); + break; TYPE_CASE(f32)(arg0, out); break; TYPE_CASE(f64)(arg0, out); diff --git a/ngraph/src/ngraph/op/convert.cpp b/ngraph/src/ngraph/op/convert.cpp index 12cfaa7110ebdb..9695c26b307546 100644 --- a/ngraph/src/ngraph/op/convert.cpp +++ b/ngraph/src/ngraph/op/convert.cpp @@ -99,6 +99,8 @@ namespace break; 
TYPE_OUT_CASE(bf16)(arg, out); break; + TYPE_OUT_CASE(f16)(arg, out); + break; TYPE_OUT_CASE(f32)(arg, out); break; TYPE_OUT_CASE(f64)(arg, out); @@ -132,6 +134,8 @@ namespace break; TYPE_CASE(bf16)(arg, out); break; + TYPE_CASE(f16)(arg, out); + break; TYPE_CASE(f32)(arg, out); break; TYPE_CASE(f64)(arg, out); diff --git a/ngraph/src/ngraph/op/divide.cpp b/ngraph/src/ngraph/op/divide.cpp index 1b4e4bfda37eaa..de125abe76bfdb 100644 --- a/ngraph/src/ngraph/op/divide.cpp +++ b/ngraph/src/ngraph/op/divide.cpp @@ -125,6 +125,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec, pythondiv); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec, pythondiv); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec, pythondiv); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec, pythondiv); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec, pythondiv); diff --git a/ngraph/src/ngraph/op/equal.cpp b/ngraph/src/ngraph/op/equal.cpp index 972ac43a13f435..a4084210f7b94b 100644 --- a/ngraph/src/ngraph/op/equal.cpp +++ b/ngraph/src/ngraph/op/equal.cpp @@ -83,6 +83,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/fused/matmul.cpp b/ngraph/src/ngraph/op/fused/matmul.cpp index cfe6426ff7bc70..b9680e80c10672 100644 --- a/ngraph/src/ngraph/op/fused/matmul.cpp +++ b/ngraph/src/ngraph/op/fused/matmul.cpp @@ -226,6 +226,8 @@ namespace break; TYPE_CASE(bf16)(arg0, arg1, output, transpose_a, transpose_b); break; + TYPE_CASE(f16)(arg0, arg1, output, transpose_a, transpose_b); + break; TYPE_CASE(f32)(arg0, arg1, output, transpose_a, transpose_b); break; TYPE_CASE(f64)(arg0, arg1, output, transpose_a, transpose_b); diff --git a/ngraph/src/ngraph/op/fused/squeeze.cpp 
b/ngraph/src/ngraph/op/fused/squeeze.cpp index 6e7977c018dcc7..f1038ebefbbed1 100644 --- a/ngraph/src/ngraph/op/fused/squeeze.cpp +++ b/ngraph/src/ngraph/op/fused/squeeze.cpp @@ -201,6 +201,10 @@ namespace break; TYPE_CASE(u64)(arg0, out); break; + TYPE_CASE(bf16)(arg0, out); + break; + TYPE_CASE(f16)(arg0, out); + break; TYPE_CASE(f32)(arg0, out); break; TYPE_CASE(f64)(arg0, out); diff --git a/ngraph/src/ngraph/op/fused/unsqueeze.cpp b/ngraph/src/ngraph/op/fused/unsqueeze.cpp index 8f8cc569a4f134..ced8d8952e6ae8 100644 --- a/ngraph/src/ngraph/op/fused/unsqueeze.cpp +++ b/ngraph/src/ngraph/op/fused/unsqueeze.cpp @@ -161,6 +161,10 @@ namespace break; TYPE_CASE(u64)(arg0, out); break; + TYPE_CASE(bf16)(arg0, out); + break; + TYPE_CASE(f16)(arg0, out); + break; TYPE_CASE(f32)(arg0, out); break; TYPE_CASE(f64)(arg0, out); diff --git a/ngraph/src/ngraph/op/gather.cpp b/ngraph/src/ngraph/op/gather.cpp index 6ad7b1641e98a3..a40ff71533db2a 100644 --- a/ngraph/src/ngraph/op/gather.cpp +++ b/ngraph/src/ngraph/op/gather.cpp @@ -292,6 +292,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, axis); break; + TYPE_CASE(bf16)(arg0, arg1, out, axis); + break; + TYPE_CASE(f16)(arg0, arg1, out, axis); + break; TYPE_CASE(f32)(arg0, arg1, out, axis); break; TYPE_CASE(f64)(arg0, arg1, out, axis); diff --git a/ngraph/src/ngraph/op/greater.cpp b/ngraph/src/ngraph/op/greater.cpp index a8d483fb56da5c..cc5b70ceae7995 100644 --- a/ngraph/src/ngraph/op/greater.cpp +++ b/ngraph/src/ngraph/op/greater.cpp @@ -83,6 +83,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/greater_eq.cpp b/ngraph/src/ngraph/op/greater_eq.cpp index f9144afb3918e0..b211e72b331bb9 100644 --- a/ngraph/src/ngraph/op/greater_eq.cpp +++ 
b/ngraph/src/ngraph/op/greater_eq.cpp @@ -83,6 +83,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/less.cpp b/ngraph/src/ngraph/op/less.cpp index 217780a4d84fd4..23e15be240414a 100644 --- a/ngraph/src/ngraph/op/less.cpp +++ b/ngraph/src/ngraph/op/less.cpp @@ -83,6 +83,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/less_eq.cpp b/ngraph/src/ngraph/op/less_eq.cpp index 0107c33a380c01..47a66bc891a3e1 100644 --- a/ngraph/src/ngraph/op/less_eq.cpp +++ b/ngraph/src/ngraph/op/less_eq.cpp @@ -83,6 +83,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/max.cpp b/ngraph/src/ngraph/op/max.cpp index 747452dfe52745..73f7a138ba5e43 100644 --- a/ngraph/src/ngraph/op/max.cpp +++ b/ngraph/src/ngraph/op/max.cpp @@ -117,6 +117,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/max_pool.cpp b/ngraph/src/ngraph/op/max_pool.cpp index 3cca67b3863aa4..1d35f620b65d0d 100644 --- a/ngraph/src/ngraph/op/max_pool.cpp +++ 
b/ngraph/src/ngraph/op/max_pool.cpp @@ -562,6 +562,10 @@ namespace break; TYPE_CASE(u64)(arg, out, out_shape, kernel, strides, pad_begin, pad_end); break; + TYPE_CASE(bf16)(arg, out, out_shape, kernel, strides, pad_begin, pad_end); + break; + TYPE_CASE(f16)(arg, out, out_shape, kernel, strides, pad_begin, pad_end); + break; TYPE_CASE(f32)(arg, out, out_shape, kernel, strides, pad_begin, pad_end); break; TYPE_CASE(f64)(arg, out, out_shape, kernel, strides, pad_begin, pad_end); diff --git a/ngraph/src/ngraph/op/maximum.cpp b/ngraph/src/ngraph/op/maximum.cpp index be9607a0e858fe..83443a0496213e 100644 --- a/ngraph/src/ngraph/op/maximum.cpp +++ b/ngraph/src/ngraph/op/maximum.cpp @@ -106,6 +106,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/min.cpp b/ngraph/src/ngraph/op/min.cpp index 8ba8a35502fb0e..c3f41930be1496 100644 --- a/ngraph/src/ngraph/op/min.cpp +++ b/ngraph/src/ngraph/op/min.cpp @@ -117,6 +117,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/minimum.cpp b/ngraph/src/ngraph/op/minimum.cpp index 0e3be4474572e8..a3365e5c26a0c1 100644 --- a/ngraph/src/ngraph/op/minimum.cpp +++ b/ngraph/src/ngraph/op/minimum.cpp @@ -105,6 +105,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/multiply.cpp 
b/ngraph/src/ngraph/op/multiply.cpp index 92843b6ac48b9f..956e0c75f1e9fe 100644 --- a/ngraph/src/ngraph/op/multiply.cpp +++ b/ngraph/src/ngraph/op/multiply.cpp @@ -97,6 +97,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/non_zero.cpp b/ngraph/src/ngraph/op/non_zero.cpp index e3ae0ea3f64797..ab4c719cf823c7 100644 --- a/ngraph/src/ngraph/op/non_zero.cpp +++ b/ngraph/src/ngraph/op/non_zero.cpp @@ -156,6 +156,8 @@ namespace break; TYPE_CASE(bf16)(input, output); break; + TYPE_CASE(f16)(input, output); + break; TYPE_CASE(f32)(input, output); break; TYPE_CASE(f64)(input, output); diff --git a/ngraph/src/ngraph/op/not_equal.cpp b/ngraph/src/ngraph/op/not_equal.cpp index dd6f470eca4fb6..034690c1ca7ebd 100644 --- a/ngraph/src/ngraph/op/not_equal.cpp +++ b/ngraph/src/ngraph/op/not_equal.cpp @@ -83,6 +83,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/or.cpp b/ngraph/src/ngraph/op/or.cpp index 29f4a4beffb267..587e44fd25cc7e 100644 --- a/ngraph/src/ngraph/op/or.cpp +++ b/ngraph/src/ngraph/op/or.cpp @@ -81,6 +81,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/power.cpp b/ngraph/src/ngraph/op/power.cpp index 
b05ad1f6878e13..18d88d7e7d5ad5 100644 --- a/ngraph/src/ngraph/op/power.cpp +++ b/ngraph/src/ngraph/op/power.cpp @@ -102,6 +102,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/product.cpp b/ngraph/src/ngraph/op/product.cpp index 05397656ce1e46..9aa94898145913 100644 --- a/ngraph/src/ngraph/op/product.cpp +++ b/ngraph/src/ngraph/op/product.cpp @@ -80,6 +80,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/reduce_mean.cpp b/ngraph/src/ngraph/op/reduce_mean.cpp index 4ed0ba07c325a3..d527d25487817f 100644 --- a/ngraph/src/ngraph/op/reduce_mean.cpp +++ b/ngraph/src/ngraph/op/reduce_mean.cpp @@ -72,6 +72,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/reduce_prod.cpp b/ngraph/src/ngraph/op/reduce_prod.cpp index dfe249fde628a8..cd709378d442e7 100644 --- a/ngraph/src/ngraph/op/reduce_prod.cpp +++ b/ngraph/src/ngraph/op/reduce_prod.cpp @@ -76,6 +76,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/reduce_sum.cpp b/ngraph/src/ngraph/op/reduce_sum.cpp index 068a84edad1878..8032a3350d0744 100644 --- a/ngraph/src/ngraph/op/reduce_sum.cpp +++ b/ngraph/src/ngraph/op/reduce_sum.cpp @@ -87,6 
+87,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/scatter_elements_update.cpp b/ngraph/src/ngraph/op/scatter_elements_update.cpp index c7a2542d1657e8..93dc899381c2af 100644 --- a/ngraph/src/ngraph/op/scatter_elements_update.cpp +++ b/ngraph/src/ngraph/op/scatter_elements_update.cpp @@ -267,6 +267,8 @@ namespace break; TYPE_CASE(bf16)(arg0, arg1, arg2, arg3, out, normalized_axis); break; + TYPE_CASE(f16)(arg0, arg1, arg2, arg3, out, normalized_axis); + break; TYPE_CASE(f32)(arg0, arg1, arg2, arg3, out, normalized_axis); break; TYPE_CASE(f64)(arg0, arg1, arg2, arg3, out, normalized_axis); diff --git a/ngraph/src/ngraph/op/softmax.cpp b/ngraph/src/ngraph/op/softmax.cpp index a22a3a43e02803..6f4e2e5d4d0bf9 100644 --- a/ngraph/src/ngraph/op/softmax.cpp +++ b/ngraph/src/ngraph/op/softmax.cpp @@ -173,7 +173,8 @@ namespace bool evaluate_softmax(const HostTensorPtr& arg, const HostTensorPtr& out, const AxisSet& axes) { auto shape = out->get_shape(); - return try_evaluate_softmax(arg, out, shape, axes) || + return try_evaluate_softmax(arg, out, shape, axes) || + try_evaluate_softmax(arg, out, shape, axes) || try_evaluate_softmax(arg, out, shape, axes); } } diff --git a/ngraph/src/ngraph/op/strided_slice.cpp b/ngraph/src/ngraph/op/strided_slice.cpp index 28412a9a1e66d2..9a8acec0671a2d 100644 --- a/ngraph/src/ngraph/op/strided_slice.cpp +++ b/ngraph/src/ngraph/op/strided_slice.cpp @@ -291,6 +291,8 @@ namespace break; TYPE_CASE(bf16)(in, slice_plan, out); break; + TYPE_CASE(f16)(in, slice_plan, out); + break; TYPE_CASE(f32)(in, slice_plan, out); break; TYPE_CASE(f64)(in, slice_plan, out); diff --git a/ngraph/src/ngraph/op/subtract.cpp b/ngraph/src/ngraph/op/subtract.cpp index 50fb2624ba875f..7d638acc9fa6ab 100644 --- a/ngraph/src/ngraph/op/subtract.cpp +++ 
b/ngraph/src/ngraph/op/subtract.cpp @@ -103,6 +103,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/op/sum.cpp b/ngraph/src/ngraph/op/sum.cpp index 4b8a56dd9bb752..24643889f97846 100644 --- a/ngraph/src/ngraph/op/sum.cpp +++ b/ngraph/src/ngraph/op/sum.cpp @@ -91,6 +91,10 @@ namespace break; TYPE_CASE(u64)(arg, out, axes); break; + TYPE_CASE(bf16)(arg, out, axes); + break; + TYPE_CASE(f16)(arg, out, axes); + break; TYPE_CASE(f32)(arg, out, axes); break; TYPE_CASE(f64)(arg, out, axes); diff --git a/ngraph/src/ngraph/op/xor.cpp b/ngraph/src/ngraph/op/xor.cpp index 40a0019d8e0612..3c144500951017 100644 --- a/ngraph/src/ngraph/op/xor.cpp +++ b/ngraph/src/ngraph/op/xor.cpp @@ -87,6 +87,10 @@ namespace break; TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec); break; + TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec); + break; + TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec); + break; TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec); break; TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec); diff --git a/ngraph/src/ngraph/type/float16.cpp b/ngraph/src/ngraph/type/float16.cpp index acca521b36da6e..8d6f748c0b2c25 100644 --- a/ngraph/src/ngraph/type/float16.cpp +++ b/ngraph/src/ngraph/type/float16.cpp @@ -138,38 +138,6 @@ size_t float16::size() const return sizeof(m_value); } -bool float16::operator==(const float16& other) const -{ -#if defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfloat-equal" -#endif - return (static_cast(*this) == static_cast(other)); -#if defined(__GNUC__) -#pragma GCC diagnostic pop -#endif -} - -bool float16::operator<(const float16& other) const -{ - return (static_cast(*this) < static_cast(other)); -} - -bool float16::operator<=(const 
float16& other) const -{ - return (static_cast(*this) <= static_cast(other)); -} - -bool float16::operator>(const float16& other) const -{ - return (static_cast(*this) > static_cast(other)); -} - -bool float16::operator>=(const float16& other) const -{ - return (static_cast(*this) >= static_cast(other)); -} - float16::operator float() const { union { diff --git a/ngraph/src/ngraph/type/float16.hpp b/ngraph/src/ngraph/type/float16.hpp index 9028e30a4558f0..12d69574d73516 100644 --- a/ngraph/src/ngraph/type/float16.hpp +++ b/ngraph/src/ngraph/type/float16.hpp @@ -50,12 +50,20 @@ namespace ngraph std::string to_string() const; size_t size() const; - bool operator==(const float16& other) const; - bool operator!=(const float16& other) const { return !(*this == other); } - bool operator<(const float16& other) const; - bool operator<=(const float16& other) const; - bool operator>(const float16& other) const; - bool operator>=(const float16& other) const; + template bool operator==(const T& other) const; + template bool operator!=(const T& other) const { return !(*this == other); } + template bool operator<(const T& other) const; + template bool operator<=(const T& other) const; + template bool operator>(const T& other) const; + template bool operator>=(const T& other) const; + template float16 operator+(const T& other) const; + template float16 operator+=(const T& other); + template float16 operator-(const T& other) const; + template float16 operator-=(const T& other); + template float16 operator*(const T& other) const; + template float16 operator*=(const T& other); + template float16 operator/(const T& other) const; + template float16 operator/=(const T& other); operator float() const; static constexpr float16 from_bits(uint16_t bits) { return float16(bits, true); } @@ -86,6 +94,91 @@ namespace ngraph uint16_t m_value; }; + + template + bool float16::operator==(const T& other) const + { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wfloat-equal" +#endif + return (static_cast(*this) == static_cast(other)); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } + + template + bool float16::operator<(const T& other) const + { + return (static_cast(*this) < static_cast(other)); + } + + template + bool float16::operator<=(const T& other) const + { + return (static_cast(*this) <= static_cast(other)); + } + + template + bool float16::operator>(const T& other) const + { + return (static_cast(*this) > static_cast(other)); + } + + template + bool float16::operator>=(const T& other) const + { + return (static_cast(*this) >= static_cast(other)); + } + + template + float16 float16::operator+(const T& other) const + { + return {static_cast(*this) + static_cast(other)}; + } + + template + float16 float16::operator+=(const T& other) + { + return *this = *this + other; + } + + template + float16 float16::operator-(const T& other) const + { + return {static_cast(*this) - static_cast(other)}; + } + + template + float16 float16::operator-=(const T& other) + { + return *this = *this - other; + } + + template + float16 float16::operator*(const T& other) const + { + return {static_cast(*this) * static_cast(other)}; + } + + template + float16 float16::operator*=(const T& other) + { + return *this = *this * other; + } + + template + float16 float16::operator/(const T& other) const + { + return {static_cast(*this) / static_cast(other)}; + } + + template + float16 float16::operator/=(const T& other) + { + return *this = *this / other; + } } namespace std From 7edebd8d878c82ac01ee63f1ed0e5bbf91b922cd Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Wed, 3 Jun 2020 15:05:45 +0300 Subject: [PATCH 04/12] [LPT] [TEST] Sporadic test fail fix (workaround) (#742) --- .../output_layers_handling_in_transformations.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp 
b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp index 88c6ce69e42ae9..fddf40add41267 100644 --- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp @@ -75,6 +75,8 @@ InferenceEngine::Blob::Ptr OutputLayersHandlingInTransformations::GenerateInput( */ void OutputLayersHandlingInTransformations::SetUp() { + threshold = 0.05; + InferenceEngine::SizeVector inputShape1; InferenceEngine::Precision netPrecision; InferenceEngine::details::LayerTransformation::Params params; From 63a77bb4a18d4d95f8056a88027468360af091d4 Mon Sep 17 00:00:00 2001 From: LiweiSong <2005songliwei@163.com> Date: Wed, 3 Jun 2020 20:19:29 +0800 Subject: [PATCH 05/12] mkldnn_memory_solver.hpp: include stdint.h to avoid build error (#729) fix the following compile error: inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp:60:9: error: 'int64_t' does not name a type | 60 | int64_t size; | | ^~~~~~~ include stdint.h to fix this. 
Signed-off-by: Liwei Song --- inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp b/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp index 2c236a3072767f..aa52d29b310dd4 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp @@ -10,6 +10,8 @@ #include "ie_api.h" +#include <stdint.h> + #include #include From 53927034da74cf47f11a34481fa749a449f81f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Do=C5=82bniak?= Date: Wed, 3 Jun 2020 15:01:43 +0200 Subject: [PATCH 06/12] Python API for Assign, ReadValue and ExtractImagePatches (#719) --- ngraph/python/src/ngraph/__init__.py | 3 ++ ngraph/python/src/ngraph/ops.py | 50 +++++++++++++++++++++ ngraph/python/test/ngraph/test_create_op.py | 36 +++++++++++++++ ngraph/src/ngraph/op/read_value.cpp | 4 +- ngraph/src/ngraph/op/read_value.hpp | 6 +-- 5 files changed, 94 insertions(+), 5 deletions(-) diff --git a/ngraph/python/src/ngraph/__init__.py b/ngraph/python/src/ngraph/__init__.py index 6055ba92e59c59..d644fda3758b86 100644 --- a/ngraph/python/src/ngraph/__init__.py +++ b/ngraph/python/src/ngraph/__init__.py @@ -29,6 +29,7 @@ from ngraph.ops import acos from ngraph.ops import add from ngraph.ops import asin +from ngraph.ops import assign from ngraph.ops import atan from ngraph.ops import avg_pool from ngraph.ops import batch_norm_inference @@ -59,6 +60,7 @@ from ngraph.ops import embedding_bag_offsets_sum from ngraph.ops import embedding_bag_packed_sum from ngraph.ops import embedding_segments_sum +from ngraph.ops import extract_image_patches from ngraph.ops import equal from ngraph.ops import erf from ngraph.ops import exp @@ -108,6 +110,7 @@ from ngraph.ops import prior_box_clustered from ngraph.ops import psroi_pooling from ngraph.ops import proposal +from ngraph.ops import read_value from ngraph.ops import 
reduce_logical_and from ngraph.ops import reduce_logical_or from ngraph.ops import reduce_max diff --git a/ngraph/python/src/ngraph/ops.py b/ngraph/python/src/ngraph/ops.py index 51a299ef8f97c8..8d5fe41dfb4eae 100644 --- a/ngraph/python/src/ngraph/ops.py +++ b/ngraph/python/src/ngraph/ops.py @@ -3438,3 +3438,53 @@ def proposal( return _get_node_factory().create( "Proposal", [class_probs, box_logits, as_node(image_shape)], attrs ) + + +@nameable_op +def assign(new_value: NodeInput, variable_id: str, name: Optional[str] = None) -> Node: + """Return a node which produces the Assign operation. + + :param new_value: Node producing a value to be assigned to a variable. + :param variable_id: Id of a variable to be updated. + :param name: Optional name for output node. + :return: Assign node + """ + return _get_node_factory().create("Assign", [as_node(new_value)], {"variable_id": variable_id}) + + +@nameable_op +def read_value(init_value: NodeInput, variable_id: str, name: Optional[str] = None) -> Node: + """Return a node which produces the ReadValue operation. + + :param init_value: Node producing a value to be returned instead of an unassigned variable. + :param variable_id: Id of a variable to be read. + :param name: Optional name for output node. + :return: ReadValue node + """ + return _get_node_factory().create("ReadValue", [as_node(init_value)], {"variable_id": variable_id}) + + +@nameable_op +def extract_image_patches( + image: NodeInput, + sizes: TensorShape, + strides: List[int], + rates: TensorShape, + auto_pad: str, + name: Optional[str] = None, +) -> Node: + """Return a node which produces the ExtractImagePatches operation. + + :param image: 4-D Input data to extract image patches. + :param sizes: Patch size in the format of [size_rows, size_cols]. + :param strides: Patch movement stride in the format of [stride_rows, stride_cols] + :param rates: Element selection rate for creating a patch. + :param auto_pad: Padding type. 
+ :param name: Optional name for output node. + :return: ExtractImagePatches node + """ + return _get_node_factory().create( + "ExtractImagePatches", + [as_node(image)], + {"sizes": sizes, "strides": strides, "rates": rates, "auto_pad": auto_pad}, + ) diff --git a/ngraph/python/test/ngraph/test_create_op.py b/ngraph/python/test/ngraph/test_create_op.py index abb50adce9e26d..9b041d8fc8d61c 100644 --- a/ngraph/python/test/ngraph/test_create_op.py +++ b/ngraph/python/test/ngraph/test_create_op.py @@ -845,3 +845,39 @@ def test_proposal(int_dtype, fp_dtype): assert node.get_type_name() == "Proposal" assert node.get_output_size() == 1 assert list(node.get_output_shape(0)) == [batch_size * attributes["attrs.post_nms_topn"], 5] + + +def test_read_value(): + init_value = ng.parameter([2, 2], name="init_value", dtype=np.int32) + + node = ng.read_value(init_value, "var_id_667") + + assert node.get_type_name() == "ReadValue" + assert node.get_output_size() == 1 + assert list(node.get_output_shape(0)) == [2, 2] + assert node.get_output_element_type(0) == Type.i32 + + +def test_assign(): + input_data = ng.parameter([5, 7], name="input_data", dtype=np.int32) + rv = ng.read_value(input_data, "var_id_667") + node = ng.assign(rv, "var_id_667") + + assert node.get_type_name() == "Assign" + assert node.get_output_size() == 1 + assert list(node.get_output_shape(0)) == [5, 7] + assert node.get_output_element_type(0) == Type.i32 + + +def test_extract_image_patches(): + image = ng.parameter([64, 3, 10, 10], name="image", dtype=np.int32) + sizes = [3, 3]; + strides = [5, 5]; + rates = [1, 1]; + padding = "VALID"; + node = ng.extract_image_patches(image, sizes, strides, rates, padding) + + assert node.get_type_name() == "ExtractImagePatches" + assert node.get_output_size() == 1 + assert list(node.get_output_shape(0)) == [64, 27, 2, 2] + assert node.get_output_element_type(0) == Type.i32 diff --git a/ngraph/src/ngraph/op/read_value.cpp b/ngraph/src/ngraph/op/read_value.cpp index 
9f7abb6ed7dc75..f6581a6b6ed479 100644 --- a/ngraph/src/ngraph/op/read_value.cpp +++ b/ngraph/src/ngraph/op/read_value.cpp @@ -21,8 +21,8 @@ using namespace ngraph; constexpr NodeTypeInfo op::ReadValue::type_info; -op::ReadValue::ReadValue(const Output& new_value, const std::string& variable_id) - : Op({new_value}) +op::ReadValue::ReadValue(const Output& init_value, const std::string& variable_id) + : Op({init_value}) , m_variable_id(variable_id) { constructor_validate_and_infer_types(); diff --git a/ngraph/src/ngraph/op/read_value.hpp b/ngraph/src/ngraph/op/read_value.hpp index ea451f3ed45064..ca3f5325f0dcb0 100644 --- a/ngraph/src/ngraph/op/read_value.hpp +++ b/ngraph/src/ngraph/op/read_value.hpp @@ -36,9 +36,9 @@ namespace ngraph /// \brief Constructs a ReadValue operation. /// - /// \param new_value Node that produces the input tensor. - /// \param variable_id identificator of the variable to create. - ReadValue(const Output& new_value, const std::string& variable_id); + /// \param init_value Node that produces the input tensor. + /// \param variable_id identificator of the variable to create. 
+ ReadValue(const Output& init_value, const std::string& variable_id); void validate_and_infer_types() override; From cfb5f2789905e14c2e37348ad46834544b7a70b9 Mon Sep 17 00:00:00 2001 From: Ilya-Krylov Date: Wed, 3 Jun 2020 17:52:40 +0300 Subject: [PATCH 07/12] Add 'aligned' param to ExperimentalDetectronROIFeatureExtractor for CPU plugin and MO --- .../nodes/roifeatureextractor_onnx.cpp | 13 +++++++++---- .../front/onnx/roifeatureextractor_ext.py | 1 + .../extensions/ops/roifeatureextractor_onnx.py | 3 ++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp index 55ffb832f676b1..18d0fd73c30940 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp @@ -140,6 +140,7 @@ void ROIAlignForward_cpu_kernel( const int pooled_width, const int sampling_ratio, const T* bottom_rois, + const bool aligned, T* top_data) { int roi_cols = 4; @@ -156,11 +157,12 @@ void ROIAlignForward_cpu_kernel( offset_bottom_rois++; } + T offset = aligned ? 
(T)0.5 : (T)0.0; // Do not using rounding; this implementation detail is critical - T roi_start_w = offset_bottom_rois[0] * spatial_scale; - T roi_start_h = offset_bottom_rois[1] * spatial_scale; - T roi_end_w = offset_bottom_rois[2] * spatial_scale; - T roi_end_h = offset_bottom_rois[3] * spatial_scale; + T roi_start_w = offset_bottom_rois[0] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[1] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[3] * spatial_scale - offset; // Force malformed ROIs to be 1x1 T roi_width = (std::max)(roi_end_w - roi_start_w, (T)1.); @@ -321,6 +323,7 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase { output_dim_ = layer->GetParamAsInt("output_size"); pyramid_scales_ = layer->GetParamAsInts("pyramid_scales"); sampling_ratio_ = layer->GetParamAsInt("sampling_ratio"); + aligned_ = layer->GetParamAsBool("aligned"); pooled_height_ = output_dim_; pooled_width_ = output_dim_; @@ -374,6 +377,7 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase { pooled_width_, sampling_ratio_, &reordered_rois[4 * level_rois_offset], + aligned_, &output_rois_features_temp[feaxels_per_roi * level_rois_offset]); } } @@ -394,6 +398,7 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase { int pooled_width_ = 0; std::vector pyramid_scales_; int sampling_ratio_ = 0; + bool aligned_ = false; }; REG_FACTORY_FOR(ExperimentalDetectronROIFeatureExtractorImpl, ExperimentalDetectronROIFeatureExtractor); diff --git a/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py b/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py index 0ae58ca77e6642..9c070aa8a58a90 100644 --- a/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py +++ b/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py @@ -31,6 +31,7 @@ def extract(cls, node): sampling_ratio=onnx_attr(node, 'sampling_ratio', 
'i', 2), distribute_rois_between_levels=onnx_attr(node, 'distribute_rois_between_levels', 'i', 1), preserve_rois_order=onnx_attr(node, 'preserve_rois_order', 'i', 1), + aligned=onnx_attr(node, 'aligned', 'i', 0), num_classes=onnx_attr(node, 'num_classes', 'i', 81), post_nms_count=onnx_attr(node, 'post_nms_count', 'i', 2000), score_threshold=onnx_attr(node, 'score_threshold', 'f', 0.05), diff --git a/model-optimizer/extensions/ops/roifeatureextractor_onnx.py b/model-optimizer/extensions/ops/roifeatureextractor_onnx.py index 87d6a06d7dfe49..26c2e63b4978c3 100644 --- a/model-optimizer/extensions/ops/roifeatureextractor_onnx.py +++ b/model-optimizer/extensions/ops/roifeatureextractor_onnx.py @@ -41,7 +41,8 @@ def backend_attrs(self): 'image_id', 'output_size', 'sampling_ratio', - 'preserve_rois_order'] + 'preserve_rois_order', + 'aligned'] @staticmethod def infer(node): From e2d1ae7055229603528f2c2be797aa3de057250e Mon Sep 17 00:00:00 2001 From: Lukasz Debski Date: Wed, 3 Jun 2020 18:42:50 +0200 Subject: [PATCH 08/12] [IE CLDNN] Fixed stack overflow in calculate_prior_boxes pass (#747) The problem behind this error was in program_impl::init_graph() where in calculate_prior_boxes we are trying to calculate output layout of an entire network recursively which causes stack overflow. Calculating output layouts beforehand in processing order fixes this issue. 
--- inference-engine/thirdparty/clDNN/src/program.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index 14dedbe7c06c16..d5a2f3b531ce06 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -375,6 +375,11 @@ void program_impl::build_program(bool is_internal) { void program_impl::init_graph() { apply_opt_pass(); + for (auto& node : processing_order) { + if (!node->is_type() && !node->is_type()) + node->get_output_layout(); + } + apply_opt_pass(); apply_opt_pass(); From 023344a317ac5c0b61d5d8430f429935563cc996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20=C5=BByczy=C5=84ski?= Date: Wed, 3 Jun 2020 18:44:27 +0200 Subject: [PATCH 09/12] [IE CLDNN] Added fusing support to all pooling kernels (#689) adds fusing support to all available pooling kernels tests all possible input type/output type configurations fixes minor bug in max pooling in pooling_gpu_test.cpp fixed minor bug with yxbf format in pooling_gpu_ref and pooling_gpu_int8_ref kernels fixes bug with b_fs_yx_fsv32 format in pooling_gpu kernel resolves bug with max pooling accuracy mismatch in case of non zero pad end layer parameter resolves average pooling accuracy mismatch in case of non zero pad end layer parameter --- .../src/cldnn_engine/cldnn_program.cpp | 2 + .../single_layer_tests/pooling.cpp | 28 +- .../thirdparty/clDNN/api/pooling.hpp | 2 + .../pooling/pooling_kernel_base.cpp | 45 +- .../pooling_kernel_gpu_average_opt.cpp | 20 +- .../pooling/pooling_kernel_gpu_average_opt.h | 4 +- .../pooling_kernel_gpu_b_fs_yx_fsv16.cpp | 30 + .../pooling_kernel_gpu_b_fs_yx_fsv16.h | 7 +- .../pooling_kernel_gpu_b_fs_yx_fsv4.cpp | 26 +- .../pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h | 2 +- .../pooling_kernel_gpu_bfyx_block_opt.cpp | 26 +- .../pooling_kernel_gpu_bfyx_block_opt.h | 9 +- .../pooling_kernel_gpu_bsv16_fsv16.cpp | 30 +- 
.../pooling/pooling_kernel_gpu_bsv16_fsv16.h | 8 +- .../pooling/pooling_kernel_gpu_byxf_af32.cpp | 24 +- .../pooling/pooling_kernel_gpu_byxf_af32.h | 2 +- .../pooling/pooling_kernel_gpu_byxf_opt.cpp | 26 +- .../pooling/pooling_kernel_gpu_byxf_opt.h | 9 +- .../pooling_kernel_gpu_byxf_padding_opt.cpp | 18 +- .../pooling_kernel_gpu_byxf_padding_opt.h | 9 +- .../pooling_kernel_gpu_fs_b_yx_fsv32.cpp | 23 +- .../pooling_kernel_gpu_fs_b_yx_fsv32.h | 7 +- ...pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp | 36 +- .../pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h | 10 +- ..._kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp | 24 +- ...ng_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h | 9 +- .../pooling/pooling_kernel_gpu_int8_ref.cpp | 12 +- .../pooling/pooling_kernel_gpu_int8_ref.h | 5 +- .../pooling/pooling_kernel_gpu_ref.cpp | 24 +- .../pooling/pooling_kernel_gpu_ref.h | 12 +- .../pooling/pooling_kernel_selector.cpp | 2 +- .../cl_kernels/pooling_gpu_average_opt.cl | 11 +- .../cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl | 89 +-- .../cl_kernels/pooling_gpu_bfyx_block_opt.cl | 70 ++- .../core/cl_kernels/pooling_gpu_blocked.cl | 154 +++-- .../cl_kernels/pooling_gpu_bsv16_fsv16.cl | 110 +++- .../core/cl_kernels/pooling_gpu_byxf_af32.cl | 86 +-- .../core/cl_kernels/pooling_gpu_byxf_opt.cl | 72 ++- .../pooling_gpu_byxf_padding_opt.cl | 72 ++- .../cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl | 87 +-- .../pooling_gpu_fs_bs_yx_bsv4_fsv32.cl | 150 +++-- .../pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl | 75 ++- .../core/cl_kernels/pooling_gpu_int8_ref.cl | 41 +- .../core/cl_kernels/pooling_gpu_ref.cl | 245 +++++--- .../thirdparty/clDNN/src/gpu/pooling_gpu.cpp | 12 +- .../thirdparty/clDNN/src/gpu/quantize_gpu.cpp | 20 + .../prepare_primitive_fusing.cpp | 39 +- .../tests/test_cases/fusings_gpu_test.cpp | 561 ++++++++++++++---- .../tests/test_cases/pooling_gpu_test.cpp | 106 +++- .../clDNN/tests/test_utils/float16.h | 174 +++--- 50 files changed, 1833 insertions(+), 832 deletions(-) diff --git 
a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp index 3090a371122604..86cdddb81ef433 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp @@ -2735,6 +2735,8 @@ void Program::CreatePoolingPrimitive(cldnn::topology& topology, InferenceEngine: input_offset, CldnnTensorFromIEDims(poolLayer->outData[0]->getTensorDesc().getDims()), dt); + cldnn::tensor pad_end = { 0, 0, -TensorValue(poolLayer->_pads_end[X_AXIS]), -TensorValue(poolLayer->_pads_end[Y_AXIS]), 0 }; + poolPrim.pad_end = pad_end; topology.add(poolPrim); primitiveIDs[poolLayerName] = poolLayerName; } diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp index 8ddd708318001b..5cab9e39aadf7f 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp @@ -12,7 +12,6 @@ using namespace ngraph::helpers; using namespace LayerTestsDefinitions; namespace { - const std::vector netPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16 @@ -28,6 +27,7 @@ const std::vector> padEnds = {{0, 0}, {0, 2}}; const std::vector roundingTypes = {ngraph::op::RoundingType::CEIL, ngraph::op::RoundingType::FLOOR}; + ////* ========== Max Polling ========== */ /* +========== Explicit Pad Floor Rounding ========== */ const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine( @@ -35,8 +35,7 @@ const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine( ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::ValuesIn(padBegins), - // TODO: Accuracy mismatch with non zero Pad Ends (tested with {0.2}) - ::testing::Values(std::vector({0, 
0})), + ::testing::ValuesIn(padEnds), ::testing::Values(ngraph::op::RoundingType::FLOOR), ::testing::Values(ngraph::op::PadType::EXPLICIT), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling @@ -57,8 +56,7 @@ const auto maxPool_ExplicitPad_CeilRounding_Params = ::testing::Combine( // TODO: Non 1 strides fails in ngraph reference implementation with error "The end corner is out of bounds at axis 3" thrown in the test body. ::testing::Values(std::vector({1, 1})), ::testing::ValuesIn(padBegins), - // TODO: Accuracy mismatch with non zero Pad Ends (tested with {0.2}) - ::testing::Values(std::vector({0, 0})), + ::testing::ValuesIn(padEnds), ::testing::Values(ngraph::op::RoundingType::CEIL), ::testing::Values(ngraph::op::PadType::EXPLICIT), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling @@ -80,9 +78,8 @@ const auto avgPoolExplicitPadCeilRoundingParams = ::testing::Combine( ::testing::ValuesIn(kernels), // TODO: Non 1 strides fails in ngraph reference implementation with error "The end corner is out of bounds at axis 3" thrown in the test body. 
::testing::Values(std::vector({1, 1})), - // TODO: Non zero pads excluded because of accuracy mismatch - ::testing::Values(std::vector({0, 0})), - ::testing::Values(std::vector({0, 0})), + ::testing::ValuesIn(padBegins), + ::testing::ValuesIn(padEnds), ::testing::Values(ngraph::op::RoundingType::CEIL), ::testing::Values(ngraph::op::PadType::EXPLICIT), ::testing::Values(true, false) @@ -101,9 +98,8 @@ const auto avgPoolExplicitPadFloorRoundingParams = ::testing::Combine( ::testing::Values(PoolingTypes::AVG), ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), - // TODO: Non zero pads excluded because of accuracy mismatch - ::testing::Values(std::vector({0, 0})), - ::testing::Values(std::vector({0, 0})), + ::testing::ValuesIn(padBegins), + ::testing::ValuesIn(padEnds), ::testing::Values(ngraph::op::RoundingType::FLOOR), ::testing::Values(ngraph::op::PadType::EXPLICIT), ::testing::Values(true, false) @@ -125,9 +121,9 @@ const auto allPools_ValidPad_Params = ::testing::Combine( ::testing::ValuesIn(kernels), ::testing::ValuesIn(strides), ::testing::Values(std::vector({0, 0})), - ::testing::Values(std::vector({0, 0})), - ::testing::Values( - ngraph::op::RoundingType::FLOOR), // placeholder value - Rounding Type not applicable for Valid pad type + ::testing::ValuesIn(padEnds), + ::testing::Values(ngraph::op::RoundingType::FLOOR), // placeholder value - Rounding Type not applicable for Valid pad type + // TODO: PadType::VALID seems not to ignore padBegins ::testing::Values(ngraph::op::PadType::VALID), ::testing::Values(false) // placeholder value - exclude pad not applicable for max pooling ); @@ -139,6 +135,4 @@ INSTANTIATE_TEST_CASE_P(MAX_and_AVGPool_ValidPad, PoolingLayerTest, ::testing::Values(std::vector({1, 3, 50, 50})), ::testing::Values(CommonTestUtils::DEVICE_GPU)), PoolingLayerTest::getTestCaseName); - - -} // namespace \ No newline at end of file +} // namespace diff --git a/inference-engine/thirdparty/clDNN/api/pooling.hpp 
b/inference-engine/thirdparty/clDNN/api/pooling.hpp index 4a92e601a1f41a..4dcccfbaafe349 100644 --- a/inference-engine/thirdparty/clDNN/api/pooling.hpp +++ b/inference-engine/thirdparty/clDNN/api/pooling.hpp @@ -188,6 +188,8 @@ struct pooling : public primitive_base { bool with_output_size; /// @brief User-defined output data size of the primitive (w/o padding). tensor output_size; + /// @brief Defines a shift, relative to the end of padding shape. + tensor pad_end; protected: std::vector> get_dependencies() const override { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp index d7e8081ff062ba..f9486d77a2da78 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -33,24 +33,27 @@ bool PoolingKernelBase::Validate(const Params& p, const optional_params& o) cons } Datatype PoolingKernelBase::GetAccumulatorType(const pooling_params& params) const { - if (params.quantization != QuantizationType::NONE) - return Datatype::INT32; + const auto& input_dt = params.inputs[0].GetDType(); + const auto& pool_type = params.poolType; - Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32}; - - for (Datatype type : types) - for (auto& in : params.inputs) - if (in.GetDType() == type) - return type; - - return Datatype::F32; + if (pool_type == PoolType::MAX) { + return input_dt; + } else { + switch (input_dt) { + case Datatype::F32: return Datatype::F32; + case Datatype::F16: return Datatype::F32; + case Datatype::INT8: return Datatype::INT32; + case Datatype::UINT8: return Datatype::INT32; + default: return Datatype::F32; + } + } } Datatype PoolingKernelBase::GetActivationType(const pooling_params& params) const { - if (params.quantization != QuantizationType::NONE) + if (params.output.GetDType() == Datatype::F16) + return Datatype::F16; + else return Datatype::F32; - - return GetUnitType(params); } @@ -78,12 +81,17 @@ JitConstants PoolingKernelBase::GetJitConstants(const pooling_params& pp, Poolin // Checks if we need boundary checking in kernel. 
bool PoolingKernelBase::NeedsBoundaryCheck(const pooling_params& pp) const { + const auto& input = pp.inputs[0]; + const auto& output = pp.output; + if (pp.poolPad.x != 0 || pp.poolPad.y != 0 || pp.poolPad.z != 0) { return true; + } else if ((((input.X().v - pp.poolSize.x) / pp.poolStride.x) + 1) < output.X().v || + (((input.Y().v - pp.poolSize.y) / pp.poolStride.y) + 1) < output.Y().v || + (((input.Z().v - pp.poolSize.z) / pp.poolStride.z) + 1) < output.Z().v) { + return true; } - const auto& input = pp.inputs[0]; - if (input.X().v < pp.poolSize.x || input.Y().v < pp.poolSize.y || input.Z().v < pp.poolSize.z) { return true; } @@ -99,7 +107,7 @@ bool PoolingKernelBase::NeedsBoundaryCheck(const pooling_params& pp) const { return mod_x || mod_y || mod_z; } -bool PoolingKernelBase::EnableRound(const kernel_selector::pooling_params ¶ms) const { +bool PoolingKernelBase::EnableRound(const kernel_selector::pooling_params& params) const { bool has_fused_quantize_to_int8 = false; for (auto& op : params.fused_ops) { if (op.GetType() == FusedOpType::QUANTIZE && @@ -108,7 +116,8 @@ bool PoolingKernelBase::EnableRound(const kernel_selector::pooling_params ¶m } } - if (!has_fused_quantize_to_int8 && (params.output.GetDType() == Datatype::INT8 || params.output.GetDType() == Datatype::UINT8) && + if (!has_fused_quantize_to_int8 && + (params.output.GetDType() == Datatype::INT8 || params.output.GetDType() == Datatype::UINT8) && params.poolType == PoolType::AVG) { return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp index ef90e978005c4d..5e20ef6349a8cf 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,12 +20,16 @@ ParamsKey PoolingKernelGPUAverageOpt::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); k.EnablePoolType(PoolType::AVG); k.EnablePoolRemainder(PoolRemainder::FLOOR); k.EnablePoolRemainder(PoolRemainder::CEIL); k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED); + k.EnableDifferentTypes(); return k; } @@ -80,19 +84,19 @@ PoolingKernelBase::DispatchData PoolingKernelGPUAverageOpt::SetDefault(const poo JitConstants PoolingKernelGPUAverageOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const { auto tileDims = GetTileDimentions(); - auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd); + auto jit = PoolingKernelBase::GetJitConstants(params, kd); if (tileDims.y != 0 && tileDims.x != 0) { - mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0)); - mem_consts.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y)); - mem_consts.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x)); - mem_consts.AddConstant(MakeJitConstant("ONE_OVER_POOL_SIZE", 1.f / (params.poolSize.x * params.poolSize.y))); + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0)); + jit.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y)); + jit.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x)); + jit.AddConstant(MakeJitConstant("ONE_OVER_POOL_SIZE", 1.f / (params.poolSize.x * params.poolSize.y))); } - return 
mem_consts; + return jit; } KernelsData PoolingKernelGPUAverageOpt::GetKernelsData(const Params& params, const optional_params& options) const { return GetCommonKernelsData(params, options, FORCE_PRIORITY_7); } -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h index 39b8ec97efd0f8..828434705fa1ce 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,4 +30,4 @@ class PoolingKernelGPUAverageOpt : public PoolingKernelBase { JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp index 31a2ac39c40cdd..aeb43373bd7fca 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp @@ -22,6 +22,8 @@ ParamsKey PoolingKernel_b_fs_yx_fsv16::GetSupportedKey() const { k.EnableOutputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); k.EnableInputLayout(DataLayout::b_fs_yx_fsv16); k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16); k.EnableTensorOffset(); @@ -38,6 +40,7 @@ ParamsKey PoolingKernel_b_fs_yx_fsv16::GetSupportedKey() const { k.EnableDifferentTypes(); k.EnableSubGroup(); k.EnableSubGroupShort(); + k.EnableDifferentTypes(); return k; } @@ -88,9 +91,36 @@ JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params& jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size)); jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", alignment)); jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, x_block_size))); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + if (params.output.Feature().v % 16 != 0) { 
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1)); } + + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf_vec = {"_VEC", + {"b", "(f_block*16)", "y", "x"}, + "pool_result", + input_dt, + x_block_size, + LoadType::LT_ALIGNED_READ, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::X}; + FusedOpsConfiguration conf_scalar = {"_SCALAR", + {"b", "(f_block*16)", "y", "(x+i)"}, + "pool_result[i]", + input_dt, + 1, + LoadType::LT_ALIGNED_READ, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::X}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar})); + } + return jit; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h index 90b7fb2c2ee412..4877d4f9d102f4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2018-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,6 +30,11 @@ class PoolingKernel_b_fs_yx_fsv16 : public PoolingKernelBase { bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } size_t GetBlockSize(const pooling_params& params) const; }; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp index 621dd566fc2b23..606023653ebd30 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -66,19 +66,21 @@ JitConstants PoolingKerneGPU_b_fs_yx_fsv4::GetJitConstants(const pooling_params& const size_t in_y_pitch = 4 * params.inputs[0].X().LogicalDimPadded(); jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); if (!params.fused_ops.empty()) { - auto input_dt = EnableRound(params) ? 
Datatype::INT32 : GetActivationType(params); - FusedOpsConfiguration conf = { "", - {"b", "f", "y", "x"}, - "pool_result", - input_dt, - 4, - LoadType::LT_UNALIGNED, - BoundaryCheck::ENABLED, - IndexType::TENSOR_COORD, - Tensor::DataChannelName::FEATURE }; - jit.Merge(MakeFusedOpsJitConstants(params, { conf })); + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b", "f", "y", "x"}, + "pool_result", + input_dt, + 4, + LoadType::LT_UNALIGNED, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::FEATURE}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); } return jit; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h index 6caf7c1c30c817..fd12d6526fa84c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp index 4acba9efe4aec9..4088e22b30ebae 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ ParamsKey PoolingKernelGPUBfyxBlockOpt::GetSupportedKey() const { k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableOutputLayout(DataLayout::bfyx); k.EnableTensorOffset(); @@ -48,12 +50,28 @@ PoolingKernelBase::DispatchData PoolingKernelGPUBfyxBlockOpt::SetDefault(const p } JitConstants PoolingKernelGPUBfyxBlockOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const { - auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd); + auto jit = PoolingKernelBase::GetJitConstants(params, kd); - mem_consts.AddConstant( + jit.AddConstant( MakeJitConstant("BLOCK_SIZE_Y", params.poolSize.y + params.poolSize.y * params.poolStride.y - 1)); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); - return mem_consts; + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b", "f", "y + i", "x"}, + "pool_result", + input_dt, + 
1, + LoadType::LT_UNALIGNED, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::Y}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + return jit; } bool PoolingKernelGPUBfyxBlockOpt::Validate(const Params& p, const optional_params& o) const { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h index 77b94e7e2aa3c5..4b77a845df793a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,5 +30,10 @@ class PoolingKernelGPUBfyxBlockOpt : public PoolingKernelBase { bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp index a4714ab8d5ef75..93ae17541e286f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp @@ -1,4 +1,3 @@ -// // Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,9 +26,11 @@ static const size_t batch_block_size = 16; ParamsKey PoolingKernel_bsv16_fsv16::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F32); k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16); k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16); @@ -44,6 +45,7 @@ ParamsKey PoolingKernel_bsv16_fsv16::GetSupportedKey() const { k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED); k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC); 
k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING); + k.EnableDifferentTypes(); return k; } @@ -105,6 +107,30 @@ JitConstants PoolingKernel_bsv16_fsv16::GetJitConstants(const pooling_params& pa jit.AddConstant(MakeJitConstant("MB_BLOCK", batch_block_size)); jit.AddConstant(MakeJitConstant("IC_BLOCK", feature_block_size)); jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size)); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + + std::vector idx_order; + if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) { + idx_order = {"(b + BLOCK_NUM * 8)", "oc", "y", "x"}; + } else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) { + idx_order = {"(b + BLOCK_NUM * 8)", "oc", "z", "y", "x"}; + } + + FusedOpsConfiguration conf = {"", + idx_order, + "pool_result", + input_dt, + 8, + LoadType::LT_ALIGNED_READ, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::BATCH}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } return jit; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h index 57cdbd1004c4a9..fc2ebc258bd5da 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h @@ -1,5 +1,4 @@ -// -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -35,5 +34,10 @@ class PoolingKernel_bsv16_fsv16 : public PoolingKernelBase { bool Validate(const Params& p, const optional_params& o) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp index 6818394affe9b7..70d1d655c3d051 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -62,18 +62,20 @@ JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& pa JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd); jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)")); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); if (!params.fused_ops.empty()) { - auto input_dt = EnableRound(params) ? 
Datatype::INT32 : GetActivationType(params); - FusedOpsConfiguration conf = { "", - {"b", "f", "y", "x"}, - "pool_result", - input_dt, - 4, - LoadType::LT_UNALIGNED, - BoundaryCheck::ENABLED, - IndexType::TENSOR_COORD, - Tensor::DataChannelName::FEATURE }; + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b", "f", "y", "x"}, + "fused_pool_result", + input_dt, + 4, + LoadType::LT_UNALIGNED, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::FEATURE}; jit.Merge(MakeFusedOpsJitConstants(params, { conf })); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h index c2bbb9f0feffd6..1ffc94bc545ebf 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp index bca6e0367e757a..b5d9e4759db0b4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,10 +22,13 @@ ParamsKey PoolingKernelGPUByxfOpt::GetSupportedKey() const { k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::byxf); k.EnableOutputLayout(DataLayout::byxf); k.EnableTensorOffset(); k.EnableTensorPitches(); + k.EnableDifferentTypes(); k.EnableBatching(); k.EnablePoolType(PoolType::MAX); k.EnablePoolType(PoolType::AVG); @@ -46,9 +49,24 @@ PoolingKernelBase::DispatchData PoolingKernelGPUByxfOpt::SetDefault(const poolin } JitConstants PoolingKernelGPUByxfOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const { - auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd); + auto jit = PoolingKernelBase::GetJitConstants(params, kd); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); - return mem_consts; + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b", "f + i", "y", "x"}, + "pool_result", + input_dt, + 1, + LoadType::LT_UNALIGNED, + 
BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::FEATURE}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + return jit; } bool PoolingKernelGPUByxfOpt::Validate(const Params& p, const optional_params& o) const { @@ -71,4 +89,4 @@ bool PoolingKernelGPUByxfOpt::Validate(const Params& p, const optional_params& o KernelsData PoolingKernelGPUByxfOpt::GetKernelsData(const Params& params, const optional_params& options) const { return GetCommonKernelsData(params, options, FORCE_PRIORITY_7); } -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h index 0678f7f059037f..5c6547706b89b1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,5 +30,10 @@ class PoolingKernelGPUByxfOpt : public PoolingKernelBase { bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp index 02f0f90f91089f..655f1648d0b425 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -22,6 +22,8 @@ ParamsKey PoolingKernelGPUByxfPaddingOpt::GetSupportedKey() const { k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::byxf); k.EnableOutputLayout(DataLayout::byxf); k.EnableTensorOffset(); @@ -32,6 +34,7 @@ ParamsKey PoolingKernelGPUByxfPaddingOpt::GetSupportedKey() const { k.EnablePoolRemainder(PoolRemainder::FLOOR); k.EnablePoolRemainder(PoolRemainder::CEIL); k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED); + k.EnableDifferentTypes(); return k; } @@ -46,9 +49,16 @@ PoolingKernelBase::DispatchData PoolingKernelGPUByxfPaddingOpt::SetDefault(const } JitConstants PoolingKernelGPUByxfPaddingOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const { - auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd); + auto jit = PoolingKernelBase::GetJitConstants(params, kd); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); - return mem_consts; + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", {"b", "f + i", "y", "x"}, "pool_result", input_dt, 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + return jit; } bool PoolingKernelGPUByxfPaddingOpt::Validate(const Params& p, const optional_params& o) const { @@ -66,4 +76,4 @@ bool PoolingKernelGPUByxfPaddingOpt::Validate(const Params& p, const optional_pa KernelsData PoolingKernelGPUByxfPaddingOpt::GetKernelsData(const Params& params, const optional_params& options) const { return GetCommonKernelsData(params, options, FORCE_PRIORITY_8); } -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h index 64d0a3af6acdea..f7566aac68a03d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,5 +30,10 @@ class PoolingKernelGPUByxfPaddingOpt : public PoolingKernelBase { bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; DispatchData SetDefault(const pooling_params& params) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp index 71e96934dc66df..b963162bc18f49 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 
(the "License"); // you may not use this file except in compliance with the License. @@ -19,7 +19,11 @@ namespace kernel_selector { ParamsKey PoolingKerneGPU_fs_b_yx_fsv32::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::fs_b_yx_fsv32); k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32); k.EnableTensorOffset(); @@ -34,6 +38,7 @@ ParamsKey PoolingKerneGPU_fs_b_yx_fsv32::GetSupportedKey() const { k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING); k.EnableSubGroup(); k.EnableSubGroupShort(); + k.EnableDifferentTypes(); return k; } @@ -75,6 +80,22 @@ JitConstants PoolingKerneGPU_fs_b_yx_fsv32::GetJitConstants(const pooling_params if (pp.poolSize.x >= 7 && pp.poolSize.y >= 7 && pp.poolType == PoolType::AVG) { jit.AddConstant(MakeJitConstant("USE_FLOAT_ACC", true)); } + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b", "fs", "out_y", "out_x"}, + "pool_result", + input_dt, + 2, + LoadType::LT_ALIGNED_READ, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::FEATURE}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } return jit; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h index 5db49e66e20587..5bb61fa3309994 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,5 +30,10 @@ class PoolingKerneGPU_fs_b_yx_fsv32 : public PoolingKernelBase { protected: bool Validate(const Params& p, const optional_params& o) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp index 0fb5fbdc79c774..9f5a25204402d6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -20,6 +20,9 @@ ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::INT8); k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); k.EnableTensorOffset(); @@ -68,12 +71,41 @@ JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetJitConstants(const pooling_ jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b + bi", "f", "y", "x"}, + "char_result", + input_dt, + 4, + LoadType::LT_UNALIGNED, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::FEATURE}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } return jit; } +bool PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::Validate(const Params& params, const optional_params& options) const { + if (!PoolingKernelBase::Validate(params, options)) { + return false; + } + + auto p = dynamic_cast(params); + + if (p.quantization != QuantizationType::NONE && p.poolType == PoolType::AVG) { + return false; + } + + return true; +} KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, const optional_params& options) const { return GetCommonKernelsData(params, options, FORCE_PRIORITY_2); } -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h index 6ac996cc60e9f8..307b426a5635a1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -29,5 +29,11 @@ class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32 : public PoolingKernelBase { protected: JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; + bool Validate(const Params&, const optional_params&) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp index 16c566544bc915..34f97ab9cc963f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, 
Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,13 +20,15 @@ ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::INT8); k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); k.EnablePoolType(PoolType::MAX); - // k.EnablePoolType(PoolType::AVG); k.EnablePoolRemainder(PoolRemainder::FLOOR); k.EnablePoolRemainder(PoolRemainder::CEIL); k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED); @@ -77,6 +79,22 @@ JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetJitConstants(const p jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); jit.AddConstant(MakeJitConstant("BATCH_SG_COUNT", get_batch_sub_groups_count(params))); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + FusedOpsConfiguration conf = {"", + {"b", "f", "y", "x"}, + "pool_result", + input_dt, + 4, + LoadType::LT_UNALIGNED, + BoundaryCheck::ENABLED, + IndexType::TENSOR_COORD, + Tensor::DataChannelName::FEATURE}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } return jit; } @@ -85,4 +103,4 @@ KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetKernelsData(const Par const optional_params& options) const { return GetCommonKernelsData(params, options, FORCE_PRIORITY_1); } -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h index bb3fbf0dde4e13..3e2de8f1a3b57a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -29,5 +29,10 @@ class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32 : public PoolingKernelBase { protected: JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; + std::vector GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp index 71524b83512453..beedfe9e995b26 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -63,9 +63,12 @@ KernelsData PoolingKernelGPUInt8Ref::GetKernelsData(const Params& params, const JitConstants PoolingKernelGPUInt8Ref::GetJitConstants(const pooling_params& params, DispatchData kd) const { JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); if (!params.fused_ops.empty()) { - auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params); + auto input_dt = GetActivationType(params); + std::vector<std::string> idx_order; if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) { idx_order = {"b", "f", "y", "x"}; @@ -73,7 +76,7 @@ JitConstants PoolingKernelGPUInt8Ref::GetJitConstants(const pooling_params& para idx_order = {"b", "f", "z", "y", "x"}; } - FusedOpsConfiguration conf = {"", idx_order, "pool_res", input_dt, 1 }; + FusedOpsConfiguration conf = {"", idx_order, "pool_result", input_dt, 1 }; jit.Merge(MakeFusedOpsJitConstants(params, {conf})); } @@ -88,7 +91,8 @@ bool PoolingKernelGPUInt8Ref::Validate(const Params& params, const optional_para if (p.inputs[0].GetDType() == Datatype::INT8 || p.inputs[0].GetDType() == Datatype::UINT8) { // Max pooling doesn't change quantization ranges, so output data type should be the same as input - if ((p.poolType == PoolType::MAX || p.poolType == PoolType::MAX_WITH_ARGMAX) && p.output.GetDType() != p.inputs[0].GetDType()) + if ((p.poolType == PoolType::MAX || p.poolType == PoolType::MAX_WITH_ARGMAX) + && (p.output.GetDType() != p.inputs[0].GetDType()) && p.quantization == QuantizationType::NONE) return false; // Average pooling should produce FP by default. (u)int8 is possible when quantize op is fused.
// if (p.poolType == PoolType::AVG && diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h index efef3e15433b68..6def2a4b290a4c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -29,8 +29,7 @@ class PoolingKernelGPUInt8Ref : public PoolingKernelBase { bool Validate(const Params&, const optional_params&) const override; JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; std::vector<FusedOpType> GetSupportedFusedOps() const override { - return { FusedOpType::ELTWISE, - FusedOpType::QUANTIZE, + return { FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp index 71e64f242c7d15..1f4bb273ec5fb6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@ ParamsKey PoolingKernelGPURef::GetSupportedKey() const { k.EnableInputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); k.EnableInputLayout(DataLayout::bfyx); k.EnableInputLayout(DataLayout::yxfb); k.EnableInputLayout(DataLayout::byxf); @@ -53,6 +55,26 @@ ParamsKey PoolingKernelGPURef::GetSupportedKey() const { return k; } +JitConstants PoolingKernelGPURef::GetJitConstants(const pooling_params& params, DispatchData kd) const { + auto jit = PoolingKernelBase::GetJitConstants(params, kd); + jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION")); + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + + if (!params.fused_ops.empty()) { + auto input_dt = GetActivationType(params); + std::vector<std::string> idx_order; + if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) { + idx_order = {"b", "f", "y", "x"}; + } else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) { + idx_order = {"b", "f", "z", "y", "x"}; + } + FusedOpsConfiguration conf = {"", idx_order, "pool_result", input_dt, 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + return jit; +} + KernelsData PoolingKernelGPURef::GetKernelsData(const Params& params, const optional_params& options) const { return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h index ff693f6be5cbfd..e42bcc8c77a1e9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel
Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -25,5 +25,13 @@ class PoolingKernelGPURef : public PoolingKernelBase { KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; ParamsKey GetSupportedKey() const override; + std::vector<FusedOpType> GetSupportedFusedOps() const override { + return { FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION }; + } + +protected: + JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override; }; -} // namespace kernel_selector \ No newline at end of file +} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp index 6b9fd4b27d667c..3177325901d5a0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp @@ -32,7 +32,7 @@ namespace kernel_selector { pooling_kernel_selector::pooling_kernel_selector() { Attach(); - // Attach(); TODO: fix the kernel as it reads out of bounds now + //Attach(); TODO: fix the kernel as it reads out of bounds now Attach(); Attach(); Attach(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl index 57205585849bb0..a10c90a68c587b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl @@ -17,7 +17,10 @@
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1))) -KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* output) +KERNEL(pooling_gpu_average_opt)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +) { int local_id = get_local_id(0); int tile_x = get_global_id(0); @@ -39,7 +42,7 @@ KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* out // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // In the diagram above X represents the current work item. - const __global float* base_addr = input + offset + (start_y * INPUT0_SIZE_X + start_x) - 1; + const __global INPUT0_TYPE* base_addr = input + offset + (start_y * INPUT0_SIZE_X + start_x) - 1; float input_buffer[3]; input_buffer[0] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr - INPUT0_SIZE_X))); @@ -92,10 +95,12 @@ KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* out res = (sum + sum_1 + sum_2) * ONE_OVER_POOL_SIZE; } #endif + OUTPUT_TYPE final_result; if ((local_id < TILE_WIDTH) && (offset_x < INPUT0_SIZE_X)) { - output[offset + y * INPUT0_SIZE_X + offset_x] = ACTIVATION(res, ACTIVATION_PARAMS); + final_result = TO_OUTPUT_TYPE(ACTIVATION(res, ACTIVATION_PARAMS)); + output[offset + y * INPUT0_SIZE_X + offset_x] = final_result; } first = (first + 1) % 3; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl index 23bca7c504ae74..08c4bf32ae0bf5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the 
"License"); // you may not use this file except in compliance with the License. @@ -19,24 +19,26 @@ #define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple)) #define AS_TYPE(type, val) CAT(as_, type)(val) -#define IN_VEC4 MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) -#define OUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) -#define CONVERT_OUT CAT(convert_, OUTPUT_TYPE) -#define CONVERT_OUT_VEC4 CAT(convert_, OUT_VEC4) +#define INPUT_VEC4 MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) -#if MAX_POOLING - #define INIT_VAL CHAR_MIN -#elif AVG_POOLING - #define INIT_VAL 0 +#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4) +#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4) + +#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) +#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4) + +#if defined MAX_POOLING + #define INIT_VAL ACCUMULATOR_VAL_MIN +#elif defined AVG_POOLING + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif - -inline int FUNC(apply_pooling)(int tmp, int in) +inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif @@ -59,7 +61,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)( const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - int result[4] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL }; + ACCUMULATOR_TYPE result[4] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL }; #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || @@ -88,11 +90,11 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)( const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH; int int_data = *((const __global int*)(input + input_idx)); - IN_VEC4 ch4_data = AS_TYPE(IN_VEC4, int_data); - result[0] = FUNC_CALL(apply_pooling)(result[0], (int)ch4_data[0]); - result[1] = 
FUNC_CALL(apply_pooling)(result[1], (int)ch4_data[1]); - result[2] = FUNC_CALL(apply_pooling)(result[2], (int)ch4_data[2]); - result[3] = FUNC_CALL(apply_pooling)(result[3], (int)ch4_data[3]); + INPUT_VEC4 ch4_data = AS_TYPE(INPUT_VEC4, int_data); + result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(ch4_data[0])); + result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(ch4_data[1])); + result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(ch4_data[2])); + result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(ch4_data[3])); #ifdef DYNAMIC_KERNEL_DIVIDER num_elements++; @@ -114,11 +116,11 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)( for(uint i = 0; i < POOL_SIZE_X; i++) { int int_data = *((const __global int*)(input + input_idx)); - IN_VEC4 ch4_data = AS_TYPE(IN_VEC4, int_data); - result[0] = FUNC_CALL(apply_pooling)(result[0], (int)ch4_data[0]); - result[1] = FUNC_CALL(apply_pooling)(result[1], (int)ch4_data[1]); - result[2] = FUNC_CALL(apply_pooling)(result[2], (int)ch4_data[2]); - result[3] = FUNC_CALL(apply_pooling)(result[3], (int)ch4_data[3]); + INPUT_VEC4 ch4_data = AS_TYPE(INPUT_VEC4, int_data); + result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(ch4_data[0])); + result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(ch4_data[1])); + result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(ch4_data[2])); + result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(ch4_data[3])); input_idx += IN_X_PITCH; } @@ -132,47 +134,48 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)( #if defined AVG_POOLING #if ENABLE_ROUND - int4 pool_result; + int4 not_fused_result; for(uint i = 0; i < 4; i++) { #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1)))); + not_fused_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1)))); #else - result[i] =
convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); + not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); #endif } #else - float4 pool_result; + float4 not_fused_result; for(uint i = 0; i < 4; i++) { #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - pool_result[i] = (float)result[i] / max(num_elements, (uint)1); + not_fused_result[i] = (float)result[i] / max(num_elements, (uint)1); #else - pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X); + not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X); #endif } #endif // ENABLE_ROUND #else // AVG_POOLING - int4 pool_result; + int4 not_fused_result; for (uint i = 0; i < 4; ++i) { - pool_result[i] = result[i]; + not_fused_result[i] = result[i]; } #endif // AVG_POOLING + ACTIVATION_VEC4 pool_result = TO_ACTIVATION_VEC4(not_fused_result); + #if HAS_FUSED_OPS FUSED_OPS; - OUT_VEC4 final_result = FUSED_OPS_RESULT; + OUTPUT_VEC4 final_result = FUSED_OPS_RESULT; #else - OUT_VEC4 final_result = CONVERT_OUT_VEC4(pool_result); -#endif - + OUTPUT_VEC4 final_result = TO_OUTPUT_VEC4(pool_result); for(uint op = 0; op < 4; op++) { final_result[op] = ACTIVATION(final_result[op], ACTIVATION_PARAMS); } +#endif #if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32 const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x); #if OUTPUT_FEATURE_NUM % 4 == 0 - *((__global OUT_VEC4*)(output + output_pos)) = final_result; + *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result; #else for (uint i = 0; i < 4; ++i) { if (f + i < OUTPUT_FEATURE_NUM) { @@ -191,8 +194,12 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)( #undef ALIGN_TO #undef AS_TYPE -#undef IN_VEC4 -#undef OUT_VEC4 -#undef CONVERT_OUT -#undef CONVERT_OUT_VEC4 + #undef INIT_VAL +#undef INPUT_VEC4 + +#undef ACTIVATION_VEC4 +#undef TO_ACTIVATION_VEC4 + +#undef OUTPUT_VEC4 +#undef TO_OUTPUT_VEC4 diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl index a72c5bbacd89cc..960d4933ddf03e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2017 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,57 +16,58 @@ #include "include/include_all.cl" #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING - #define UNIT_INIT_VAL UNIT_VAL_MIN -#elif AVG_POOLING - #define UNIT_INIT_VAL UNIT_VAL_ZERO + #define INIT_VAL ACCUMULATOR_VAL_MIN +#elif defined AVG_POOLING + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif - -inline UNIT_TYPE FUNC(apply_pooling)(UNIT_TYPE tmp, UNIT_TYPE in) +inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif } -KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output +KERNEL(pooling_gpu)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output #if MAX_WITH_ARGMAX_POOLING -, __global float* arg_max + , __global float* arg_max +#endif +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS #endif ) { - const uint x = (uint)get_global_id(0); const uint y = (uint)get_global_id(1) * POOL_SIZE_Y; const uint bf = (uint)get_global_id(2); const uint f = bf % INPUT0_FEATURE_NUM; const uint b = bf / INPUT0_FEATURE_NUM; - + if ((x >= OUTPUT_SIZE_X) || (y >= OUTPUT_SIZE_Y)) return; const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y 
= (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - - UNIT_TYPE result = UNIT_INIT_VAL; - + uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x); - UNIT_TYPE max_x[BLOCK_SIZE_Y]; - UNIT_TYPE out[POOL_SIZE_Y]; + ACCUMULATOR_TYPE max_x[BLOCK_SIZE_Y]; + ACCUMULATOR_TYPE result[POOL_SIZE_Y]; #if MAX_WITH_ARGMAX_POOLING uint arg_max_x[BLOCK_SIZE_Y] = { 0 }; - uint arg_max_out[POOL_SIZE_Y] = { 0 }; + uint arg_max_result[POOL_SIZE_Y] = { 0 }; uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b)); #endif for(uint i = 0; i < BLOCK_SIZE_Y; i++) { - max_x[i] = UNIT_INIT_VAL; + max_x[i] = INIT_VAL; } // we do max in "x" dimension @@ -79,7 +80,7 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output if(input[input_idx] > max_x[j]) arg_max_x[j] = input_idx_bfyx_no_padding; #endif - max_x[j] = FUNC_CALL(apply_pooling)(max_x[j], input[input_idx]); + max_x[j] = FUNC_CALL(apply_pooling)(max_x[j], TO_ACCUMULATOR_TYPE(input[input_idx])); input_idx += INPUT0_X_PITCH; #if MAX_WITH_ARGMAX_POOLING @@ -96,10 +97,10 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output for(uint i = 0; i < POOL_SIZE_Y; i++) { - out[i] = max_x[i * STRIDE_SIZE_Y]; + result[i] = max_x[i * STRIDE_SIZE_Y]; #if MAX_WITH_ARGMAX_POOLING - arg_max_out[i] = arg_max_x[i * STRIDE_SIZE_Y]; + arg_max_result[i] = arg_max_x[i * STRIDE_SIZE_Y]; #endif } @@ -110,11 +111,11 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output { #if MAX_WITH_ARGMAX_POOLING - if(max_x[j + i * STRIDE_SIZE_Y] > out[i]) - arg_max_out[i] = arg_max_x[j + i * STRIDE_SIZE_Y]; + if(max_x[j + i * STRIDE_SIZE_Y] > result[i]) + arg_max_result[i] = arg_max_x[j + i * STRIDE_SIZE_Y]; #endif - out[i] = FUNC_CALL(apply_pooling)(out[i], max_x[j + i * STRIDE_SIZE_Y]); + result[i] = FUNC_CALL(apply_pooling)(result[i], max_x[j + i * STRIDE_SIZE_Y]); } } @@ -124,22 +125,31 @@ KERNEL(pooling_gpu)(const __global 
UNIT_TYPE* input, __global UNIT_TYPE* output uint arg_max_pos = GET_DATA_INDEX(INPUT1, b, f, y, x); #endif + OUTPUT_TYPE final_result; + ACTIVATION_TYPE pool_result; + for(uint i = 0; i < POOL_SIZE_Y; i++) { if((y + i) < OUTPUT_SIZE_Y) { #if defined AVG_POOLING - out[i] /= (UNIT_TYPE)(POOL_SIZE_Y * POOL_SIZE_X); + result[i] /= TO_ACCUMULATOR_TYPE(POOL_SIZE_Y * POOL_SIZE_X); #endif - output[output_pos] = ACTIVATION(out[i], ACTIVATION_PARAMS); + pool_result = TO_ACTIVATION_TYPE(result[i]); + #if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + #endif + output[output_pos] = final_result; output_pos += OUTPUT_Y_PITCH; - #if MAX_WITH_ARGMAX_POOLING - arg_max[arg_max_pos] = arg_max_out[i]; + arg_max[arg_max_pos] = arg_max_result[i]; arg_max_pos += INPUT1_Y_PITCH; #endif } } } -#undef UNIT_INIT_VAL \ No newline at end of file +#undef INIT_VAL diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl index a9918ac970ef10..c20dbc1775a20b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -14,25 +14,39 @@ #include "include/include_all.cl" -#include "include/unit_type.cl" +#include "include/data_types.cl" #define FEATURE_SLICE_SIZE 16 #if X_BLOCK_SIZE > 1 -#define vec_t MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE) + #define INPUT_VAR_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, X_BLOCK_SIZE) + #define OUTPUT_VAR_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, X_BLOCK_SIZE) + #define ACCUMULATOR_VAR_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, X_BLOCK_SIZE) + #define ACTIVATION_VAR_TYPE MAKE_VECTOR_TYPE(ACTIVATION_TYPE, X_BLOCK_SIZE) #else -#define vec_t UNIT_TYPE + #define INPUT_VAR_TYPE INPUT0_TYPE + #define OUTPUT_VAR_TYPE OUTPUT_TYPE + #define ACCUMULATOR_VAR_TYPE ACCUMULATOR_TYPE + #define ACTIVATION_VAR_TYPE ACTIVATION_TYPE #endif +#define TO_OUTPUT_VAR_TYPE(x) CAT(convert_, OUTPUT_VAR_TYPE)(x) +#define TO_ACCUMULATOR_VAR_TYPE CAT(convert_, ACCUMULATOR_VAR_TYPE) +#define TO_ACTIVATION_VAR_TYPE CAT(convert_, ACTIVATION_VAR_TYPE) + #if defined MAX_POOLING - #define UNIT_INIT_VAL UNIT_VAL_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif defined AVG_POOLING - #define UNIT_INIT_VAL UNIT_VAL_ZERO -#else -#error + #define INIT_VAL ACCUMULATOR_VAL_ZERO #endif __attribute__((intel_reqd_sub_group_size(16))) -KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) +KERNEL(pooling_gpu_blocked)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { const int lid = get_sub_group_local_id(); const int f_block = get_group_id(1); @@ -74,10 +88,10 @@ KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE* (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch; - vec_t dst = (vec_t)UNIT_INIT_VAL; + ACCUMULATOR_VAR_TYPE dst = (ACCUMULATOR_VAR_TYPE)INIT_VAL; #if AVG_POOLING && (defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)) - UNIT_TYPE count; + ACCUMULATOR_TYPE count; if (lid < X_BLOCK_SIZE) { #if 
defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) @@ -91,10 +105,10 @@ KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE* int x_max = min(input_x + lid*STRIDE_SIZE_X + POOL_SIZE_X, INPUT0_SIZE_X); int y_max = min(input_y + POOL_SIZE_Y, INPUT0_SIZE_Y); #endif - count = (UNIT_TYPE)(1.f / (float)((y_max - y_min) * (x_max - x_min))); + count = TO_ACCUMULATOR_TYPE(1.f / (float)((y_max - y_min) * (x_max - x_min))); } - vec_t scale; + ACCUMULATOR_VAR_TYPE scale; #if X_BLOCK_SIZE > 1 for (int i = 0; i < X_BLOCK_SIZE; i++) scale[i] = intel_sub_group_shuffle(count, i); @@ -108,80 +122,138 @@ KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE* if (input_y + kh < 0 || input_y + kh >= INPUT0_SIZE_Y) continue; - UNIT_TYPE line_cache[INPUT_LINE_SIZE]; + INPUT0_TYPE line_cache[INPUT_LINE_SIZE]; for (int i = 0; i < INPUT_LINE_SIZE; i++) { if ((input_x + i) >= 0 && (input_x + i) < INPUT0_SIZE_X) - line_cache[i] = UNIT_BLOCK_READ(input, input_offset + kh*input_y_pitch + i*input_x_pitch); + line_cache[i] = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + i*input_x_pitch); else - line_cache[i] = UNIT_INIT_VAL; + #if defined MAX_POOLING + line_cache[i] = INPUT0_VAL_MIN; + #elif defined AVG_POOLING + line_cache[i] = INPUT0_VAL_ZERO; + #endif } __attribute__((opencl_unroll_hint(POOL_SIZE_X))) for (int kw = 0; kw < POOL_SIZE_X; kw++) { - vec_t src; + ACCUMULATOR_VAR_TYPE src; #if X_BLOCK_SIZE > 1 for (int i = 0; i < X_BLOCK_SIZE; i++) { - src[i] = line_cache[kw + STRIDE_SIZE_X*i]; + src[i] = TO_ACCUMULATOR_TYPE(line_cache[kw + STRIDE_SIZE_X*i]); } #else - src = line_cache[kw]; + src = TO_ACCUMULATOR_VAR_TYPE(line_cache[kw]); #endif #if defined MAX_POOLING - dst = max(dst, src); + dst = ACCUMULATOR_MAX_FUNC(dst, src); #elif defined AVG_POOLING dst += src; #endif } } + ACTIVATION_VAR_TYPE pool_result; + #if defined MAX_POOLING - dst = ACTIVATION(dst, ACTIVATION_PARAMS); + pool_result = TO_ACTIVATION_VAR_TYPE(dst); + #if 
!HAS_FUSED_OP + pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS); + #endif #elif defined AVG_POOLING && (defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)) - dst = ACTIVATION((dst*scale), ACTIVATION_PARAMS); + pool_result = TO_ACTIVATION_VAR_TYPE(dst*scale); + #if !HAS_FUSED_OP + pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS); + #endif #elif defined AVG_POOLING - dst = ACTIVATION((dst/(POOL_SIZE_X*POOL_SIZE_Y)), ACTIVATION_PARAMS); + pool_result = TO_ACTIVATION_VAR_TYPE(dst/(POOL_SIZE_X*POOL_SIZE_Y)); + #if !HAS_FUSED_OP + pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS); + #endif #endif + OUTPUT_VAR_TYPE final_result; + #if OUTPUT_LEFTOVERS if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) { for (int i = 0; i < X_BLOCK_SIZE; i++) { - if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) + if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) { #if X_BLOCK_SIZE > 1 - output[output_offset + i * output_x_pitch + lid] = dst[i]; + #if HAS_FUSED_OP + FUSED_OPS_SCALAR; + final_result[i] = FUSED_OPS_RESULT_SCALAR; + #else + final_result[i] = TO_OUTPUT_TYPE(pool_result[i]); + #endif + output[output_offset + i * output_x_pitch + lid] = final_result[i]; #else - output[output_offset + i * output_x_pitch + lid] = dst; + #if HAS_FUSED_OPS + FUSED_OPS_VEC; + final_result = FUSED_OPS_RESULT_VEC; + #else + final_result = TO_OUTPUT_VAR_TYPE(pool_result); + #endif + output[output_offset + i * output_x_pitch + lid] = final_result; + #endif + } } } else #endif // OUTPUT_LEFTOVERS if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X) { -#if X_BLOCK_SIZE == 8 - UNIT_BLOCK_WRITE8(output, output_offset, dst); -#elif X_BLOCK_SIZE == 4 - UNIT_BLOCK_WRITE4(output, output_offset, dst); -#elif X_BLOCK_SIZE == 2 - UNIT_BLOCK_WRITE2(output, output_offset, dst); -#elif X_BLOCK_SIZE == 1 - UNIT_BLOCK_WRITE(output, output_offset, dst); -#endif + #if HAS_FUSED_OPS + FUSED_OPS_VEC; + 
final_result = FUSED_OPS_RESULT_VEC; + #else + final_result = TO_OUTPUT_VAR_TYPE(pool_result); + #endif + + #if X_BLOCK_SIZE == 8 + DT_OUTPUT_BLOCK_WRITE8(output, output_offset, final_result); + #elif X_BLOCK_SIZE == 4 + DT_OUTPUT_BLOCK_WRITE4(output, output_offset, final_result); + #elif X_BLOCK_SIZE == 2 + DT_OUTPUT_BLOCK_WRITE2(output, output_offset, final_result); + #elif X_BLOCK_SIZE == 1 + DT_OUTPUT_BLOCK_WRITE(output, output_offset, final_result); + #endif } else { const int x_tail = OUTPUT_SIZE_X - x; - for (int i = 0; i < x_tail; i++) + for (int i = 0; i < x_tail; i++){ #if X_BLOCK_SIZE > 1 - UNIT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, dst[i]); + #if HAS_FUSED_OPS + FUSED_OPS_SCALAR; + final_result[i] = FUSED_OPS_RESULT_SCALAR; + #else + final_result[i] = TO_OUTPUT_TYPE(pool_result[i]); + #endif + DT_OUTPUT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, final_result[i]); #else - UNIT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, dst); + #if HAS_FUSED_OPS + FUSED_OPS_VEC; + final_result = FUSED_OPS_RESULT_VEC; + #else + final_result = TO_OUTPUT_VAR_TYPE(pool_result); + #endif + DT_OUTPUT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, final_result); #endif + } } - - } -#undef UNIT_INIT_VAL +#undef INIT_VAL #undef FEATURE_SLICE_SIZE + +#undef INPUT_VAR_TYPE +#undef OUTPUT_VAR_TYPE +#undef TO_OUTPUT_VAR_TYPE + +#undef ACCUMULATOR_VAR_TYPE + +#undef ACTIVATION_VAR_TYPE +#undef TO_ACTIVATION_VAR_TYPE diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl index e5bac34d0bec2d..c0fb62e8e050e0 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2019 Intel Corporation +* Copyright (c) 2020 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ -#include "include/unit_type.cl" #include "include/include_all.cl" #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) @@ -28,18 +27,29 @@ #define HAS_PAD_Y (PADDING_SIZE_Y != 0) #define HAS_PAD_X (PADDING_SIZE_X != 0) +#define INPUT_VEC8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) + +#define ACCUMULATOR_VEC8 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 8) +#define TO_ACCUMULATOR_VEC8 CAT(convert_, ACCUMULATOR_VEC8) + +#define ACTIVATION_VEC8 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 8) +#define TO_ACTIVATION_VEC8 CAT(convert_, ACTIVATION_VEC8) + +#define OUTPUT_VEC8 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) +#define TO_OUTPUT_VEC8 CAT(convert_, OUTPUT_VEC8) + +#define unroll_for __attribute__((opencl_unroll_hint)) for + #if MAX_POOLING -#define INIT_VAL INPUT0_VAL_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif AVG_POOLING -#define INIT_VAL 0 + #define INIT_VAL ACCUMULATOR_VAL_ZERO #endif -#define unroll_for __attribute__((opencl_unroll_hint)) for - -inline UNIT_TYPE8 FUNC(apply_pooling)(UNIT_TYPE8 tmp, UNIT_TYPE8 in) +inline ACCUMULATOR_VEC8 FUNC(apply_pooling)(ACCUMULATOR_VEC8 tmp, ACCUMULATOR_VEC8 in) { #if MAX_POOLING - return INPUT0_MAX_FUNC(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif @@ -49,7 +59,13 @@ __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1))) #if SUB_GROUP_SIZE != 1 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) #endif -KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* 
input, __global UNIT_TYPE* output) +KERNEL(pooling_gpu_bsv16_fsv16)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { const int oc = get_group_id(0) * OC_BLOCK; const int sp = get_group_id(1); @@ -71,7 +87,7 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T int in_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; int pool_elementes = 0; - __global UNIT_TYPE *dst_write0 = output + __global OUTPUT_TYPE *dst_write0 = output + b * OUTPUT_FEATURE_NUM * (OUTPUT_SIZE_Z * OUTPUT_SIZE_Y * OUTPUT_SIZE_X) + oc * (OUTPUT_SIZE_Z * OUTPUT_SIZE_Y * OUTPUT_SIZE_X) * OC_BLOCK + z * OUTPUT_SIZE_Y * OUTPUT_SIZE_X * OC_BLOCK * MB_BLOCK @@ -84,8 +100,8 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T + in_y * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK + in_z * INPUT0_SIZE_Y_WITH_PADDING * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK; - UNIT_TYPE8 blockC00 = (UNIT_TYPE8)(INIT_VAL); - UNIT_TYPE8 blockC01 = (UNIT_TYPE8)(INIT_VAL); + ACCUMULATOR_VEC8 blockC00 = (ACCUMULATOR_VEC8)(INIT_VAL); + ACCUMULATOR_VEC8 blockC01 = (ACCUMULATOR_VEC8)(INIT_VAL); #if ((HAS_PAD_Z && POOL_SIZE_Z == 1) || (HAS_PAD_Y && POOL_SIZE_Y == 1) || (HAS_PAD_X && POOL_SIZE_X == 1)) if (!(in_z < 0 || in_z >= INPUT0_SIZE_Z_WITH_PADDING || in_y < 0 || in_y >= INPUT0_SIZE_Y_WITH_PADDING || in_x < 0 || in_x >= INPUT0_SIZE_X_WITH_PADDING)) { @@ -105,26 +121,25 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T #endif continue; } - const uint idx = p_z * INPUT0_SIZE_Y_WITH_PADDING * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK + p_y * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK + p_x * IC_BLOCK * MB_BLOCK; - const __global UNIT_TYPE *src1 = input + idx; + const __global INPUT0_TYPE *src1 = input + idx; #else - const __global UNIT_TYPE *src1 = input; + const __global INPUT0_TYPE *src1 = input; #endif + INPUT_VEC8 blockA; - UNIT_TYPE8 
blockA; + blockA = DT_INPUT_BLOCK_READ8(src1, 0); - blockA = UNIT_BLOCK_READ8(src1, 0); + blockC00 = FUNC_CALL(apply_pooling)(blockC00, TO_ACCUMULATOR_VEC8(blockA)); - blockC00 = FUNC_CALL(apply_pooling)(blockC00, blockA); + blockA = DT_INPUT_BLOCK_READ8(src1, 8 * IC_BLOCK); - blockA = UNIT_BLOCK_READ8(src1, 8 * IC_BLOCK); - - blockC01 = FUNC_CALL(apply_pooling)(blockC01, blockA); + blockC01 = FUNC_CALL(apply_pooling)(blockC01, TO_ACCUMULATOR_VEC8(blockA)); pool_elementes++; + #if POOL_SIZE_Y != 1 || POOL_SIZE_X != 1 || POOL_SIZE_Z != 1 } #endif @@ -135,20 +150,43 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T #if defined AVG_POOLING #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - blockC00 /= max(pool_elementes, (int)1); - blockC01 /= max(pool_elementes, (int)1); + blockC00 /= (ACCUMULATOR_TYPE)max(pool_elementes, (int)1); + blockC01 /= (ACCUMULATOR_TYPE)max(pool_elementes, (int)1); #else - blockC00 /= (POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X); - blockC01 /= (POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X); + blockC00 /= (ACCUMULATOR_TYPE)POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X; + blockC01 /= (ACCUMULATOR_TYPE)POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X; #endif #endif - - blockC00 = ACTIVATION(blockC00, ACTIVATION_PARAMS); - blockC01 = ACTIVATION(blockC01, ACTIVATION_PARAMS); - - UNIT_BLOCK_WRITE8(dst_write0, 0, blockC00); - UNIT_BLOCK_WRITE8(dst_write0, 8 * OC_BLOCK, blockC01); + ACTIVATION_VEC8 pool_result; + OUTPUT_VEC8 final_result; + + #if HAS_FUSED_OPS + { + #define BLOCK_NUM 0 + pool_result = TO_ACTIVATION_VEC8(blockC00); + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + DT_OUTPUT_BLOCK_WRITE8(dst_write0, 0, final_result); + #undef BLOCK_NUM + } + { + #define BLOCK_NUM 1 + pool_result = TO_ACTIVATION_VEC8(blockC01); + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + DT_OUTPUT_BLOCK_WRITE8(dst_write0, 8 * OC_BLOCK, final_result); + #undef BLOCK_NUM + } + #else + pool_result = 
TO_ACTIVATION_VEC8(blockC00); + final_result = TO_OUTPUT_VEC8(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + DT_OUTPUT_BLOCK_WRITE8(dst_write0, 0, final_result); + + pool_result = TO_ACTIVATION_VEC8(blockC01); + final_result = TO_OUTPUT_VEC8(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + DT_OUTPUT_BLOCK_WRITE8(dst_write0, 8 * OC_BLOCK, final_result); + #endif } #undef INPUT0_SIZE_X_WITH_PADDING @@ -164,3 +202,13 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T #undef HAS_PAD_X #undef unroll_for +#undef INPUT_VEC8 + +#undef ACCUMULATOR_VEC8 +#undef TO_ACCUMULATOR_VEC8 + +#undef ACTIVATION_VEC8 +#undef TO_ACTIVATION_VEC8 + +#undef OUTPUT_VEC8 +#undef TO_OUTPUT_VEC8 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl index 9317c5113971c6..b3829ec6a96f7b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2017 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,22 +15,26 @@ #include "include/include_all.cl" -#define OUTPUT_TYPE4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) -#define TO_OUTPUT_TYPE4(x) CAT(convert_, OUTPUT_TYPE4)(x) +#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4) +#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4) + +#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4) + +#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) +#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4) #if MAX_POOLING - #define INIT_VAL INPUT0_VAL_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif AVG_POOLING - #define INIT_VAL 0 + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif - -inline int FUNC(apply_pooling)(int tmp, int in) +inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif @@ -61,7 +65,7 @@ KERNEL(pooling_gpu_byxf_af32)( const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - int4 result = INIT_VAL; + ACCUMULATOR_VEC4 result = INIT_VAL; #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || @@ -90,10 +94,10 @@ KERNEL(pooling_gpu_byxf_af32)( const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH; input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx))); - result[0] = FUNC_CALL(apply_pooling)(result[0], (int)input_data[0]); - result[1] = FUNC_CALL(apply_pooling)(result[1], (int)input_data[1]); - result[2] = FUNC_CALL(apply_pooling)(result[2], (int)input_data[2]); - result[3] = FUNC_CALL(apply_pooling)(result[3], (int)input_data[3]); + result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0])); + result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1])); + result[2] = 
FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2])); + result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3])); #ifdef DYNAMIC_KERNEL_DIVIDER num_elementes++; @@ -115,10 +119,10 @@ KERNEL(pooling_gpu_byxf_af32)( for(uint i = 0; i < POOL_SIZE_X; i++) { input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx))); - result[0] = FUNC_CALL(apply_pooling)(result[0], (int)input_data[0]); - result[1] = FUNC_CALL(apply_pooling)(result[1], (int)input_data[1]); - result[2] = FUNC_CALL(apply_pooling)(result[2], (int)input_data[2]); - result[3] = FUNC_CALL(apply_pooling)(result[3], (int)input_data[3]); + result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0])); + result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1])); + result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2])); + result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3])); input_idx += INPUT0_X_PITCH; } @@ -132,44 +136,54 @@ KERNEL(pooling_gpu_byxf_af32)( #if defined AVG_POOLING #if ENABLE_ROUND - int4 pool_result; + int4 not_fused_result; for (uint i = 0; i < 4; ++i) { #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - pool_result[i] = convert_int(round(((float)result[i] / max(num_elementes, (uint)1))); + not_fused_result[i] = convert_int(round(((float)result[i] / max(num_elementes, (uint)1))); #else - pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); + not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); #endif } #else // ENABLE_ROUND - float4 pool_result; + float4 not_fused_result; for (uint i = 0; i < 4; ++i) { #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - pool_result[i] = (float)result[i] / max(num_elementes, (uint)1); + not_fused_result[i] = 
(float)result[i] / max(num_elementes, (uint)1); #else - pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X); + not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X); #endif } #endif // ENABLE_ROUND #else // AVG_POOLING - int4 pool_result = result; + float4 not_fused_result = convert_float4(result); #endif // AVG_POOLING - OUTPUT_TYPE4 final_result; + OUTPUT_VEC4 final_result; #if HAS_FUSED_OPS + ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result); FUSED_OPS; final_result = FUSED_OPS_RESULT; + for(uint op = 0; op < 4; op++) + { + const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x); + output[output_pos] = final_result[op]; + } #else - final_result = TO_OUTPUT_TYPE4(pool_result); + final_result = TO_OUTPUT_VEC4(not_fused_result); + for(uint op = 0; op < 4; op++) + { + const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x); + final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS)); + output[output_pos] = final_result[op]; + } #endif - -for(uint op = 0; op < 4; op++) -{ - const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x); - output[output_pos] = ACTIVATION(TO_OUTPUT_TYPE(final_result[op]), ACTIVATION_PARAMS); -} - } #undef INIT_VAL -#undef OUTPUT_TYPE4 -#undef TO_OUTPUT_TYPE4 +#undef ACCUMULATOR_VEC4 + +#undef ACTIVATION_VEC4 +#undef TO_ACTIVATION_VEC4 + +#undef OUTPUT_VEC4 +#undef TO_OUTPUT_VEC4 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl index 293f56f6960b53..54bcab6d1b5b96 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, 
Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,38 +15,48 @@ #include "include/include_all.cl" -#define VECTOR_TYPE MAKE_VECTOR_TYPE(UNIT_TYPE,8) +#define INPUT_VEC8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) + +#define ACCUMULATOR_VEC8 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 8) +#define TO_ACCUMULATOR_VEC8 CAT(convert_, ACCUMULATOR_VEC8) + #define FEATURE_PER_ITEM 8 #define FEATURE_BLOCK_NUM (OUTPUT_FEATURE_NUM / 8) -#if defined MAX_POOLING - #define UNIT_INIT_VAL UNIT_VAL_MIN -#elif defined AVG_POOLING - #define UNIT_INIT_VAL UNIT_VAL_ZERO +#if MAX_POOLING + #define INIT_VAL ACCUMULATOR_VAL_MIN +#elif AVG_POOLING + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif -inline VECTOR_TYPE FUNC(apply_pooling)(VECTOR_TYPE tmp, VECTOR_TYPE in) +inline ACCUMULATOR_VEC8 FUNC(apply_pooling)(ACCUMULATOR_VEC8 tmp, ACCUMULATOR_VEC8 in) { #if defined MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif defined AVG_POOLING return tmp + in; #endif } -KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) +KERNEL(pooling_gpu_byxf_opt)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { - VECTOR_TYPE out; const uint x = (uint)get_global_id(0); const uint y = (uint)get_global_id(1); const uint bf = (uint)get_global_id(2); const uint f = bf / INPUT0_BATCH_NUM * FEATURE_PER_ITEM; const uint b = bf % INPUT0_BATCH_NUM; - - VECTOR_TYPE feature_block; - + + INPUT_VEC8 feature_block; + ACCUMULATOR_VEC8 result; + if ((x >= OUTPUT_SIZE_X) || (y >= OUTPUT_SIZE_Y)) return; @@ -54,8 +64,8 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE const int offset_y = (int)y*STRIDE_SIZE_Y; int input_idx = b*FEATURE_BLOCK_NUM*INPUT0_SIZE_X*INPUT0_SIZE_Y + FEATURE_BLOCK_NUM*INPUT0_SIZE_X*offset_y + FEATURE_BLOCK_NUM*offset_x + bf / INPUT0_BATCH_NUM; - - out = UNIT_INIT_VAL; + + 
result = INIT_VAL; __attribute__((opencl_unroll_hint)) for(uint j = 0; j < POOL_SIZE_Y; j++) @@ -64,20 +74,36 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE for(uint i = 0; i < POOL_SIZE_X; i++) { feature_block = vload8(input_idx+FEATURE_BLOCK_NUM*i, input); - out = FUNC_CALL(apply_pooling)(out, feature_block); + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_VEC8(feature_block)); } input_idx += FEATURE_BLOCK_NUM*INPUT0_SIZE_X; } + OUTPUT_TYPE final_result; + uint output_pos = GET_DATA_INDEX(OUTPUT, b, f, y, x); __attribute__((opencl_unroll_hint)) for(uint i = 0; i < FEATURE_PER_ITEM; i++) { if(f+i < INPUT0_FEATURE_NUM){ #if defined MAX_POOLING - output[output_pos+i] = ACTIVATION(out[i], ACTIVATION_PARAMS); + ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]); + #if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + #endif + output[output_pos+i] = final_result; #elif defined AVG_POOLING - output[output_pos+i] = ACTIVATION(out[i]/(UNIT_TYPE)(POOL_SIZE_X*POOL_SIZE_Y), ACTIVATION_PARAMS); + ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]/(OUTPUT_TYPE)(POOL_SIZE_X*POOL_SIZE_Y)); + #if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + #endif + output[output_pos+i] = final_result; #endif } } @@ -85,5 +111,9 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE #undef FEATURE_BLOCK_NUM #undef FEATURE_PER_ITEM -#undef UNIT_INIT_VAL -#undef VECTOR_TYPE \ No newline at end of file + +#undef INIT_VAL +#undef INPUT_VEC8 + +#undef ACCUMULATOR_VEC8 +#undef TO_ACCUMULATOR_VEC8 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl index 
ca0969047d842d..72a5a1c4c770af 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,38 +15,48 @@ #include "include/include_all.cl" -#define VECTOR_TYPE MAKE_VECTOR_TYPE(UNIT_TYPE,8) +#define INPUT0_VEC8 MAKE_VECTOR_TYPE(INPUT0_TYPE,8) + +#define ACCUMULATOR_VEC8 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 8) +#define TO_ACCUMULATOR_VEC8 CAT(convert_, ACCUMULATOR_VEC8) + #define FEATURE_PER_ITEM 8 -#define FEATURE_BLOCK_NUM (OUTPUT_FEATURE_NUM / 8) +#define FEATURE_BLOCK_NUM (INPUT0_FEATURE_NUM / 8) -#if defined MAX_POOLING - #define UNIT_INIT_VAL UNIT_VAL_MIN -#elif defined AVG_POOLING - #define UNIT_INIT_VAL UNIT_VAL_ZERO +#if MAX_POOLING + #define INIT_VAL ACCUMULATOR_VAL_MIN +#elif AVG_POOLING + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif -inline VECTOR_TYPE FUNC(apply_pooling)(VECTOR_TYPE tmp, VECTOR_TYPE in) +inline ACCUMULATOR_VEC8 FUNC(apply_pooling)(ACCUMULATOR_VEC8 tmp, ACCUMULATOR_VEC8 in) { #if defined MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif defined AVG_POOLING return tmp + in; #endif } -KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output) +KERNEL(pooling_gpu_byxf_opt)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { - VECTOR_TYPE out; const uint x = (uint)get_global_id(0); const uint y = (uint)get_global_id(1); const uint bf = (uint)get_global_id(2); const uint f = bf / INPUT0_BATCH_NUM * FEATURE_PER_ITEM; const uint b = bf % INPUT0_BATCH_NUM; - - VECTOR_TYPE feature_block; - + + 
INPUT0_VEC8 feature_block; + ACCUMULATOR_VEC8 result; + if ((x >= OUTPUT_SIZE_X) || (y >= OUTPUT_SIZE_Y)) return; @@ -62,7 +72,7 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE #endif int input_idx = b*FEATURE_BLOCK_NUM*INPUT0_SIZE_X*INPUT0_SIZE_Y + FEATURE_BLOCK_NUM*INPUT0_SIZE_X*offset_y + FEATURE_BLOCK_NUM*offset_x + bf / INPUT0_BATCH_NUM; - out = UNIT_INIT_VAL; + result = INIT_VAL; __attribute__((opencl_unroll_hint)) for(uint j = 0; j < POOL_SIZE_Y; j++) @@ -79,13 +89,15 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE if (!zero) { feature_block = vload8(input_idx+FEATURE_BLOCK_NUM*i, input); - out = FUNC_CALL(apply_pooling)(out, feature_block); + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_VEC8(feature_block)); } } } input_idx += FEATURE_BLOCK_NUM*INPUT0_SIZE_X; } + OUTPUT_TYPE final_result; + uint output_pos = GET_DATA_INDEX(OUTPUT, b, f, y, x); __attribute__((opencl_unroll_hint)) for(uint i = 0; i < FEATURE_PER_ITEM; i++) @@ -93,9 +105,23 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE if(f+i < INPUT0_FEATURE_NUM) { #if defined MAX_POOLING - output[output_pos+i] = ACTIVATION(out[i], ACTIVATION_PARAMS); + ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]); + #if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + #endif + output[output_pos+i] = final_result; #elif defined AVG_POOLING - output[output_pos+i] = ACTIVATION(out[i]/(UNIT_TYPE)(POOL_SIZE_X*POOL_SIZE_Y), ACTIVATION_PARAMS); + ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]/(OUTPUT_TYPE)(POOL_SIZE_X*POOL_SIZE_Y)); + #if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); + #endif + output[output_pos+i] = final_result; #endif } } @@ -103,5 +129,9 @@ 
KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE #undef FEATURE_BLOCK_NUM #undef FEATURE_PER_ITEM -#undef UNIT_INIT_VAL -#undef VECTOR_TYPE \ No newline at end of file + +#undef INIT_VAL +#undef INPUT0_VEC8 + +#undef ACCUMULATOR_VEC8 +#undef TO_ACCUMULATOR_VEC8 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl index aa3cc42040be79..7c98ee7953085c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,25 +14,26 @@ #include "include/include_all.cl" -#include "include/unit_type.cl" +#include "include/data_types.cl" #if MAX_POOLING - #define INIT_VAL UNIT_VAL_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif AVG_POOLING - #define INIT_VAL 0 + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error No correct pooling mode defined + #error No correct pooling mode defined #endif -#if defined(USE_FLOAT_ACC) - #define ACC_TYPE2 float2 - #define READ_BLOCK2_INPUT(input, input_total_offset) convert_float2(UNIT_BLOCK_READ2(input,total_input_offset)) - #define TO_UNIT_BLOCK2(values) convert_half2(values) -#else - #define ACC_TYPE2 UNIT_TYPE2 - #define READ_BLOCK2_INPUT(input, input_total_offset) UNIT_BLOCK_READ2(input,total_input_offset) - #define TO_UNIT_BLOCK2(values) values -#endif +#define INPUT_VEC2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2) + +#define ACCUMULATOR_VEC2 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 2) +#define TO_ACCUMULATOR_VEC2 CAT(convert_, ACCUMULATOR_VEC2) + +#define ACTIVATION_VEC2 
MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 2) +#define TO_ACTIVATION_VEC2 CAT(convert_, ACTIVATION_VEC2) + +#define OUTPUT_VEC2 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) +#define TO_OUTPUT_VEC2 CAT(convert_, OUTPUT_VEC2) #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) @@ -46,10 +47,10 @@ #define unroll_for __attribute__((opencl_unroll_hint)) for -inline ACC_TYPE2 FUNC(apply_pooling)(ACC_TYPE2 tmp, ACC_TYPE2 in) +inline ACCUMULATOR_VEC2 FUNC(apply_pooling)(ACCUMULATOR_VEC2 tmp, ACCUMULATOR_VEC2 in) { #if MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif @@ -57,8 +58,12 @@ inline ACC_TYPE2 FUNC(apply_pooling)(ACC_TYPE2 tmp, ACC_TYPE2 in) __attribute__((intel_reqd_sub_group_size(REQD_SUB_GROUP_SIZE))) KERNEL(pooling_gpu_fs_b_yx_fsv32)( - const __global UNIT_TYPE* input, - __global UNIT_TYPE* output) + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { const uint out_x = (uint)get_global_id(0); const uint out_y = (uint)get_global_id(1); @@ -69,12 +74,12 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( const uint b = bfs % INPUT0_BATCH_NUM; const uint fs = bfs / INPUT0_BATCH_NUM; - ACC_TYPE2 results = (ACC_TYPE2)(INIT_VAL,INIT_VAL); + ACCUMULATOR_VEC2 results = (ACCUMULATOR_VEC2)(INIT_VAL,INIT_VAL); const uint x_pitch = REQD_FEATURE_SLICE_SIZE; // difference in location between (x+1) and (x) const uint y_pitch = x_pitch * INPUT0_SIZE_X_WITH_PADDING; // difference in location between (y+1) and (y) const uint b_pitch = y_pitch * INPUT0_SIZE_Y_WITH_PADDING; // difference in location between (b+1) and (b) - const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM; // difference in location between (fs+1) and (fs) + const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM; // difference in location between (fs+1) and (fs) 
const int offset_x = (int)out_x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)out_y*STRIDE_SIZE_Y - PADDING_SIZE_Y; @@ -103,10 +108,8 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( { const size_t input_offset_x = (offset_x + in_dx) * x_pitch; const size_t total_input_offset = padding_offset + fs_offset + b_offset + input_offset_y + input_offset_x; - - ACC_TYPE2 tmp_input = READ_BLOCK2_INPUT(input, input_total_offset); - - results = FUNC_CALL(apply_pooling)(results, tmp_input); + INPUT_VEC2 tmp_input = DT_INPUT_BLOCK_READ2(input, total_input_offset); + results = FUNC_CALL(apply_pooling)(results , TO_ACCUMULATOR_VEC2(tmp_input)); #ifdef DYNAMIC_KERNEL_DIVIDER num_elements++; @@ -115,6 +118,7 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( } } } + #ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y); const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X); @@ -128,10 +132,8 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( { const size_t input_offset_x = (offset_x + in_dx) * x_pitch; const size_t total_input_offset = padding_offset + fs_offset + b_offset + input_offset_y + input_offset_x; - - ACC_TYPE2 tmp_input = READ_BLOCK2_INPUT(input, input_total_offset); - - results = FUNC_CALL(apply_pooling)(results, tmp_input); + INPUT_VEC2 tmp_input = DT_INPUT_BLOCK_READ2(input, total_input_offset); + results = FUNC_CALL(apply_pooling)(results , TO_ACCUMULATOR_VEC2(tmp_input)); } } #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) @@ -147,8 +149,6 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( #endif #endif - results = ACTIVATION(results, ACTIVATION_PARAMS); - const size_t out_x_pitch = REQD_FEATURE_SLICE_SIZE; const size_t out_y_pitch = out_x_pitch * OUTPUT_SIZE_X_WITH_PADDING; const size_t out_b_pitch = out_y_pitch * OUTPUT_SIZE_Y_WITH_PADDING; @@ -166,9 +166,19 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( const bool full_f = OUTPUT_FEATURE_NUM % REQD_FEATURE_SLICE_SIZE == 0 || fs * 
REQD_FEATURE_SLICE_SIZE + REQD_FEATURE_SLICE_SIZE <= OUTPUT_FEATURE_NUM; + OUTPUT_VEC2 final_result; + ACTIVATION_VEC2 pool_result = TO_ACTIVATION_VEC2(results); + + #if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + final_result = TO_OUTPUT_VEC2(ACTIVATION(pool_result , ACTIVATION_PARAMS)); + #endif + if (full_f) { - UNIT_BLOCK_WRITE2(output, output_offset, TO_UNIT_BLOCK2(results)); + DT_OUTPUT_BLOCK_WRITE2(output, output_offset, final_result); } else { @@ -176,14 +186,21 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( { if (fs * REQD_FEATURE_SLICE_SIZE + ofi * REQD_SUB_GROUP_SIZE + sglid < OUTPUT_FEATURE_NUM) { - output[output_offset + ofi * REQD_SUB_GROUP_SIZE + sglid] = (UNIT_TYPE)results[ofi]; + output[output_offset + ofi * REQD_SUB_GROUP_SIZE + sglid] = (OUTPUT_TYPE)final_result[ofi]; } } } } -#undef TO_UNIT_BLOCK2 -#undef READ_BLOCK2_INPUT -#undef ACC_TYPE2 #undef FEATURE_SLICE_SIZE #undef INIT_VAL +#undef INPUT_VEC2 + +#undef ACCUMULATOR_VEC2 +#undef TO_ACCUMULATOR_VEC2 + +#undef ACTIVATION_VEC2 +#undef TO_ACTIVATION_VEC2 + +#undef OUTPUT_VEC2 +#undef TO_OUTPUT_VEC2 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl index d81490d7fa3130..4439732718cce3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,19 +15,26 @@ #include "include/include_all.cl" +#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4) +#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4) + +#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4) + +#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE,4) +#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4) + #if MAX_POOLING - #define INIT_VAL CHAR_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif AVG_POOLING - #define INIT_VAL 0 + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif - -inline int FUNC(apply_pooling)(int tmp, int in) +inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif @@ -35,8 +42,12 @@ inline int FUNC(apply_pooling)(int tmp, int in) __attribute__((intel_reqd_sub_group_size(8))) KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( - const __global UNIT_TYPE* input, - __global UNIT_TYPE* output) + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { const uint x = (uint)get_global_id(0); const uint y = (uint)get_global_id(1); @@ -44,8 +55,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( // we process 4 features per workitem that's why we need to divide it const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32; const uint f = ((uint)get_global_id(2) * 4) % aligned32_features; - const uint b = 4 * (((uint)get_global_id(2) * 4) / aligned32_features); - + const uint b = 4 * (((uint)get_global_id(2) * 4) / aligned32_features); if (x >= OUTPUT_SIZE_X) { return; @@ -53,8 +63,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - - int4 result[4] = { INIT_VAL }; + ACCUMULATOR_VEC4 result[4] = { INIT_VAL }; #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x 
>= INPUT0_SIZE_X || @@ -86,13 +95,12 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( for(uint b = 0; b < 4; b++) { char4 input_data = as_char4(int_data[b]); - result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]); - result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]); - result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]); - result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]); - + result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0])); + result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1])); + result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2])); + result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3])); } - + #ifdef DYNAMIC_KERNEL_DIVIDER num_elementes++; #endif @@ -116,54 +124,104 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)( for(uint b = 0; b < 4; b++) { char4 input_data = as_char4(int_data[b]); - result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]); - result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]); - result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]); - result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]); + result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0])); + result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1])); + result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2])); + result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3])); } input_idx += IN_X_PITCH; } input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH); } - + #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y; #endif #endif #if defined AVG_POOLING - #if 
defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - for(uint b = 0; b < 4; b++) - { - for(uint i = 0; i < 4; i++) + #if ENABLE_ROUND + #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) + for(uint b = 0; b < 4; b++) { - result[b][i] = convert_int(round(((float)result[b][i] / max(num_elementes, (uint)1))); + for(uint i = 0; i < 4; i++) + { + result[b][i] = TO_ACCUMULATOR_TYPE(round(((float)result[b][i] / max(num_elementes, (uint)1)))); + } } - } + #else + for(uint b = 0; b < 4; b++) + { + for(uint i = 0; i < 4; i++) + { + result[b][i] = TO_ACCUMULATOR_TYPE(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); + } + } + #endif #else - for(uint b = 0; b < 4; b++) - { - for(uint i = 0; i < 4; i++) + #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) + for(uint b = 0; b < 4; b++) + { + for(uint i = 0; i < 4; i++) + { + result[b][i] = TO_ACCUMULATOR_TYPE(((float)result[b][i] / max(num_elementes, (uint)1))); + } + } + #else + for(uint b = 0; b < 4; b++) { - result[b][i] = convert_int(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X))); + for(uint i = 0; i < 4; i++) + { + result[b][i] = TO_ACCUMULATOR_TYPE((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)); + } } - } - #endif -#endif + #endif + #endif // ENABLE_ROUND +#endif // AVG_POOLING - int4 char_result; - for(uint b = 0; b < 4; b++) +#if OUTPUT_TYPE_SIZE == 1 + int4 final_result; + + for(uint bi = 0; bi < 4; bi++) { - char4 char_res = as_char4(char_result[b]); - for(uint op = 0; op < 4; op++) - { - char_res[op] = ACTIVATION(convert_char(result[b][op]), ACTIVATION_PARAMS); - } - char_result[b] = as_int(char_res); + #if HAS_FUSED_OPS + ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(convert_char4(result[bi])); + FUSED_OPS; + final_result[bi] = as_int(FUSED_OPS_RESULT); + #else + char4 char_result = ACTIVATION(convert_char4(result[bi]), ACTIVATION_PARAMS); + final_result[bi] = as_int(char_result); + 
#endif } const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); - intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(char_result)); + intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(final_result)); + +#elif OUTPUT_TYPE_SIZE == 2 || OUTPUT_TYPE_SIZE == 4 + OUTPUT_VEC4 final_result; + + for(uint bi = 0; bi < 4; bi++) + { + #if HAS_FUSED_OPS + ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result[bi])); + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + char4 char_result = ACTIVATION(TO_OUTPUT_VEC4(result[bi]), ACTIVATION_PARAMS); + final_result = TO_OUTPUT_VEC4(char_result); + #endif + const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b + bi, f, y, x); + vstore4(final_result, 0, output + output_pos); + } +#endif } #undef INIT_VAL +#undef ACCUMULATOR_VEC4 +#undef ACCUMULATOR_VEC4 + +#undef ACTIVATION_VEC4 +#undef TO_ACTIVATION_VEC4 + +#undef OUTPUT_VEC4 +#undef TO_OUTPUT_VEC4 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl index 19d7e50122d3f6..f439e9e6e300ea 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2018 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,28 +15,35 @@ #include "include/include_all.cl" +#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4) +#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4) + +#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4) + +#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) +#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4) + #if MAX_POOLING - #define INIT_VAL CHAR_MIN -#elif AVG_POOLING - #define INIT_VAL 0 + #define INIT_VAL ACCUMULATOR_VAL_MIN #else -#error + #error #endif - -inline int FUNC(apply_pooling)(int tmp, int in) +inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING - return max(tmp, in); -#elif AVG_POOLING - return tmp + in; + return ACCUMULATOR_MAX_FUNC(tmp, in); #endif } __attribute__((intel_reqd_sub_group_size(32))) KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)( - const __global UNIT_TYPE* input, - __global UNIT_TYPE* output) + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif +) { const uint x = (uint)get_group_id(0); const uint y = (uint)get_group_id(1); @@ -45,7 +52,6 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)( const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32; const uint f = ((bf * 32) % aligned32_features) + (get_sub_group_local_id() % 8) * 4; const uint b = 4 * ((bf * 32) / aligned32_features) + (get_sub_group_local_id() / 8); - if (x >= OUTPUT_SIZE_X) { return; @@ -53,8 +59,8 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)( const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - - int4 result = INIT_VAL; + + ACCUMULATOR_VEC4 result = INIT_VAL; if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y) @@ -81,23 +87,38 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)( int int_data = as_int(input_uint[0]); char4 input_data = zero ? 
(char4)(INIT_VAL,INIT_VAL,INIT_VAL,INIT_VAL) : as_char4(int_data); - result[0] = FUNC_CALL(apply_pooling)((int)result[0], (int)input_data[0]); - result[1] = FUNC_CALL(apply_pooling)((int)result[1], (int)input_data[1]); - result[2] = FUNC_CALL(apply_pooling)((int)result[2], (int)input_data[2]); - result[3] = FUNC_CALL(apply_pooling)((int)result[3], (int)input_data[3]); + result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0])); + result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1])); + result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2])); + result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3])); } } - char4 char_res; - for(uint op = 0; op < 4; op++) - { - char_res[op] = ACTIVATION(convert_char(result[op]), ACTIVATION_PARAMS); - } + OUTPUT_VEC4 final_result; - const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); + #if HAS_FUSED_OPS + ACTIVATION_VEC4 pool_result; + pool_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result)); + FUSED_OPS; + final_result = FUSED_OPS_RESULT; + #else + char4 pool_result; + for(uint op = 0; op < 4; op++) + { + pool_result[op] = ACTIVATION(TO_OUTPUT_TYPE(result[op]), ACTIVATION_PARAMS); + } + final_result = TO_OUTPUT_VEC4(pool_result); + #endif - __global uint* output_uint = (__global uint*)(output + output_pos); - output_uint[0] = as_uint(char_res); + const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x); + *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result; } #undef INIT_VAL +#undef ACCUMULATOR_VEC4 + +#undef ACTIVATION_VEC4 +#undef TO_ACTIVATION_VEC4 + +#undef OUTPUT_VEC4 +#undef TO_OUTPUT_VEC4 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl index cdb4cd127ba835..244d32fa19cee1 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2017 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,17 +16,17 @@ #include "include/include_all.cl" #if MAX_POOLING - #define INIT_VAL CHAR_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif AVG_POOLING - #define INIT_VAL 0 + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif -inline int FUNC(apply_pooling)(int tmp, int in) +inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif @@ -84,6 +84,7 @@ KERNEL(pooling_gpu_int8_ref)( const uint bf = (uint)get_global_id(0); const uint f = bf / INPUT0_BATCH_NUM; const uint b = bf % INPUT0_BATCH_NUM; + const uint z = 0; #elif OUTPUT_LAYOUT_B_FS_YX_FSV16 const uint x = get_global_id(1); const uint y = get_global_id(2); @@ -92,14 +93,14 @@ KERNEL(pooling_gpu_int8_ref)( const uint b = bf % INPUT0_BATCH_NUM; const uint z = 0; #else -#error "pooling_int8_ref: unsupported layout" + #error "pooling_int8_ref: unsupported layout" #endif const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z; - int result = INIT_VAL; + ACCUMULATOR_TYPE result = INIT_VAL; #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || @@ -138,8 +139,7 @@ KERNEL(pooling_gpu_int8_ref)( #else const uint input_idx = INPUT0_GET_INDEX(b, f, input_offset_y, input_offset_x); #endif - - result = FUNC_CALL(apply_pooling)(result, (int)input[input_idx]); + result = 
FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); #ifdef DYNAMIC_KERNEL_DIVIDER num_elementes++; @@ -180,7 +180,7 @@ KERNEL(pooling_gpu_int8_ref)( #else uint input_idx = INPUT0_GET_INDEX(b, f, offset_y + j, offset_x + i); #endif - result = FUNC_CALL(apply_pooling)(result, (int)input[input_idx]); + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); } } } @@ -194,26 +194,29 @@ KERNEL(pooling_gpu_int8_ref)( #if defined AVG_POOLING #if ENABLE_ROUND #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - int pool_res = convert_int(round((float)result / max(num_elementes, (uint)1))); + int not_fused_result = convert_int(round((float)result / max(num_elementes, (uint)1))); #else - int pool_res = convert_int(round((float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X))); + int not_fused_result = convert_int(round((float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X))); #endif #else // ENABLE_ROUND #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) - float pool_res = (float)result / max(num_elementes, (uint)1); + float not_fused_result = (float)result / max(num_elementes, (uint)1); #else - float pool_res = (float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X); + float not_fused_result = (float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X); #endif #endif // ENABLE_ROUND #else // defined AVG_POOLING - int pool_res = result; + int not_fused_result = result; #endif // defined AVG_POOLING + OUTPUT_TYPE final_result; + ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(not_fused_result); + #if HAS_FUSED_OPS FUSED_OPS; - OUTPUT_TYPE dst = FUSED_OPS_RESULT; + final_result = FUSED_OPS_RESULT; #else // HAS_FUSED_OPS - OUTPUT_TYPE dst = TO_OUTPUT_TYPE(pool_res); + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); #endif // HAS_FUSED_OPS #if OUTPUT_DIMS == 5 @@ -221,7 +224,7 @@ KERNEL(pooling_gpu_int8_ref)( #else 
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x); #endif - output[output_pos] = ACTIVATION(dst, ACTIVATION_PARAMS); + output[output_pos] = final_result; } #undef INIT_VAL diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl index 9d260c2eedb352..999ea6eb3220b9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,38 +16,42 @@ #include "include/include_all.cl" #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING - #define UNIT_INIT_VAL UNIT_VAL_MIN + #define INIT_VAL ACCUMULATOR_VAL_MIN #elif AVG_POOLING - #define UNIT_INIT_VAL UNIT_VAL_ZERO + #define INIT_VAL ACCUMULATOR_VAL_ZERO #else -#error + #error #endif - inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in) { #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING - return max(tmp, in); + return ACCUMULATOR_MAX_FUNC(tmp, in); #elif AVG_POOLING return tmp + in; #endif } -KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output +KERNEL(pooling_gpu)( + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output #if MAX_WITH_ARGMAX_POOLING , __global float* arg_max #endif +#if HAS_FUSED_OPS_DECLS + , FUSED_OPS_DECLS +#endif ) { -#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX || OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || \ - OUTPUT_LAYOUT_B_FS_YX_FSV32 || OUTPUT_LAYOUT_B_FS_ZYX_FSV32 +#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\ + OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || 
OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 const uint x = (uint)get_global_id(0); -#if OUTPUT_DIMS < 5 - const uint y = (uint)get_global_id(1); - const uint z = 0; +#if OUTPUT_DIMS == 5 + const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; + const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y; #else - const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; - const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y; + const uint y = (uint)get_global_id(1); + const uint z = 0; #endif const uint bf = (uint)get_global_id(2); const uint f = bf % INPUT0_FEATURE_NUM; @@ -57,32 +61,51 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output { return; } +#elif OUTPUT_LAYOUT_B_FS_YX_FSV32 || OUTPUT_LAYOUT_B_FS_ZYX_FSV32 + const uint fsv = get_global_id(0); + const uint zyx = get_global_id(1); + const uint fsb = get_global_id(2); + + const uint x = zyx % OUTPUT_SIZE_X; +#if OUTPUT_DIMS == 5 + const uint y = zyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y; + const uint z = zyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y; +#else + const uint y = zyx / OUTPUT_SIZE_X; + const uint z = 0; +#endif + const uint fs = fsb % ((OUTPUT_FEATURE_NUM + 32 - 1) / 32); + const uint b = fsb / ((OUTPUT_FEATURE_NUM + 32 - 1) / 32); + const uint f = fs * 32 + fsv; + + if (f >= OUTPUT_FEATURE_NUM) { + return; + } #elif OUTPUT_LAYOUT_YXFB const uint x = (uint)get_global_id(1); const uint y = (uint)get_global_id(2); - const uint z = 0; const uint bf = (uint)get_global_id(0); const uint f = bf / INPUT0_BATCH_NUM; const uint b = bf % INPUT0_BATCH_NUM; + const uint z = 0; +#else + #error "pooling_gpu_ref: unsupported layout" #endif const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y; const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z; - ACCUMULATOR_TYPE result = UNIT_INIT_VAL; + ACCUMULATOR_TYPE result = INIT_VAL; #if MAX_WITH_ARGMAX_POOLING uint arg_max_idx = 0; #endif #ifdef CHECK_BOUNDRY - bool out_of_boundry = offset_x + 
POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || - offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y; - #if INPUT0_SIZE_Z != 1 - out_of_boundry = out_of_boundry || offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z; - #endif - if (out_of_boundry) + if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || + offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y || + offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z) { return; } @@ -91,122 +114,140 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output uint num_elementes = 0; #endif - const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0); -#if OUTPUT_DIMS == 5 // 3D - for(uint k = 0; k < POOL_SIZE_Z; k++) +#if OUTPUT_DIMS == 5 + const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0); +#else + const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0); +#endif + +#if OUTPUT_DIMS == 5 + for(uint l = 0; l < POOL_SIZE_Z; l++) { - int input_offset_z = offset_z + k; + int input_offset_z = offset_z + l; bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0; - if(!zero_z) + if (!zero_z) { #endif - for(uint j = 0; j < POOL_SIZE_Y; j++) - { - int input_offset_y = offset_y + j; - bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - if(!zero_y) - { - for(uint i = 0; i < POOL_SIZE_X; i++) + for(uint j = 0; j < POOL_SIZE_Y; j++) { - int input_offset_x = offset_x + i; - bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - if(!zero) + int input_offset_y = offset_y + j; + bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; + if(!zero_y) { -#if OUTPUT_DIMS < 5 - const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH; + for(uint i = 0; i < POOL_SIZE_X; i++) + { + int input_offset_x = offset_x + i; + bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; + if(!zero) + { +#if OUTPUT_DIMS == 5 + #if !INPUT0_SIMPLE + 
const uint input_idx = INPUT0_GET_INDEX(b, f, input_offset_z, input_offset_y, input_offset_x); + #else + const uint input_idx = batch_and_feature_offset + input_offset_z*INPUT0_Z_PITCH + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH; + #endif #else - #if OUTPUT_LAYOUT_B_FS_ZYX_FSV16 - const uint input_idx = GET_DATA_B_FS_ZYX_FSV16_INDEX(INPUT0, b, f, input_offset_z, input_offset_y, input_offset_x); - #elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 - const uint input_idx = GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(INPUT0, b, f, input_offset_z, input_offset_y, input_offset_x); - #else - const uint input_idx = batch_and_feature_offset + input_offset_z*INPUT0_Z_PITCH + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH; - #endif + #if !INPUT0_SIMPLE + const uint input_idx = INPUT0_GET_INDEX(b, f, input_offset_y, input_offset_x); + #else + const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH; + #endif #endif #if MAX_WITH_ARGMAX_POOLING - if(input[input_idx] > result) - { + if(input[input_idx] > result) + { #if OUTPUT_DIMS < 5 - const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b)); + const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b)); #else - const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y * + const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y * (input_offset_z + INPUT0_SIZE_Z * (f + INPUT0_FEATURE_NUM * b))); #endif - arg_max_idx = input_idx_bfyx_no_padding; - } + arg_max_idx = input_idx_bfyx_no_padding; + } #endif - result = FUNC_CALL(apply_pooling)(result, input[input_idx]); + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); #ifdef DYNAMIC_KERNEL_DIVIDER - num_elementes++; + 
num_elementes++; #endif + } + } } } - } - } -#if OUTPUT_DIMS == 5 // 3D +#if OUTPUT_DIMS == 5 } } #endif + #ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER -#if INPUT0_SIZE_Z != 1 - const int dend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z); -#endif const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y); const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X); -#if INPUT0_SIZE_Z == 1 - const uint num_elementes = (hend - offset_y) * (wend - offset_x); +#if OUTPUT_DIMS == 5 + const int zend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z); + const uint num_elementes = (hend - offset_y) * (wend - offset_x) * (zend - offset_z); #else - const uint num_elementes = (dend - offset_z) * (hend - offset_y) * (wend - offset_x); -#endif + const uint num_elementes = (hend - offset_y) * (wend - offset_x); #endif -#else + +#endif // DYNAMIC_WITH_PADDING_KERNEL_DIVIDER + +#else // CHECK_BOUNDRY + #if OUTPUT_DIMS == 5 // 3D - uint input_idx = GET_DATA_INDEX_5D(INPUT0, b, f, offset_z, offset_y, offset_x); + uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x); #else - uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x); + uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x); #endif #if MAX_WITH_ARGMAX_POOLING -#if OUTPUT_DIMS < 5 - uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b)); -#else - uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (offset_z + INPUT0_SIZE_Z *(f + INPUT0_FEATURE_NUM * b))); -#endif + #if OUTPUT_DIMS < 5 + uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b)); + #else + uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (offset_z + INPUT0_SIZE_Z *(f + INPUT0_FEATURE_NUM * b))); + #endif #endif -#if OUTPUT_DIMS == 5 // 3D - for(uint k = 0; k < 
POOL_SIZE_Z; k++) +#if OUTPUT_DIMS == 5 + for(uint l = 0; l < POOL_SIZE_Z; l++) { #endif - for(uint j = 0; j < POOL_SIZE_Y; j++) - { - for(uint i = 0; i < POOL_SIZE_X; i++) + for(uint j = 0; j < POOL_SIZE_Y; j++) { - + for(uint i = 0; i < POOL_SIZE_X; i++) + { #if MAX_WITH_ARGMAX_POOLING if(input[input_idx] > result) arg_max_idx = input_idx_bfyx_no_padding; #endif -#if INPUT0_LAYOUT_B_FS_ZYX_FSV16 - uint input1_idx = INPUT0_GET_INDEX(b, f, offset_z+k, offset_y+j, offset_x+i); - result = FUNC_CALL(apply_pooling)(result, input[input1_idx]); +#if OUTPUT_DIMS == 5 + #if !INPUT0_SIMPLE + uint input_idx = INPUT0_GET_INDEX(b, f, offset_z + l, offset_y + j, offset_x + i); + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); + #else + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); + input_idx += INPUT0_X_PITCH; + #endif #else - result = FUNC_CALL(apply_pooling)(result, input[input_idx]); + #if !INPUT0_SIMPLE + uint input_idx = INPUT0_GET_INDEX(b, f, offset_y + j, offset_x + i); + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); + #else + result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx])); + input_idx += INPUT0_X_PITCH; + #endif #endif - input_idx += INPUT0_X_PITCH; #if MAX_WITH_ARGMAX_POOLING - input_idx_bfyx_no_padding++; + input_idx_bfyx_no_padding++; #endif - } - input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH); + } + input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH); #if MAX_WITH_ARGMAX_POOLING - input_idx_bfyx_no_padding += (INPUT0_SIZE_X - POOL_SIZE_X); + input_idx_bfyx_no_padding += (INPUT0_SIZE_X - POOL_SIZE_X); #endif - } + } #if OUTPUT_DIMS == 5 // 3D input_idx += (INPUT0_Z_PITCH - POOL_SIZE_Y*INPUT0_Y_PITCH); #if MAX_WITH_ARGMAX_POOLING @@ -218,7 +259,8 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) 
const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z; #endif -#endif + +#endif // CHECK_BOUNDRY #if defined AVG_POOLING #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER) @@ -226,23 +268,30 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output #else result /= (ACCUMULATOR_TYPE)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X); #endif -#endif +#endif // defined AVG_POOLING -#if OUTPUT_LAYOUT_B_FS_ZYX_FSV16 - const uint output_pos = GET_DATA_B_FS_ZYX_FSV16_INDEX(OUTPUT, b, f, z, y, x); -#elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 - const uint output_pos = GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(OUTPUT, b, f, z, y, x); + OUTPUT_TYPE final_result; + ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result); + +#if HAS_FUSED_OPS + FUSED_OPS; + final_result = FUSED_OPS_RESULT; +#else // HAS_FUSED_OPS + final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS)); +#endif // HAS_FUSED_OPS + +#if OUTPUT_DIMS == 5 + const uint output_pos = OUTPUT_GET_INDEX(b, f, z, y, x); #else - const uint output_pos = GET_DATA_INDEX_5D(OUTPUT, b, f, z, y, x); + const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x); #endif - output[output_pos] = ACTIVATION(TO_UNIT_TYPE(result), ACTIVATION_PARAMS); + output[output_pos] = final_result; #if MAX_WITH_ARGMAX_POOLING //INPUT1 macro stands for Argmax const uint arg_max_pos = GET_DATA_INDEX_5D(INPUT1, b, f, z, y, x); arg_max[arg_max_pos] = convert_float(arg_max_idx); #endif - } -#undef UNIT_INIT_VAL +#undef INIT_VAL diff --git a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp index d2e08023ac2135..a0e6533b2dcab2 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp @@ -115,13 +115,13 @@ struct pooling_gpu : typed_primitive_gpu_impl { } // check if last pooling window goes outside of input size + padding. 
If so the avg pooling size will be - // adjusted to that. + // adjusted to that. To work properly, this calculation must take pad_end into account. auto dynamic_mode = (((output_sizes.spatial[0] - 1) * stride.spatial[0]) + primitive->size.spatial[0]) > - -2 * input_offset.spatial[0] + input_sizes.spatial[0] || + (-input_offset.spatial[0] - primitive->pad_end.spatial[0]) + input_sizes.spatial[0] || (((output_sizes.spatial[1] - 1) * stride.spatial[1]) + primitive->size.spatial[1]) > - -2 * input_offset.spatial[1] + input_sizes.spatial[1] || + (-input_offset.spatial[1] - primitive->pad_end.spatial[1]) + input_sizes.spatial[1] || (((output_sizes.spatial[2] - 1) * stride.spatial[2]) + primitive->size.spatial[2]) > - -2 * input_offset.spatial[2] + input_sizes.spatial[2]; + (-input_offset.spatial[2] - primitive->pad_end.spatial[2]) + input_sizes.spatial[2]; if (primitive->mode == pooling_mode::average && dynamic_mode) pp.divMode = kernel_selector::kernel_divider_mode::DYNAMIC_WITH_PADDING; @@ -196,6 +196,7 @@ attach_pooling_gpu::attach_pooling_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create); // MMAD @@ -214,6 +215,9 @@ attach_pooling_gpu::attach_pooling_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, 
format::b_fs_zyx_fsv32), pooling_gpu::create); // implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::fs_b_yx_fsv32), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::fs_b_yx_fsv32), pooling_gpu::create); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_b_yx_fsv32), pooling_gpu::create); } } // namespace detail diff --git a/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp index 34db0348382a31..2823f73fcd5fda 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp @@ -104,6 +104,10 @@ attach_quantize_gpu::attach_quantize_gpu() { auto val_fw = quantize_gpu::create; implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::fs_b_yx_fsv32), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_b_yx_fsv32), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::fs_b_yx_fsv32), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw); @@ -134,12 +138,28 @@ attach_quantize_gpu::attach_quantize_gpu() { implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, 
format::b_fs_zyx_fsv32), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw); + + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw); + 
implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), val_fw); implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), val_fw); diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index 66cba243eff45d..591efacc8a737e 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -36,6 +36,7 @@ #include "lrn_inst.h" #include "mutable_data_inst.h" #include "mvn_inst.h" +#include "pooling_inst.h" #include "normalize_inst.h" #include "permute_inst.h" #include "reshape_inst.h" @@ -328,6 +329,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { return false; }; + auto pooling_supports_fusings = [](pooling_node& node) -> bool { + auto pooling_mode = node.as().get_primitive()->mode; + + if (pooling_mode != cldnn::pooling_mode::max_with_argmax) + return true; + + return false; + }; + auto fuse_activation_f = [&](activation_node& activation_node) { auto& input_data = activation_node.get_dependency(0); if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3) @@ -341,13 +351,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type() && gemm_supports_fusings(input_data.as()); - should_fuse |= input_data.is_type(); - - should_fuse |= input_data.is_type() && - (input_data.get_dependency(0).get_output_layout().data_type == data_types::i8 || - input_data.get_dependency(0).get_output_layout().data_type == data_types::u8) && - (input_data.as().get_primitive()->mode == pooling_mode::average || - input_data.as().get_primitive()->mode == pooling_mode::average_no_padding); + should_fuse |= input_data.is_type() && pooling_supports_fusings(input_data.as()); 
should_fuse |= input_data.is_type(); @@ -357,6 +361,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type(); + if (!should_fuse) return; @@ -380,13 +386,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type() && gemm_supports_fusings(input_data.as()); - should_fuse |= input_data.is_type(); - - should_fuse |= input_data.is_type() && - (input_data.get_dependency(0).get_output_layout().data_type == data_types::i8 || - input_data.get_dependency(0).get_output_layout().data_type == data_types::u8) && - (input_data.as().get_primitive()->mode == pooling_mode::average || - input_data.as().get_primitive()->mode == pooling_mode::average_no_padding); + should_fuse |= input_data.is_type() && pooling_supports_fusings(input_data.as()); should_fuse |= input_data.is_type(); @@ -396,6 +396,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { should_fuse |= input_data.is_type(); + should_fuse |= input_data.is_type(); + if (!should_fuse) return; @@ -434,13 +436,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) { input_data.get_dependency(0).get_output_layout().data_type == data_types::i8) && (out_layout.data_type == data_types::u8 || out_layout.data_type == data_types::i8))); - should_fuse |= input_data.is_type() && - quantize_node.get_scale_shift_opt() && - // TODO: unify pooling ref and ref_int8 kernels and remove this restriction on precision - (input_data.get_dependency(0).get_output_layout().data_type == data_types::u8 || - input_data.get_dependency(0).get_output_layout().data_type == data_types::i8) && - (input_data.as().get_primitive()->mode == pooling_mode::average || - input_data.as().get_primitive()->mode == pooling_mode::average_no_padding); + should_fuse |= input_data.is_type() && quantize_node.get_scale_shift_opt() && + pooling_supports_fusings(input_data.as()); 
should_fuse |= input_data.is_type() && fc_supports_fusings(input_data.as()) && quantize_node.get_scale_shift_opt() && diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp index a3bbae5806b8c3..1d81204c3f7df6 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -2554,117 +2554,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, mvn_scale_activation_quantize_u8_eltwise_fp mvn_test_params{ CASE_MVN_3D_U8_2, 2, 7 }, }), ); -/* ----------------------------------------------------------------------------------------------------- */ -/* --------------------------------------- Pooling cases ----------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ -struct pooling_test_params { - tensor input_size; - data_types input_type; - format input_format; - pooling_mode mode; - tensor kernel_size; - tensor stride; - tensor offset; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_POOLING_F32_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx -#define CASE_POOLING_F32_2 {2, 16, 8, 8}, data_types::f32, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx - -#define CASE_POOLING_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, 
tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx -#define CASE_POOLING_F16_2 {2, 16, 8, 8}, data_types::f16, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx - -#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx -#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx -#define CASE_POOLING_U8_3 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx - -#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx -#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx -#define CASE_POOLING_I8_3 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx - -class PoolingFusingTest : public ::BaseFusingTest { -public: - void execute(pooling_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(pooling_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(pooling_test_params& p) { 
- return layout{ p.default_type, p.default_format, tensor{1, p.input_size.feature[0], 1, 1} }; - } -}; - -class pooling_activation : public PoolingFusingTest {}; -TEST_P(pooling_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset), - activation("act", "pooling", activation_func::relu), - reorder("reorder_bfyx", "act", format::bfyx, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_activation, - ::testing::ValuesIn(std::vector{ - pooling_test_params{ CASE_POOLING_F32_1, 2, 3 }, - pooling_test_params{ CASE_POOLING_F32_2, 2, 3 }, - pooling_test_params{ CASE_POOLING_F16_1, 2, 3 }, - pooling_test_params{ CASE_POOLING_F16_2, 2, 3 }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3 }, - pooling_test_params{ CASE_POOLING_U8_2, 2, 3 }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 3 }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3 }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 3 }, - pooling_test_params{ CASE_POOLING_I8_3, 2, 3 }, -}), ); - -class pooling_scale : public PoolingFusingTest {}; -TEST_P(pooling_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel_size.count())), - pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset), - scale("scale", "pooling", "scale_data"), - reorder("reorder_bfyx", "scale", format::bfyx, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_scale, - ::testing::ValuesIn(std::vector{ - pooling_test_params{ CASE_POOLING_F32_1, 3, 3 }, - pooling_test_params{ CASE_POOLING_F32_2, 3, 3 }, - pooling_test_params{ CASE_POOLING_F16_1, 3, 3 }, - pooling_test_params{ CASE_POOLING_F16_2, 3, 3 }, - pooling_test_params{ CASE_POOLING_U8_1, 3, 3 }, - pooling_test_params{ 
CASE_POOLING_U8_2, 2, 3 }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 3 }, - pooling_test_params{ CASE_POOLING_I8_1, 3, 3 }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 3 }, - pooling_test_params{ CASE_POOLING_I8_3, 2, 3 }, -}), ); - /* ----------------------------------------------------------------------------------------------------- */ /* ---------------------------------------- LRN cases -------------------------------------------------- */ /* ----------------------------------------------------------------------------------------------------- */ @@ -3707,3 +3596,451 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_ deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 9 }, deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 9 }, }), ); + +/* ----------------------------------------------------------------------------------------------------- */ +/* --------------------------------------- Pooling cases ----------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +struct pooling_test_params { + tensor in_shape; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + pooling_mode pool_mode; + std::string kernel_name; +}; + +#define CASE_POOLING_F32_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F32_2 {2, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F32_3 {1, 32, 10, 10}, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F32_4 {1, 32, 10, 10}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfyx +#define CASE_POOLING_F32_5 {1, 32, 10, 10}, data_types::f32, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F32_6 {1, 32, 40, 40}, data_types::f32, format::byxf, 
data_types::f32, format::bfyx +#define CASE_POOLING_F32_7 {16, 32, 10, 10}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_8 {16, 32, 10, 10}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_9 {16, 32, 10, 10}, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_10 {16, 32, 10, 10, 10}, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx + +#define CASE_POOLING_F32_F16_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_2 {2, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_3 {1, 32, 10, 10}, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_4 {1, 32, 10, 10}, data_types::f32, format::fs_b_yx_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_5 {1, 32, 10, 10}, data_types::f32, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_6 {1, 32, 40, 40}, data_types::f32, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_7 {16, 32, 10, 10}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_8 {16, 32, 10, 10}, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_9 {16, 32, 10, 10}, data_types::f32, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_10 {16, 32, 10, 10, 10}, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx + +#define CASE_POOLING_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F16_3 {1, 32, 10, 10}, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F16_4 {1, 32, 10, 10}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, 
format::bfyx +#define CASE_POOLING_F16_5 {1, 32, 10, 10}, data_types::f16, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F16_6 {1, 32, 40, 40}, data_types::f16, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F16_7 {16, 32, 10, 10}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F16_8 {16, 32, 10, 10}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F16_9 {16, 32, 10, 10, 10}, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F16_10 {16, 32, 10, 10, 10}, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx + +#define CASE_POOLING_F16_FP16_1 {1, 32, 10, 10}, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_2 {1, 32, 10, 10}, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_3 {1, 32, 10, 10}, data_types::f16, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_4 {1, 32, 40, 40}, data_types::f16, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_5 {16, 32, 10, 10}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_6 {16, 32, 10, 10}, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_7 {16, 32, 10, 10, 10}, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_8 {16, 32, 10, 10, 10}, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx + +#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, 
format::b_fs_yx_fsv4 +#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx +#define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx +#define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx + +#define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 +#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx +#define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx + +#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx +#define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4 +#define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx + +#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx +#define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 +#define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx + +// Disabled +#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx +#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, 
format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx + +class PoolingFusingTest : public ::BaseFusingTest { +public: + void execute(pooling_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + build_options options; + options.set_option(build_option::optimize_data(true)); + if (!p.kernel_name.empty()) { + implementation_desc impl = {p.input_format, p.kernel_name}; + options.set_option(build_option::force_implementations({{"pooling", impl}})); + } + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, options); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused.get_primitives_info().empty()); + ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); + + auto find_and_check = [&](primitive_info& p) -> bool { + if (p.original_id == "pooling" || p.original_id == "output_reorder") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto pi_not_fused = network_not_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_and_check); + auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_and_check); + + ASSERT_TRUE(info_fused != pi_fused.end()); + ASSERT_TRUE(info_not_fused != pi_not_fused.end()); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(pooling_test_params& p) { return layout{p.data_type, p.input_format, p.in_shape}; } + layout get_per_channel_layout(pooling_test_params& p) { + return layout{p.default_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1}}; + } +}; + +class pooling_f32_activation : public PoolingFusingTest {}; +TEST_P(pooling_f32_activation, basic) { + auto p = 
GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + pooling("pooling", "input", p.pool_mode, tensor{1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}), + activation("act", "pooling", activation_func::relu), + reorder("output_reorder", "act", format::bfyx, data_types::f32)); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, + pooling_f32_activation, + ::testing::ValuesIn(std::vector{ + pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::average, ""}, + }), ); + +class pooling_f32_scale : public PoolingFusingTest {}; +TEST_P(pooling_f32_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 3, 3}.count())), + pooling("pooling", "input", p.pool_mode, tensor{1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}), + scale("scale", "pooling", "scale_data"), + reorder("output_reorder", "scale", format::bfyx, data_types::f32)); + + tolerance = 1e-05f; 
+ execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, + pooling_f32_scale, + ::testing::ValuesIn(std::vector{ + pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::average, ""}, + pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::max, ""}, + pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::average, ""}, + }), ); + +class pooling_scale_activation_quantize : public PoolingFusingTest {}; +TEST_P(pooling_scale_activation_quantize, basic) { + auto p = GetParam(); + + create_topologies(input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 4, 4}.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::relu), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32)); + + tolerance = 1.0f; + execute(p); +} + 
+TEST_P(pooling_scale_activation_quantize, i8_output_data_type) { + auto p = GetParam(); + + create_topologies(input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127, 127)), + data("out_hi", get_mem(get_single_element_layout(p), -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 4, 4}.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::relu), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32)); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(pooling_scale_activation_quantize, per_channel) { + auto p = GetParam(); + + create_topologies(input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 4, 4}.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::atan), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32)); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, + pooling_scale_activation_quantize, + ::testing::ValuesIn(std::vector{ + // Input type: FP32 + 
pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_4, 2, 5, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F32_4, 2, 5, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F32_5, 2, 5, pooling_mode::average, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F32_5, 2, 5, pooling_mode::max, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F32_6, 2, 5, pooling_mode::average, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F32_6, 2, 5, pooling_mode::max, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F32_7, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_7, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_8, 2, 5, pooling_mode::average, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F32_8, 2, 5, pooling_mode::max, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F32_9, 2, 5, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_9, 2, 5, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + + // Input type: INT8 + pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_I8_5, 2, 5, 
pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"}, + + // Input type: UINT8 + pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"}, + }), ); + +INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu, + pooling_scale_activation_quantize, + ::testing::ValuesIn(std::vector{ + pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"}, + pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, + pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"}, //currently not enabled, fusing not supported + }), ); + +class pooling_scale_activation : public PoolingFusingTest {}; +TEST_P(pooling_scale_activation,
basic) { + auto p = GetParam(); + + create_topologies(input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 4, 4}.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::relu), + reorder("output_reorder", "activation", p.default_format, data_types::f32)); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_CASE_P(fusings_gpu, + pooling_scale_activation, + ::testing::ValuesIn(std::vector{ + // Input type: F32 + pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F32_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F32_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F32_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F32_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F32_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F32_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F32_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked"}, + 
pooling_test_params{CASE_POOLING_F32_9, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_9, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + + // Input type: INT8 + pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + + // Input type: UINT8 + pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + + // Input type: FP16 Output type: F32 + 
pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + + // Input type: FP16 + pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"}, + 
pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + + // Input type: FP32 + pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"}, + pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, 
"pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"}, + pooling_test_params{CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"}, + pooling_test_params{CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"}, + pooling_test_params{CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked"}, + pooling_test_params{CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref"}, + pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"}, + pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"}, + + // Input type: INT8 + pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, 
pooling_mode::max, "pooling_gpu_int8_ref"}, + + // Input type: UINT8 + pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"}, + pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"}, + pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"}, + pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"}, + }), ); + +INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu, + pooling_scale_activation, + ::testing::ValuesIn(std::vector{ + pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"}, + pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, + pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"}, + pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, + pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"}, + }), ); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp index aae7df75ca356f..78b1fa84c0778e 100644 --- 
a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -31,6 +31,14 @@ using namespace cldnn; using namespace tests; +namespace cldnn { +template <> +struct type_to_data_type { + static const data_types value = data_types::f16; +}; +} // namespace cldnn + + template struct pooling_mode_output { using type = InputT; @@ -71,7 +79,7 @@ template struct pooling_accumulator { using output_t = typename pooling_mode_output::type; - pooling_accumulator() : _acc(std::numeric_limits::min()) {} + pooling_accumulator() : _acc(std::numeric_limits::lowest()) {} void accumulate(const InputT& val) { using std::max; @@ -82,7 +90,7 @@ struct pooling_accumulator { return static_cast(_acc); } - void reset() { _acc = std::numeric_limits::min(); } + void reset() { _acc = std::numeric_limits::lowest(); } InputT _acc; }; @@ -121,7 +129,7 @@ struct pooling_accumulator { } output_t get(size_t pool_x, size_t pool_y) { - return static_cast(_acc / (pool_x * pool_y)); + return static_cast(_acc / static_cast(pool_x * pool_y)); } void reset() { @@ -2351,6 +2359,7 @@ class pooling_test_base { auto input_lay = layout(input_type(), input_format(), input_size); + auto topo = topology( input_layout("input", input_lay), pooling("pool", @@ -2397,12 +2406,23 @@ class pooling_test_base { auto out_lay = out_mem.get_layout(); auto out_ptr = out_mem.cldnn::memory::template pointer(); + std::string kernel; + for (auto i : net.get_primitives_info()) { + if (i.original_id == "pool") { + kernel = i.kernel_id; + } + } + std::cout << kernel << std::endl; + SCOPED_TRACE("\nkernel: " + kernel); + ASSERT_EQ(out_lay.data_type, output_type()); ASSERT_EQ(out_lay.size.batch[0], 
expected.size()); ASSERT_EQ(out_lay.size.feature[0], expected[0].size()); ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size()); ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size()); + bool compare_with_tolerance = input_type() == data_types::f16; + for (size_t bi = 0; bi < batch_num(); ++bi) for (size_t fi = 0; fi < expected[0].size(); ++fi) for (size_t yi = 0; yi < expected[0][0].size(); ++yi) @@ -2411,9 +2431,14 @@ class pooling_test_base { size_t offset = out_lay.get_linear_offset(coords); auto ref_val = static_cast(expected[bi][fi][yi][xi]); auto actual_val = static_cast(out_ptr[offset]); - - EXPECT_TRUE(are_equal(ref_val, actual_val)) - << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi; + if (compare_with_tolerance) { + auto tolerance = 1; + ASSERT_NEAR(ref_val, actual_val, tolerance) + << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi; + } else { + EXPECT_TRUE(are_equal(ref_val, actual_val)) + << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi; + } } } @@ -2560,16 +2585,19 @@ TEST_P(pooling_random_test, avg_u8) { INSTANTIATE_TEST_CASE_P( smoke_low_precision, pooling_random_test, - testing::Combine( - testing::Values(1, 2), - testing::Values(3, 32), - testing::Values(std::tuple(3, 3), std::tuple(8, 8)), - testing::Values(std::tuple(1, 1), std::tuple(3, 3)), - testing::Values(std::tuple(1, 1)), - testing::Values(std::tuple(0, 0)), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::byxf_af32, format::b_fs_yx_fsv32) - ), - testing::internal::DefaultParamName); + testing::Combine(testing::Values(1, 2), + testing::Values(3, 8), + testing::Values(std::tuple(12, 12), std::tuple(24, 24)), + testing::Values(std::tuple(4, 4), std::tuple(2, 2)), + testing::Values(std::tuple(2, 2)), + testing::Values(std::tuple(0, 0)), + testing::Values(format::yxfb, + format::bfyx, + format::byxf_af32, + format::b_fs_yx_fsv4, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32)), + 
testing::internal::DefaultParamName); template class pooling_scale_random_test_base : public pooling_random_test_base { @@ -2619,30 +2647,44 @@ class pooling_scale_random_test_base : public pooling_random_test_base _shift; }; -using pooling_scale_random_test = pooling_random_test; +using pooling_random_test_fp16_fp32 = pooling_random_test; + +TEST_P(pooling_random_test_fp16_fp32, avg_fp16) { + auto test_case = pooling_random_test_base(); + ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam())); +} + +TEST_P(pooling_random_test_fp16_fp32, max_fp16) { + auto test_case = pooling_random_test_base(); + ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam())); +} -TEST_P(pooling_scale_random_test, avg_i8) { - auto test_case = pooling_scale_random_test_base(); +TEST_P(pooling_random_test_fp16_fp32, avg_fp32) { + auto test_case = pooling_random_test_base(); ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam())); } -TEST_P(pooling_scale_random_test, avg_u8) { - auto test_case = pooling_scale_random_test_base(); +TEST_P(pooling_random_test_fp16_fp32, max_fp32) { + auto test_case = pooling_random_test_base(); ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam())); } INSTANTIATE_TEST_CASE_P( smoke_low_precision, - pooling_scale_random_test, - testing::Combine( - testing::Values(1, 2), - testing::Values(3, 32), - testing::Values(std::tuple(3, 3), std::tuple(8, 8)), - testing::Values(std::tuple(1, 1), std::tuple(3, 3)), - testing::Values(std::tuple(1, 1)), - testing::Values(std::tuple(0, 0)), - testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::byxf_af32, format::b_fs_yx_fsv32) - ), + pooling_random_test_fp16_fp32, + testing::Combine(testing::Values(1, 2), + testing::Values(3, 8), + testing::Values(std::tuple(12, 12), std::tuple(24, 24)), + testing::Values(std::tuple(4, 4), std::tuple(2, 2)), + testing::Values(std::tuple(2, 2)), + testing::Values(std::tuple(0, 0)), + testing::Values(format::yxfb, + format::bfyx, + format::byxf, + format::b_fs_yx_fsv16, + 
format::fs_b_yx_fsv32, + format::b_fs_yx_fsv32, + format::fs_bs_yx_bsv4_fsv32)), testing::internal::DefaultParamName); TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride) diff --git a/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h b/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h index 5438037c814582..607bda0d62ac87 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h +++ b/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2017 Intel Corporation +// Copyright (c) 2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,120 +17,96 @@ #pragma once #include "include/math_utils.h" -struct FLOAT16 -{ - struct representation - { - uint16_t significand : 10; - uint16_t exponent : 5; +struct FLOAT16 { + struct representation { uint16_t sign : 1; + uint16_t exponent : 5; + uint16_t significand : 10; }; - union - { - uint16_t v = 0; - representation format; // added this struct for the .natvis file (for debug) + union { + uint16_t v; + representation format; // added this struct for the .natvis file (for debug) }; - static FLOAT16 min_val() - { - FLOAT16 f16; - f16.v = 0xFC00; - return f16; - } + static constexpr FLOAT16 min_val() { return FLOAT16((uint16_t)(0x0400)); } + + static constexpr FLOAT16 lowest_val() { return FLOAT16((uint16_t)(0xfbff)); } - operator double() const { double d = (double)float16_to_float32(v); return d; } - operator float() const { float f = float16_to_float32(v); return f; } - operator int16_t() const { return *(int16_t*)(&v); } + operator double() const { + double d = (double)float16_to_float32(v); + return d; + } + operator float() const { + float f = float16_to_float32(v); + return f; + } + operator int16_t() const { return *(int16_t *)(&v); } operator long long int() const { return v; } operator uint32_t() const { 
return v; } FLOAT16(float f) { v = float32_to_float16(f); } + FLOAT16(size_t s) { v = float32_to_float16(float(s)); } FLOAT16(int i) { v = float32_to_float16(float(i)); } - explicit FLOAT16(int16_t d) : v(d) {} - friend FLOAT16 operator +(const FLOAT16 &v1, const FLOAT16 &v2); - friend FLOAT16 operator -(const FLOAT16 &v1, const FLOAT16 &v2); - friend FLOAT16 operator *(const FLOAT16 &v1, const FLOAT16 &v2); - friend FLOAT16 operator /(const FLOAT16 &v1, const FLOAT16 &v2); - friend bool operator >(const FLOAT16 &v1, const FLOAT16 &v2); - friend bool operator >=(const FLOAT16 &v1, const FLOAT16 &v2); - friend bool operator <(const FLOAT16 &v1, const FLOAT16 &v2); - friend bool operator >(const FLOAT16 &v1, const float &v2); - friend bool operator <(const FLOAT16 &v1, const float &v2); - friend bool operator ==(const FLOAT16 &v1, const FLOAT16 &v2); - friend bool operator !=(const FLOAT16 &v1, const FLOAT16 &v2); - - FLOAT16() {} - - FLOAT16& operator +=(const FLOAT16 &v1) - { - *this = (float)*this + (float)v1; - return *this; + // TODO Below should have constructor tag to avoid ambigious behaviour, ex FLOAT16(16.f) != FLOAT16((uint16_t)16) + explicit constexpr FLOAT16(int16_t d) : v(d) {} + explicit constexpr FLOAT16(uint16_t d) : v(d) {} + friend FLOAT16 operator+(const FLOAT16 &v1, const FLOAT16 &v2); + friend FLOAT16 operator-(const FLOAT16 &v1, const FLOAT16 &v2); + friend FLOAT16 operator*(const FLOAT16 &v1, const FLOAT16 &v2); + friend FLOAT16 operator/(const FLOAT16 &v1, const FLOAT16 &v2); + friend bool operator>(const FLOAT16 &v1, const FLOAT16 &v2); + friend bool operator>=(const FLOAT16 &v1, const FLOAT16 &v2); + friend bool operator<(const FLOAT16 &v1, const FLOAT16 &v2); + friend bool operator>(const FLOAT16 &v1, const float &v2); + friend bool operator<(const FLOAT16 &v1, const float &v2); + friend bool operator==(const FLOAT16 &v1, const FLOAT16 &v2); + friend bool operator!=(const FLOAT16 &v1, const FLOAT16 &v2); + + FLOAT16() { v = 0; } + + 
FLOAT16 &operator+=(const FLOAT16 &v1) { + *this = (float)*this + (float)v1; + return *this; } - FLOAT16& operator /=(const FLOAT16 &v1) - { - *this = (float)*this / (float)v1; - return *this; + FLOAT16 &operator/=(const FLOAT16 &v1) { + *this = (float)*this / (float)v1; + return *this; } - FLOAT16& operator *=(const FLOAT16 &v1) - { + FLOAT16 &operator*=(const FLOAT16 &v1) { *this = (float)*this * (float)v1; return *this; } }; -inline FLOAT16 operator +(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 + (float)v2; -} - -inline FLOAT16 operator -(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 - (float)v2; -} - -inline FLOAT16 operator *(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 * (float)v2; -} - -inline FLOAT16 operator /(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 / (float)v2; -} - -inline bool operator >(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 > (float)v2; -} - -inline bool operator >=(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 >= (float)v2; -} - -inline bool operator <(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return (float)v1 < (float)v2; -} - -inline bool operator >(const FLOAT16 &v1, const float &v2) -{ - return (float)v1 > v2; -} - -inline bool operator <(const FLOAT16 &v1, const float &v2) -{ - return (float)v1 < v2; -} - -inline bool operator ==(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return v1.v == v2.v; -} - -inline bool operator !=(const FLOAT16 &v1, const FLOAT16 &v2) -{ - return v1.v != v2.v; -} +inline FLOAT16 operator+(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 + (float)v2; } + +inline FLOAT16 operator-(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 - (float)v2; } + +inline FLOAT16 operator*(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 * (float)v2; } + +inline FLOAT16 operator/(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 / (float)v2; } + +inline bool operator>(const FLOAT16 &v1, const FLOAT16 &v2) { 
return (float)v1 > (float)v2; } + +inline bool operator>=(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 >= (float)v2; } + +inline bool operator<(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 < (float)v2; } + +inline bool operator>(const FLOAT16 &v1, const float &v2) { return (float)v1 > v2; } + +inline bool operator<(const FLOAT16 &v1, const float &v2) { return (float)v1 < v2; } + +inline bool operator==(const FLOAT16 &v1, const FLOAT16 &v2) { return v1.v == v2.v; } + +inline bool operator!=(const FLOAT16 &v1, const FLOAT16 &v2) { return v1.v != v2.v; } + +namespace std { + +template <> +struct numeric_limits { + static constexpr FLOAT16 lowest() { return FLOAT16::lowest_val(); } +}; + +} // namespace std From 1ffada0b236f2238cfb41a4b6bf7baa2eb28a056 Mon Sep 17 00:00:00 2001 From: Alexey Suhov Date: Wed, 3 Jun 2020 20:14:35 +0300 Subject: [PATCH 10/12] [Docs] Fixes in readme files: (#750) - change repo name to openvino - update driver version - fix path to samples data - remove section about Movidius driver installation - change latest release to 2020.3 - merge fixes in install_dependencies.sh from 2020 branch --- README.md | 2 +- build-instruction.md | 49 ++++++++++++++--------------------------- get-started-linux.md | 36 +++++++++++++++--------------- install_dependencies.sh | 10 +++++++-- 4 files changed, 44 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index dfaf3e1eebcfae..869616f3ac8fe9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # [OpenVINO™ Toolkit](https://01.org/openvinotoolkit) - Deep Learning Deployment Toolkit repository -[![Stable release](https://img.shields.io/badge/version-2020.2-green.svg)](https://github.com/openvinotoolkit/openvino/releases/tag/2020.2) +[![Stable release](https://img.shields.io/badge/version-2020.3-green.svg)](https://github.com/openvinotoolkit/openvino/releases/tag/2020.3.0) [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE) 
This toolkit allows developers to deploy pre-trained deep learning models diff --git a/build-instruction.md b/build-instruction.md index 3d5cfe136f2f21..12103ce9875004 100644 --- a/build-instruction.md +++ b/build-instruction.md @@ -28,7 +28,6 @@ - [Add Inference Engine to Your Project](#add-inference-engine-to-your-project) - [(Optional) Additional Installation Steps for the Intel® Movidius™ Neural Compute Stick and Neural Compute Stick 2](#optional-additional-installation-steps-for-the-intel-movidius-neural-compute-stick-and-neural-compute-stick-2) - [For Linux, Raspbian Stretch* OS](#for-linux-raspbian-stretch-os) - - [For Windows](#for-windows-1) - [Next Steps](#next-steps) - [Additional Resources](#additional-resources) @@ -60,12 +59,12 @@ The software was validated on: - [CMake]\* 3.11 or higher - GCC\* 4.8 or higher to build the Inference Engine - Python 2.7 or higher for Inference Engine Python API wrapper -- (Optional) [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.41.14441]. +- (Optional) [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 20.13.16352]. ### Build Steps 1. Clone submodules: ```sh - cd dldt + cd openvino git submodule update --init --recursive ``` 2. Install build dependencies using the `install_dependencies.sh` script in the @@ -78,7 +77,7 @@ The software was validated on: ``` 3. By default, the build enables the Inference Engine GPU plugin to infer models on your Intel® Processor Graphics. This requires you to - [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.41.14441] + [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 20.13.16352] before running the build. If you don't want to use the GPU plugin, use the `-DENABLE_CLDNN=OFF` CMake build option and skip the installation of the Intel® Graphics Compute Runtime for OpenCL™ Driver. @@ -172,10 +171,10 @@ Native compilation of the Inference Engine is the most straightforward solution. 
sudo apt-get install -y git cmake libusb-1.0-0-dev ``` -2. Go to the cloned `dldt` repository: +2. Go to the cloned `openvino` repository: ```bash - cd dldt + cd openvino ``` 3. Initialize submodules: @@ -262,15 +261,15 @@ with the following content: 5. Run Docker\* container with mounted source code folder from host: ```bash - docker run -it -v /absolute/path/to/dldt:/dldt ie_cross_armhf /bin/bash + docker run -it -v /absolute/path/to/openvino:/openvino ie_cross_armhf /bin/bash ``` 6. While in the container: - 1. Go to the cloned `dldt` repository: + 1. Go to the cloned `openvino` repository: ```bash - cd dldt + cd openvino ``` 2. Create a build folder: @@ -291,8 +290,8 @@ with the following content: ``` 7. Press **Ctrl+D** to exit from Docker. You can find the resulting binaries - in the `dldt/bin/armv7l/` directory and the OpenCV* - installation in the `dldt/inference-engine/temp`. + in the `openvino/bin/armv7l/` directory and the OpenCV* + installation in the `openvino/inference-engine/temp`. >**NOTE**: Native applications that link to cross-compiled Inference Engine library require an extra compilation flag `-march=armv7-a`. @@ -381,8 +380,8 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^ 6. Before running the samples, add paths to the TBB and OpenCV binaries used for the build to the `%PATH%` environment variable. By default, TBB binaries are - downloaded by the CMake-based script to the `/inference-engine/temp/tbb/bin` - folder, OpenCV binaries to the `/inference-engine/temp/opencv_4.3.0/opencv/bin` + downloaded by the CMake-based script to the `/inference-engine/temp/tbb/bin` + folder, OpenCV binaries to the `/inference-engine/temp/opencv_4.3.0/opencv/bin` folder. 
### Additional Build Options @@ -437,7 +436,7 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^ call "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\bin\ipsxe-comp-vars.bat" intel64 vs2017 set CXX=icl set CC=icl -:: clean TBBROOT value set by ipsxe-comp-vars.bat, required TBB package will be downloaded by dldt cmake script +:: clean TBBROOT value set by ipsxe-comp-vars.bat, required TBB package will be downloaded by openvino cmake script set TBBROOT= cmake -G Ninja -Wno-dev -DCMAKE_BUILD_TYPE=Release .. cmake --build . --config Release @@ -461,7 +460,7 @@ The software was validated on: 1. Clone submodules: ```sh - cd dldt + cd openvino git submodule update --init --recursive ``` 2. Install build dependencies using the `install_dependencies.sh` script in the @@ -545,7 +544,7 @@ This section describes how to build Inference Engine for Android x86 (64-bit) op 2. Clone submodules ```sh - cd dldt + cd openvino git submodule update --init --recursive ``` @@ -610,7 +609,7 @@ before running the Inference Engine build: For CMake projects, set the `InferenceEngine_DIR` environment variable: ```sh -export InferenceEngine_DIR=/path/to/dldt/build/ +export InferenceEngine_DIR=/path/to/openvino/build/ ``` Then you can find Inference Engine by `find_package`: @@ -660,20 +659,6 @@ sudo ldconfig rm 97-myriad-usbboot.rules ``` -### For Windows - -For Intel® Movidius™ Neural Compute Stick and Intel® Neural Compute Stick 2, -install the Movidius™ VSC driver: - -1. Go to the `/inference-engine/thirdparty/movidius/MovidiusDriver` - directory, where the `DLDT_ROOT_DIR` is the directory to which the DLDT - repository was cloned. -2. Right click on the `Movidius_VSC_Device.inf` file and choose **Install** from - the pop-up menu. - -You have installed the driver for your Intel® Movidius™ Neural Compute Stick -or Intel® Neural Compute Stick 2. - ## Next Steps Congratulations, you have built the Inference Engine. 
To get started with the @@ -706,7 +691,7 @@ This target collects all dependencies, prepares the nGraph package and copies it [Intel® Distribution of OpenVINO™]:https://software.intel.com/en-us/openvino-toolkit [CMake]:https://cmake.org/download/ -[Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.41.14441]:https://github.com/intel/compute-runtime/releases/tag/19.41.14441 +[Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 20.13.16352]:https://github.com/intel/compute-runtime/releases/tag/20.13.16352 [MKL-DNN repository]:https://github.com/intel/mkl-dnn/releases/download/v0.19/mklml_lnx_2019.0.5.20190502.tgz [MKL-DNN repository for Windows]:(https://github.com/intel/mkl-dnn/releases/download/v0.19/mklml_win_2019.0.5.20190502.zip) [OpenBLAS]:https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download diff --git a/get-started-linux.md b/get-started-linux.md index bf87003b068b48..661fc4ec8ddded 100644 --- a/get-started-linux.md +++ b/get-started-linux.md @@ -1,7 +1,7 @@ -# Get Started with OpenVINO™ Deep Learning Deployment Toolkit (DLDT) on Linux* +# Get Started with OpenVINO™ Toolkit on Linux* This guide provides you with the information that will help you to start using -the DLDT on Linux\*. With this guide, you will learn how to: +the OpenVINO™ Toolkit on Linux\*. With this guide, you will learn how to: 1. [Configure the Model Optimizer](#configure-the-model-optimizer) 2. [Prepare a model for sample inference](#prepare-a-model-for-sample-inference) @@ -10,13 +10,13 @@ the DLDT on Linux\*. With this guide, you will learn how to: 3. [Run the Image Classification Sample Application with the model](#run-the-image-classification-sample-application) ## Prerequisites -1. This guide assumes that you have already cloned the `dldt` repo and +1. 
This guide assumes that you have already cloned the `openvino` repo and successfully built the Inference Engine and Samples using the [build instructions](inference-engine/README.md). 2. The original structure of the repository directories remains unchanged. -> **NOTE**: Below, the directory to which the `dldt` repository is cloned is -referred to as ``. +> **NOTE**: Below, the directory to which the `openvino` repository is cloned is +referred to as ``. ## Configure the Model Optimizer @@ -53,7 +53,7 @@ If you see error messages, check for any missing dependencies. 1. Go to the Model Optimizer prerequisites directory: ```sh -cd /model_optimizer/install_prerequisites +cd /model_optimizer/install_prerequisites ``` 2. Run the script to configure the Model Optimizer for Caffe, TensorFlow, MXNet, Kaldi\*, and ONNX: @@ -68,7 +68,7 @@ Configure individual frameworks separately **ONLY** if you did not select 1. Go to the Model Optimizer prerequisites directory: ```sh -cd /model_optimizer/install_prerequisites +cd /model_optimizer/install_prerequisites ``` 2. Run the script for your model framework. You can run more than one script: @@ -162,20 +162,20 @@ as `` below) with the Model Downloader: **For CPU (FP32):** ```sh - python3 /model_optimizer/mo.py --input_model /classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP32 --output_dir + python3 /model_optimizer/mo.py --input_model /classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP32 --output_dir ``` **For GPU and MYRIAD (FP16):** ```sh - python3 /model_optimizer/mo.py --input_model /classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP16 --output_dir + python3 /model_optimizer/mo.py --input_model /classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP16 --output_dir ``` After the Model Optimizer script is completed, the produced IR files (`squeezenet1.1.xml`, `squeezenet1.1.bin`) are in the specified `` directory. -3. 
Copy the `squeezenet1.1.labels` file from the `/inference-engine/samples/sample_data/` +3. Copy the `squeezenet1.1.labels` file from the `/scripts/demo/` folder to the model IR directory. This file contains the classes that ImageNet uses so that the inference results show text instead of classification numbers: ```sh - cp /inference-engine/samples/sample_data/squeezenet1.1.labels + cp /scripts/demo/squeezenet1.1.labels ``` Now you are ready to run the Image Classification Sample Application. @@ -184,28 +184,28 @@ Now you are ready to run the Image Classification Sample Application. The Inference Engine sample applications are automatically compiled when you built the Inference Engine using the [build instructions](inference-engine/README.md). -The binary files are located in the `/inference-engine/bin/intel64/Release` +The binary files are located in the `/inference-engine/bin/intel64/Release` directory. To run the Image Classification sample application with an input image on the prepared IR: 1. Go to the samples build directory: ```sh - cd /inference-engine/bin/intel64/Release + cd /inference-engine/bin/intel64/Release 2. 
Run the sample executable with specifying the `car.png` file from the - `/inference-engine/samples/sample_data/` directory as an input + `/scripts/demo/` directory as an input image, the IR of your model and a plugin for a hardware device to perform inference on: **For CPU:** ```sh - ./classification_sample -i /inference-engine/samples/sample_data/car.png -m /squeezenet1.1.xml -d CPU + ./classification_sample -i /scripts/demo/car.png -m /squeezenet1.1.xml -d CPU ``` **For GPU:** ```sh - ./classification_sample -i /inference-engine/samples/sample_data/car.png -m /squeezenet1.1.xml -d GPU + ./classification_sample -i /scripts/demo/car.png -m /squeezenet1.1.xml -d GPU ``` **For MYRIAD:** @@ -214,14 +214,14 @@ To run the Image Classification sample application with an input image on the pr Stick or Intel® Neural Compute Stick 2) with the MYRIAD plugin requires performing [additional hardware configuration steps](inference-engine/README.md#optional-additional-installation-steps-for-the-intel-movidius-neural-compute-stick-and-neural-compute-stick-2). ```sh - ./classification_sample -i /inference-engine/samples/sample_data/car.png -m /squeezenet1.1.xml -d MYRIAD + ./classification_sample -i /scripts/demo/car.png -m /squeezenet1.1.xml -d MYRIAD ``` When the Sample Application completes, you will have the label and confidence for the top-10 categories printed on the screen. 
Below is a sample output with inference results on CPU: ```sh Top 10 results: -Image /home/user/dldt/inference-engine/samples/sample_data/car.png +Image /home/user/openvino/scripts/demo/car.png classid probability label ------- ----------- ----- diff --git a/install_dependencies.sh b/install_dependencies.sh index 6fae78066ebd27..ca31972c3ba1bf 100755 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -22,6 +22,13 @@ yes_or_no() { # install dependencies if [ -f /etc/lsb-release ]; then # Ubuntu + host_cpu=$(uname -m) + if [ $host_cpu = x86_64 ]; then + x86_64_specific_packages="gcc-multilib g++-multilib" + else + x86_64_specific_packages="" + fi + sudo -E apt update sudo -E apt-get install -y \ build-essential \ @@ -32,8 +39,7 @@ if [ -f /etc/lsb-release ]; then ca-certificates \ git \ libboost-regex-dev \ - gcc-multilib \ - g++-multilib \ + $x86_64_specific_packages \ libgtk2.0-dev \ pkg-config \ unzip \ From 2bb7010193ed9bd9e30b690a91997d62e8091b2f Mon Sep 17 00:00:00 2001 From: wistal Date: Thu, 4 Jun 2020 01:33:55 +0800 Subject: [PATCH 11/12] MO should support LRN k param with caffe model, rather than fixed to 1 (#716) Co-authored-by: yipengqu --- model-optimizer/extensions/front/caffe/lrn_ext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model-optimizer/extensions/front/caffe/lrn_ext.py b/model-optimizer/extensions/front/caffe/lrn_ext.py index a8ee3f59c6924e..1c0a72ca06eec3 100644 --- a/model-optimizer/extensions/front/caffe/lrn_ext.py +++ b/model-optimizer/extensions/front/caffe/lrn_ext.py @@ -29,7 +29,7 @@ def extract(cls, node): AttributedLRN.update_node_stat(node, { 'alpha': param.alpha, 'beta': param.beta, - 'bias': 1, + 'bias': param.k, 'local_size': param.local_size, 'region': region, }) From 158d32139f4d358c6be80b54acf9c9c6c8b7cece Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 3 Jun 2020 22:32:55 +0300 Subject: [PATCH 12/12] Revert "Enabled thread tests (#717)" (#756) This reverts commit 
99a2423ec0dc6d4570ca989be97da02376b3e169. --- .../behavior/core_threading_tests.cpp | 11 ++++++----- .../shared/include/behavior/core_threading_tests.hpp | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp index bd38670c9dd221..69748a01a524ce 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp @@ -31,11 +31,12 @@ TEST_P(CoreThreadingTestsWithIterations, smoke_LoadNetwork_RemoteContext) { networks.emplace_back(ie.ReadNetwork(model.model_xml_str, model.weights_blob)); } - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat())); + // TODO: uncomment after fixing *-31414 + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat())); auto ocl_instance = 
std::make_shared(); ie.SetConfig(config, deviceName); diff --git a/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp b/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp index c53f9fc0939c63..0379767355b54c 100644 --- a/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp @@ -183,11 +183,12 @@ TEST_P(CoreThreadingTestsWithIterations, smoke_LoadNetwork) { networks.emplace_back(ie.ReadNetwork(model.model_xml_str, model.weights_blob)); } - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat())); - networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat())); + // TODO: uncomment after fixing *-31414 + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat())); + // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat())); ie.SetConfig(config, deviceName); runParallel([&] () {