From c437bcd7cc1440b4a1d0041614ab66b4e377dd36 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Thu, 15 Oct 2020 10:28:00 +0300 Subject: [PATCH] [IE CLDNN] Fixes for GatherTree and ReverseSequence (#2660) --- .../src/cldnn_engine/cldnn_common_utils.h | 1 + .../src/cldnn_engine/cldnn_engine.cpp | 13 ++- .../src/cldnn_engine/cldnn_infer_request.cpp | 18 +++- .../single_layer_tests/reverse_sequence.cpp | 4 + .../skip_tests_config.cpp | 4 - .../gather_tree/gather_tree_kernel_base.cpp | 91 ++++++++++--------- .../gather_tree/gather_tree_kernel_ref.cpp | 2 +- .../reverse_sequence_kernel_ref.cpp | 6 ++ .../core/cl_kernels/gather_tree_gpu_ref.cl | 43 +++++---- .../core/cl_kernels/reverse_sequence_ref.cl | 17 +--- .../clDNN/src/gpu/gather_tree_gpu.cpp | 8 +- .../clDNN/src/gpu/reverse_sequence_gpu.cpp | 11 ++- 12 files changed, 125 insertions(+), 93 deletions(-) diff --git a/inference-engine/src/cldnn_engine/cldnn_common_utils.h b/inference-engine/src/cldnn_engine/cldnn_common_utils.h index 6423163844c330..384d1576c9bd31 100644 --- a/inference-engine/src/cldnn_engine/cldnn_common_utils.h +++ b/inference-engine/src/cldnn_engine/cldnn_common_utils.h @@ -41,6 +41,7 @@ const auto CldnnTensorFromIEDims = [](const InferenceEngine::SizeVector& dims, i inline cldnn::data_types DataTypeFromPrecision(InferenceEngine::Precision p) { switch (p) { case Precision::I16: + case Precision::U16: case Precision::FP32: return cldnn::data_types::f32; case Precision::FP16: diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 5b2818b0b60909..db167790153523 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -196,10 +196,15 @@ clDNNEngine::clDNNEngine() : m_defaultContext(nullptr) { auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) { for (auto ii : _networkInputs) { auto input_precision = ii.second->getTensorDesc().getPrecision(); - if (input_precision != InferenceEngine::Precision::FP16 && input_precision != InferenceEngine::Precision::I16 - && input_precision != InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8 - && input_precision != InferenceEngine::Precision::I32 && input_precision != InferenceEngine::Precision::I64 - && input_precision != InferenceEngine::Precision::I8 && input_precision != InferenceEngine::Precision::BOOL) { + if (input_precision != InferenceEngine::Precision::FP16 && + input_precision != InferenceEngine::Precision::FP32 && + input_precision != InferenceEngine::Precision::U8 && + input_precision != InferenceEngine::Precision::I8 && + input_precision != InferenceEngine::Precision::I16 && + input_precision != InferenceEngine::Precision::U16 && + input_precision != InferenceEngine::Precision::I32 && + input_precision != InferenceEngine::Precision::I64 && + input_precision != InferenceEngine::Precision::BOOL) { THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str << "Input image format " << input_precision << " is not supported yet..."; } diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp index bf591b6b029183..931083afcd5198 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp @@ -41,6 +41,11 @@ Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* me return make_shared_blob(desc, reinterpret_cast(mem_ptr)); else return make_shared_blob(desc); + case Precision::U16: + if (mem_ptr != nullptr) + return make_shared_blob(desc, reinterpret_cast(mem_ptr)); + else + return make_shared_blob(desc); case Precision::I32: if (mem_ptr != nullptr) return make_shared_blob(desc, reinterpret_cast(mem_ptr)); @@ -586,7 +591,7 @@ void CLDNNInferRequest::AllocateInputs() { cldnn::pointer mem_ptr = inputsMemory.at(name).pointer(); _inputs[name] = createInputBlob(desc, mem_ptr.data()); - if (desc.getPrecision() == Precision::I16) { + if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { cldnn::layout layout_fp32 = layout; layout_fp32.data_type = cldnn::data_types::f32; input_alloc(name + fp32_suffix, layout_fp32); @@ -609,7 +614,7 @@ void CLDNNInferRequest::AllocateInputsDyn() { } Blob::Ptr inputBlob = createInputBlob(desc); - if (desc.getPrecision() == Precision::I16) { + if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { desc.setPrecision(Precision::FP32); auto fp32inputBlob = InferenceEngine::make_shared_blob(desc); fp32inputBlob->allocate(); @@ -910,11 +915,16 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const if (inputBlob.is()) { // no need to check for reuse _nw_ptr->set_input_data(internalName, memory); - } else if (prec == Precision::I16) { + } else if (prec == Precision::I16 || prec == Precision::U16) { // clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix); cldnn::pointer ptr = fp32_mem.pointer(); - copyToFloat(ptr.data(), &inputBlob); + if (prec == Precision::I16) { + copyToFloat(ptr.data(), &inputBlob); + } else { + copyToFloat(ptr.data(), &inputBlob); + } + _nw_ptr->set_input_data(internalName, fp32_mem); } else if (is_same_buffer(inputBlob, memory)) { // If input memory was allocated by cldnn engine and wasn't overwritten by user set_input_data method won't copy input data. diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reverse_sequence.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reverse_sequence.cpp index 2787fed096a782..44d8f3f225bef2 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reverse_sequence.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reverse_sequence.cpp @@ -14,6 +14,10 @@ namespace { const std::vector netPrecisions = { InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16, + InferenceEngine::Precision::U8, + InferenceEngine::Precision::I8, + InferenceEngine::Precision::U16, + InferenceEngine::Precision::I32 }; const std::vector batchAxisIndices = { 0L }; diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp index 7335d737222a20..c3059421342d57 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp @@ -22,9 +22,5 @@ std::vector disabledTestPatterns() { // Expected behavior R"(.*EltwiseLayerTest.*eltwiseOpType=Pow.*netPRC=I64.*)", R"(.*EltwiseLayerTest.*IS=\(.*\..*\..*\..*\..*\).*eltwiseOpType=Pow.*secondaryInputType=CONSTANT.*)", - // TODO: Issue: 40736 - R"(.*ReverseSequenceLayerTest.*)", - // TODO: Issue: 40741 - R"(.*GatherTreeLayerTest.*)", }; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_base.cpp index 17599164668785..1042910b656448 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_base.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -17,47 +17,56 @@ #include "kernel_selector_utils.h" namespace kernel_selector { - JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - return jit; - } +JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + return jit; +} - GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const { - std::vector global{ - params.output.Y().v, // beam - params.output.Feature().v, // batch - 1 - }; - const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); - /* - b -> time - f -> batch - y -> beam - */ - DispatchData data; - data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - data.gws0 = global[0]; - data.gws1 = global[1]; - data.gws2 = global[2]; - data.lws0 = local[0]; - data.lws1 = local[1]; - data.lws2 = local[2]; - return data; - } +GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const { + std::vector global{ + params.output.Y().v, // beam + params.output.Feature().v, // batch + 1 + }; + const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); + /* + b -> time + f -> batch + y -> beam + */ + DispatchData data; + data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; + data.gws0 = global[0]; + data.gws1 = global[1]; + data.gws2 = global[2]; + data.lws0 = local[0]; + data.lws1 = local[1]; + data.lws2 = local[2]; + return data; +} - KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params, - const optional_params& options, - float estimated_time) const { - assert(params.GetType() == KernelType::GATHER_TREE); - const auto& gt_params = static_cast(params); +KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params, + const optional_params& options, + float estimated_time) const { + assert(params.GetType() == KernelType::GATHER_TREE); + const auto& gt_params = static_cast(params); - auto run_info = SetDefault(gt_params); - auto kernel_data = KernelData::Default(params); - auto cldnn_jit = GetJitConstants(gt_params); - auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - FillCLKernelData(kernel_data.kernels[0], run_info, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 4); - kernel_data.estimatedTime = estimated_time; - return { kernel_data }; - } + auto run_info = SetDefault(gt_params); + auto kernel_data = KernelData::Default(params); + auto cldnn_jit = GetJitConstants(gt_params); + auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options); + auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + FillCLKernelData(kernel_data.kernels[0], + run_info, + params.engineInfo, + kernelName, + jit, + entry_point, + DEFAULT, + false, + false, + static_cast(gt_params.inputs.size())); + kernel_data.estimatedTime = estimated_time; + return { kernel_data }; +} } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_ref.cpp index eb3e0296f97426..f7d7bf72e66036 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather_tree/gather_tree_kernel_ref.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp index 392bdfa715d5e1..f3926a75580c73 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp @@ -20,8 +20,14 @@ namespace kernel_selector { ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::UINT8); + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::INT32); k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); + k.EnableOutputDataType(Datatype::UINT8); + k.EnableOutputDataType(Datatype::INT8); + k.EnableOutputDataType(Datatype::INT32); k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); k.EnableAllInputLayout(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_tree_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_tree_gpu_ref.cl index 3f3bee3f8f27ef..73dba74686c14c 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_tree_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_tree_gpu_ref.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,30 +14,37 @@ #include "include/include_all.cl" -KERNEL(gather_tree_gpu_ref.cl)( - const __global UNIT_TYPE* step_input, - const __global UNIT_TYPE* parent_input, - const __global UNIT_TYPE* max_seq_len_input, - const __global UNIT_TYPE* end_token, - __global UNIT_TYPE* output) +KERNEL(gather_tree_gpu_ref)( + const __global INPUT0_TYPE* step_input, + const __global INPUT1_TYPE* parent_input, + const __global INPUT2_TYPE* max_seq_len_input, + const __global INPUT3_TYPE* end_token, + __global OUTPUT_TYPE* output) { - const uint beam = get_global_id(0); - const uint batch = get_global_id(1); + const int beam = get_global_id(0); + const int batch = get_global_id(1); /* b -> time f -> batch y -> beam */ - uint parent = beam; - for(int time = INPUT0_BATCH_NUM - 1; time >= 0; time--) { - while (time >= (uint)max_seq_len_input[batch]) { - output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = end_token[0]; - time--; - } - output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = - step_input[INPUT0_GET_INDEX(time, batch, parent, 0)]; - parent = (uint)parent_input[INPUT0_GET_INDEX(time, batch, parent, 0)]; + const int max_sequence_in_beam = min(INPUT0_BATCH_NUM, (int)max_seq_len_input[batch]); + int time; + for (time = INPUT0_BATCH_NUM - 1; time >= max_sequence_in_beam; time--) { + output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]); } + for (int parent = beam; time >= 0; time--) { + output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = step_input[INPUT0_GET_INDEX(time, batch, parent, 0)]; + parent = parent_input[INPUT1_GET_INDEX(time, batch, parent, 0)]; + } + bool finished = false; + for (int time = 0; time < max_sequence_in_beam; time++) { + if (finished) { + output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]); + } else if (output[OUTPUT_GET_INDEX(time, batch, beam, 0)] == TO_OUTPUT_TYPE(end_token[0])) { + finished = true; + } + } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl index 061079c7298167..7060a20ca6df06 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reverse_sequence_ref.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ #include "include/include_all.cl" -KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global UNIT_TYPE* output) +KERNEL(reverse_sequence_ref)(const __global INPUT0_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global OUTPUT_TYPE* output) { const uint batch = get_global_id(0); const uint feature = get_global_id(1); @@ -23,21 +23,12 @@ KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global INP const uint x = (uint)get_global_id(2) % INPUT0_SIZE_X; uint dimensions[] = { batch, feature, y, x }; - const uint input_index = INPUT0_OFFSET + - batch * INPUT0_BATCH_PITCH + - feature * INPUT0_FEATURE_PITCH + - y * INPUT0_Y_PITCH + - x * INPUT0_X_PITCH; + const uint input_index = INPUT0_GET_INDEX(batch, feature, y, x); const uint length = (uint)seq_lengths[dimensions[BATCH_AXIS]]; if (dimensions[SEQ_AXIS] < length) dimensions[SEQ_AXIS] = length - dimensions[SEQ_AXIS] - 1; - const uint output_index = OUTPUT_OFFSET + - dimensions[0] * OUTPUT_BATCH_PITCH + - dimensions[1] * OUTPUT_FEATURE_PITCH + - dimensions[2] * OUTPUT_Y_PITCH + - dimensions[3] * OUTPUT_X_PITCH; - + const uint output_index = OUTPUT_GET_INDEX(dimensions[0], dimensions[1], dimensions[2], dimensions[3]); output[output_index] = ACTIVATION(input[input_index], ACTIVATION_PARAMS); } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/gather_tree_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/gather_tree_gpu.cpp index 604d28dcd9fc10..9eeff6e0a6cd04 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/gather_tree_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/gather_tree_gpu.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -29,9 +29,11 @@ struct gather_tree_gpu : typed_primitive_gpu_impl { static primitive_impl* create(const gather_tree_node& arg) { auto b_params = get_default_params(arg, 1); - auto b_optional_params = - get_default_optional_params(arg.get_program()); + auto b_optional_params = get_default_optional_params(arg.get_program()); + for (size_t i = 1; i < arg.get_dependencies().size(); i++) { + b_params.inputs.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout(), 1)); + } auto desc = arg.get_primitive(); auto& kernel_selector = kernel_selector::gather_tree_kernel_selector::Instance(); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp index ec3a89aad04827..1b4b8480bd2541 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/reverse_sequence_gpu.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -59,10 +59,11 @@ namespace detail { attach_reverse_sequence_gpu::attach_reverse_sequence_gpu() { auto val_fw = reverse_sequence_gpu::create; - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), - val_fw); - implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), - val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw); + implementation_map::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw); } } // namespace detail