Skip to content

Commit

Permalink
[IE CLDNN] Fixes for GatherTree and ReverseSequence (openvinotoolkit#…
Browse files Browse the repository at this point in the history
  • Loading branch information
vladimir-paramuzov authored and mryzhov committed Dec 15, 2020
1 parent b8f5b8c commit c437bcd
Show file tree
Hide file tree
Showing 12 changed files with 125 additions and 93 deletions.
1 change: 1 addition & 0 deletions inference-engine/src/cldnn_engine/cldnn_common_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const auto CldnnTensorFromIEDims = [](const InferenceEngine::SizeVector& dims, i
inline cldnn::data_types DataTypeFromPrecision(InferenceEngine::Precision p) {
switch (p) {
case Precision::I16:
case Precision::U16:
case Precision::FP32:
return cldnn::data_types::f32;
case Precision::FP16:
Expand Down
13 changes: 9 additions & 4 deletions inference-engine/src/cldnn_engine/cldnn_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,15 @@ clDNNEngine::clDNNEngine() : m_defaultContext(nullptr) {
auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) {
for (auto ii : _networkInputs) {
auto input_precision = ii.second->getTensorDesc().getPrecision();
if (input_precision != InferenceEngine::Precision::FP16 && input_precision != InferenceEngine::Precision::I16
&& input_precision != InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8
&& input_precision != InferenceEngine::Precision::I32 && input_precision != InferenceEngine::Precision::I64
&& input_precision != InferenceEngine::Precision::I8 && input_precision != InferenceEngine::Precision::BOOL) {
if (input_precision != InferenceEngine::Precision::FP16 &&
input_precision != InferenceEngine::Precision::FP32 &&
input_precision != InferenceEngine::Precision::U8 &&
input_precision != InferenceEngine::Precision::I8 &&
input_precision != InferenceEngine::Precision::I16 &&
input_precision != InferenceEngine::Precision::U16 &&
input_precision != InferenceEngine::Precision::I32 &&
input_precision != InferenceEngine::Precision::I64 &&
input_precision != InferenceEngine::Precision::BOOL) {
THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str
<< "Input image format " << input_precision << " is not supported yet...";
}
Expand Down
18 changes: 14 additions & 4 deletions inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* me
return make_shared_blob<int16_t>(desc, reinterpret_cast<int16_t*>(mem_ptr));
else
return make_shared_blob<int16_t>(desc);
case Precision::U16:
if (mem_ptr != nullptr)
return make_shared_blob<uint16_t>(desc, reinterpret_cast<uint16_t*>(mem_ptr));
else
return make_shared_blob<uint16_t>(desc);
case Precision::I32:
if (mem_ptr != nullptr)
return make_shared_blob<int32_t>(desc, reinterpret_cast<int32_t*>(mem_ptr));
Expand Down Expand Up @@ -586,7 +591,7 @@ void CLDNNInferRequest::AllocateInputs() {
cldnn::pointer<uint8_t> mem_ptr = inputsMemory.at(name).pointer<uint8_t>();
_inputs[name] = createInputBlob(desc, mem_ptr.data());

if (desc.getPrecision() == Precision::I16) {
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
cldnn::layout layout_fp32 = layout;
layout_fp32.data_type = cldnn::data_types::f32;
input_alloc(name + fp32_suffix, layout_fp32);
Expand All @@ -609,7 +614,7 @@ void CLDNNInferRequest::AllocateInputsDyn() {
}

Blob::Ptr inputBlob = createInputBlob(desc);
if (desc.getPrecision() == Precision::I16) {
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
desc.setPrecision(Precision::FP32);
auto fp32inputBlob = InferenceEngine::make_shared_blob<float>(desc);
fp32inputBlob->allocate();
Expand Down Expand Up @@ -910,11 +915,16 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const
if (inputBlob.is<gpu::ClBlob>()) {
// no need to check for reuse
_nw_ptr->set_input_data(internalName, memory);
} else if (prec == Precision::I16) {
} else if (prec == Precision::I16 || prec == Precision::U16) {
// clDNN doesn't support I16 or U16 input precisions, so we always have to convert input data to fp32 precision
const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix);
cldnn::pointer<float> ptr = fp32_mem.pointer<float>();
copyToFloat<int16_t>(ptr.data(), &inputBlob);
if (prec == Precision::I16) {
copyToFloat<int16_t>(ptr.data(), &inputBlob);
} else {
copyToFloat<uint16_t>(ptr.data(), &inputBlob);
}

_nw_ptr->set_input_data(internalName, fp32_mem);
} else if (is_same_buffer(inputBlob, memory)) {
// If input memory was allocated by cldnn engine and wasn't overwritten by user set_input_data method won't copy input data.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16,
InferenceEngine::Precision::U8,
InferenceEngine::Precision::I8,
InferenceEngine::Precision::U16,
InferenceEngine::Precision::I32
};

const std::vector<int64_t> batchAxisIndices = { 0L };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,5 @@ std::vector<std::string> disabledTestPatterns() {
// Expected behavior
R"(.*EltwiseLayerTest.*eltwiseOpType=Pow.*netPRC=I64.*)",
R"(.*EltwiseLayerTest.*IS=\(.*\..*\..*\..*\..*\).*eltwiseOpType=Pow.*secondaryInputType=CONSTANT.*)",
// TODO: Issue: 40736
R"(.*ReverseSequenceLayerTest.*)",
// TODO: Issue: 40741
R"(.*GatherTreeLayerTest.*)",
};
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -17,47 +17,56 @@
#include "kernel_selector_utils.h"

namespace kernel_selector {
JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const {
JitConstants jit = MakeBaseParamsJitConstants(params);
return jit;
}
// Builds the JIT constant set for the GatherTree kernel.
// Only the common constants derived from the base params (input/output
// layouts and data types) are needed; no kernel-specific defines are added.
JitConstants GatherTreeKernelBase::GetJitConstants(const gather_tree_params & params) const {
JitConstants jit = MakeBaseParamsJitConstants(params);
return jit;
}

GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
std::vector<size_t> global{
params.output.Y().v, // beam
params.output.Feature().v, // batch
1
};
const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
/*
b -> time
f -> batch
y -> beam
*/
DispatchData data;
data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
data.gws0 = global[0];
data.gws1 = global[1];
data.gws2 = global[2];
data.lws0 = local[0];
data.lws1 = local[1];
data.lws2 = local[2];
return data;
}
// Computes the default dispatch configuration for the GatherTree kernel:
// one work-item per (beam, batch) pair; the local work-group size is chosen
// by the common heuristic for the target device.
GatherTreeKernelBase::DispatchData GatherTreeKernelBase::SetDefault(const gather_tree_params & params) const {
// Global work size: axis 0 iterates beams (output Y), axis 1 iterates
// batches (output Feature). The time dimension is not parallelized here —
// the kernel walks it sequentially per work-item.
std::vector<size_t> global{
params.output.Y().v, // beam
params.output.Feature().v, // batch
1
};
const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
/*
b -> time
f -> batch
y -> beam
*/
DispatchData data;
// Flag fp16 input so the generated kernel selects the matching unit type.
data.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
data.gws0 = global[0];
data.gws1 = global[1];
data.gws2 = global[2];
data.lws0 = local[0];
data.lws1 = local[1];
data.lws2 = local[2];
return data;
}

KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
const optional_params& options,
float estimated_time) const {
assert(params.GetType() == KernelType::GATHER_TREE);
const auto& gt_params = static_cast<const gather_tree_params&>(params);
KernelsData GatherTreeKernelBase::GetCommonKernelsData(const Params& params,
const optional_params& options,
float estimated_time) const {
assert(params.GetType() == KernelType::GATHER_TREE);
const auto& gt_params = static_cast<const gather_tree_params&>(params);

auto run_info = SetDefault(gt_params);
auto kernel_data = KernelData::Default<gather_tree_params>(params);
auto cldnn_jit = GetJitConstants(gt_params);
auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
FillCLKernelData(kernel_data.kernels[0], run_info, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 4);
kernel_data.estimatedTime = estimated_time;
return { kernel_data };
}
auto run_info = SetDefault(gt_params);
auto kernel_data = KernelData::Default<gather_tree_params>(params);
auto cldnn_jit = GetJitConstants(gt_params);
auto entry_point = GetEntryPoint(kernelName, gt_params.layerID, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
FillCLKernelData(kernel_data.kernels[0],
run_info,
params.engineInfo,
kernelName,
jit,
entry_point,
DEFAULT,
false,
false,
static_cast<int>(gt_params.inputs.size()));
kernel_data.estimatedTime = estimated_time;
return { kernel_data };
}
} // namespace kernel_selector
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,14 @@
namespace kernel_selector {
ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableAllInputLayout();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -14,30 +14,37 @@

#include "include/include_all.cl"

KERNEL(gather_tree_gpu_ref.cl)(
const __global UNIT_TYPE* step_input,
const __global UNIT_TYPE* parent_input,
const __global UNIT_TYPE* max_seq_len_input,
const __global UNIT_TYPE* end_token,
__global UNIT_TYPE* output)
KERNEL(gather_tree_gpu_ref)(
const __global INPUT0_TYPE* step_input,
const __global INPUT1_TYPE* parent_input,
const __global INPUT2_TYPE* max_seq_len_input,
const __global INPUT3_TYPE* end_token,
__global OUTPUT_TYPE* output)
{
const uint beam = get_global_id(0);
const uint batch = get_global_id(1);
const int beam = get_global_id(0);
const int batch = get_global_id(1);
/*
b -> time
f -> batch
y -> beam
*/
uint parent = beam;
for(int time = INPUT0_BATCH_NUM - 1; time >= 0; time--) {

while (time >= (uint)max_seq_len_input[batch]) {
output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = end_token[0];
time--;
}
output[OUTPUT_GET_INDEX(time, batch, beam, 0)] =
step_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
parent = (uint)parent_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
const int max_sequence_in_beam = min(INPUT0_BATCH_NUM, (int)max_seq_len_input[batch]);
int time;
for (time = INPUT0_BATCH_NUM - 1; time >= max_sequence_in_beam; time--) {
output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]);
}

for (int parent = beam; time >= 0; time--) {
output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = step_input[INPUT0_GET_INDEX(time, batch, parent, 0)];
parent = parent_input[INPUT1_GET_INDEX(time, batch, parent, 0)];
}
bool finished = false;
for (int time = 0; time < max_sequence_in_beam; time++) {
if (finished) {
output[OUTPUT_GET_INDEX(time, batch, beam, 0)] = TO_OUTPUT_TYPE(end_token[0]);
} else if (output[OUTPUT_GET_INDEX(time, batch, beam, 0)] == TO_OUTPUT_TYPE(end_token[0])) {
finished = true;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -15,29 +15,20 @@

#include "include/include_all.cl"

KERNEL(reverse_sequence_ref)(const __global UNIT_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global UNIT_TYPE* output)
KERNEL(reverse_sequence_ref)(const __global INPUT0_TYPE* input, const __global INPUT1_TYPE* seq_lengths, __global OUTPUT_TYPE* output)
{
const uint batch = get_global_id(0);
const uint feature = get_global_id(1);
const uint y = (uint)get_global_id(2) / INPUT0_SIZE_X;
const uint x = (uint)get_global_id(2) % INPUT0_SIZE_X;
uint dimensions[] = { batch, feature, y, x };

const uint input_index = INPUT0_OFFSET +
batch * INPUT0_BATCH_PITCH +
feature * INPUT0_FEATURE_PITCH +
y * INPUT0_Y_PITCH +
x * INPUT0_X_PITCH;
const uint input_index = INPUT0_GET_INDEX(batch, feature, y, x);

const uint length = (uint)seq_lengths[dimensions[BATCH_AXIS]];
if (dimensions[SEQ_AXIS] < length)
dimensions[SEQ_AXIS] = length - dimensions[SEQ_AXIS] - 1;

const uint output_index = OUTPUT_OFFSET +
dimensions[0] * OUTPUT_BATCH_PITCH +
dimensions[1] * OUTPUT_FEATURE_PITCH +
dimensions[2] * OUTPUT_Y_PITCH +
dimensions[3] * OUTPUT_X_PITCH;

const uint output_index = OUTPUT_GET_INDEX(dimensions[0], dimensions[1], dimensions[2], dimensions[3]);
output[output_index] = ACTIVATION(input[input_index], ACTIVATION_PARAMS);
}
8 changes: 5 additions & 3 deletions inference-engine/thirdparty/clDNN/src/gpu/gather_tree_gpu.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,9 +29,11 @@ struct gather_tree_gpu : typed_primitive_gpu_impl<gather_tree> {

static primitive_impl* create(const gather_tree_node& arg) {
auto b_params = get_default_params<kernel_selector::gather_tree_params>(arg, 1);
auto b_optional_params =
get_default_optional_params<kernel_selector::gather_tree_optional_params>(arg.get_program());
auto b_optional_params = get_default_optional_params<kernel_selector::gather_tree_optional_params>(arg.get_program());

for (size_t i = 1; i < arg.get_dependencies().size(); i++) {
b_params.inputs.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout(), 1));
}
auto desc = arg.get_primitive();

auto& kernel_selector = kernel_selector::gather_tree_kernel_selector::Instance();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -59,10 +59,11 @@ namespace detail {

attach_reverse_sequence_gpu::attach_reverse_sequence_gpu() {
auto val_fw = reverse_sequence_gpu::create;
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx),
val_fw);
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx),
val_fw);
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw);
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw);
implementation_map<reverse_sequence>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw);
}

} // namespace detail
Expand Down

0 comments on commit c437bcd

Please sign in to comment.