Skip to content

Commit

Permalink
[IE CLDNN] Bugfix in eltwise ref and optimizing for fsv4 layout (open…
Browse files Browse the repository at this point in the history
  • Loading branch information
byungilm authored and yekruglov committed Jun 7, 2021
1 parent c0fbbb5 commit a5b4afb
Show file tree
Hide file tree
Showing 8 changed files with 857 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "eltwise_kernel_b_fs_yx_fsv4.h"
#include "kernel_selector_utils.h"
#include <algorithm>
#include <string>
#include <vector>

namespace kernel_selector {
// Forward declaration of a file-local helper (its definition is at the end of this namespace).
// It is called from GetJitConstants() to choose between the DO_FEATURE_BROADCAST and DO_VLOAD
// macros for an operand.
static inline bool InputHasFeatureBroadcast(const eltwise_params& params, const size_t op_num, const size_t input_idx);

// Builds the capability key for this kernel: F16/F32/INT8/UINT8 on both inputs and outputs
// (mixing types is allowed), b_fs_yx_fsv4 layout on both sides, plus batching, tensor
// pitches/offsets and eltwise broadcast.
ParamsKey EltwiseKernel_b_fs_yx_fsv4::GetSupportedKey() const {
    ParamsKey key;

    // The same data types are enabled on the input and on the output side.
    for (const auto data_type : {Datatype::F16, Datatype::F32, Datatype::INT8, Datatype::UINT8}) {
        key.EnableInputDataType(data_type);
        key.EnableOutputDataType(data_type);
    }

    // Only the fsv4 blocked layout is accepted, for inputs and output alike.
    key.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    key.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);

    key.EnableDifferentTypes();
    key.EnableBatching();
    key.EnableTensorPitches();
    key.EnableTensorOffset();
    key.EnableEltwiseBroadcast();

    return key;
}

// Produces the kernel description (JIT source, work-group sizes and argument list) for the
// given parameters. Returns an empty list when Validate() rejects the parameters.
KernelsData EltwiseKernel_b_fs_yx_fsv4::GetKernelsData(const Params& params, const optional_params& options) const {
    if (!Validate(params, options))
        return {};

    // Work on the copy of the parameters owned by the kernel data object.
    KernelData kernel_data = KernelData::Default<eltwise_params>(params);
    eltwise_params& ew_params = *static_cast<eltwise_params*>(kernel_data.params.get());

    auto entry_point = GetEntryPoint(kernelName, ew_params.layerID, options);
    auto jit_constants = GetJitConstants(ew_params);
    auto jit = CreateJit(kernelName, jit_constants, entry_point);

    DispatchData dispatch = SetDefault(ew_params);

    auto& kernel = kernel_data.kernels[0];
    kernel.workGroups.global = dispatch.gws;
    kernel.workGroups.local = dispatch.lws;
    kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);

    const auto input_count = static_cast<uint32_t>(ew_params.inputs.size());
    kernel.arguments = GetArgsDesc(input_count, false, false, GetFusedPrimitiveInputsCount(params));

    return {kernel_data};
}

// Returns a fixed priority for this kernel; both arguments are ignored.
// NOTE(review): FORCE_PRIORITY_1 is presumably the most preferred level, so this kernel would win
// whenever Validate() accepts the parameters -- confirm against the KernelsPriority definitions.
KernelsPriority EltwiseKernel_b_fs_yx_fsv4::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
return FORCE_PRIORITY_1;
}

// Protected
// Decides whether this kernel can execute the given eltwise parameters.
//
// On top of the base-class checks, the parameters are accepted only when:
//   * there is at least one input;
//   * the physical size of the output is a multiple of vec_size;
//   * every input is either in b_fs_yx_fsv4 layout or holds exactly one logical element;
//   * the feature padding in front of every input and of the output is a multiple of vec_size;
//   * every input with the same logical size as input 0 also matches input 0 in layout, padded
//     value, view offset and per-dimension size/padding/pitch (the data type may differ).
//
// Returns true when all conditions hold, false otherwise.
bool EltwiseKernel_b_fs_yx_fsv4::Validate(const Params& params, const optional_params& o) const {
    if (!EltwiseKernelBase::Validate(params, o)) {
        return false;
    }

    const auto& ewParams = static_cast<const eltwise_params&>(params);
    const auto& inputs = ewParams.inputs;
    const auto& output = ewParams.output;

    // inputs[0] is used as the reference tensor below, so it has to exist.
    if (inputs.empty()) {
        return false;
    }

    if (output.PhysicalSize() % vec_size != 0) {
        return false;
    }

    for (const auto& input : inputs) {
        if (input.GetLayout() != DataLayout::b_fs_yx_fsv4 && input.LogicalSize() != 1) {
            return false;
        }
    }

    // Bound by reference: the original copied the whole tensor just to read from it.
    const auto& reference = inputs[0];

    // Check that padding before features doesn't misalign the blocks
    if (reference.Feature().pad.before % vec_size != 0 || output.Feature().pad.before % vec_size != 0) {
        return false;
    }

    // Compares everything except the data type.
    const auto sameGeometry = [](const DataTensor& lhs, const DataTensor& rhs) -> bool {
        const auto& lhs_dims = lhs.GetDims();
        const auto& rhs_dims = rhs.GetDims();

        if (lhs.GetLayout() != rhs.GetLayout() ||
            lhs.GetPaddedVal() != rhs.GetPaddedVal() ||
            lhs.GetViewOffset() != rhs.GetViewOffset() ||
            lhs_dims.size() != rhs_dims.size()) {
            return false;
        }

        for (size_t d = 0; d < lhs_dims.size(); d++) {
            if (lhs_dims[d].v != rhs_dims[d].v ||
                lhs_dims[d].pad.before != rhs_dims[d].pad.before ||
                lhs_dims[d].pad.after != rhs_dims[d].pad.after ||
                lhs_dims[d].pitch != rhs_dims[d].pitch) {
                return false;
            }
        }
        return true;
    };

    for (size_t i = 1; i < inputs.size(); i++) {
        // Inputs with a different logical size are broadcast and need not match input 0.
        if (inputs[i].LogicalSize() == reference.LogicalSize() && !sameGeometry(inputs[i], reference)) {
            return false;
        }
        if (inputs[i].Feature().pad.before % vec_size != 0) {
            return false;
        }
    }

    return true;
}

// Builds the JIT macros that tell the kernel how to obtain each operand of each operation.
//
// For operand `input_idx` of operation `op_num` the macro INPUT_<op_num>_<input_idx> is defined
// according to the operand mode:
//   * SCALAR                        -> the scalar value itself;
//   * INPUT_BUFFER                  -> a 4-wide temporary, filled by an accompanying
//                                      DO_VLOAD<op>_<idx> or DO_FEATURE_BROADCAST<op>_<idx> macro;
//   * OUTPUT_BUFFER                 -> "output[off]";
//   * UNORDERED_ACCESS_INPUT_BUFFER -> the input buffer indexed by an intermediate result;
//   * INTERMEDIATE_RESULTS_INDEX    -> the intermediate result tmp<tmpIndex>.
// Any other mode produces no constant.
//
// The second parameter (useVload8) is ignored by this kernel.
// Returns the set of generated constants.
JitConstants EltwiseKernel_b_fs_yx_fsv4::MakeLoadJitConstants(const eltwise_params& params, bool /*useVload8*/) const {
    JitConstants jit = {};
    const std::string vec4_type = "MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)";

    for (size_t op_num = 0; op_num < params.operations.size(); op_num++) {
        const std::string op_num_str = std::to_string(op_num);
        const auto& ew = params.operations[op_num];

        for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) {
            const auto& input = ew.inputs[input_idx];
            const std::string input_idx_str = std::to_string(input_idx);
            const std::string name = "INPUT_" + op_num_str + "_" + input_idx_str;

            switch (input.mode) {
                case EltwiseInputMode::SCALAR:
                    jit.AddConstant(MakeJitConstant(name, input.scalar));
                    break;
                case EltwiseInputMode::INPUT_BUFFER: {
                    // input.index selects the buffer in params.inputs; input_idx is only the
                    // position of the operand inside this operation. The two may differ.
                    const std::string buffer_idx_str = std::to_string(input.index);
                    const auto& tensor = params.inputs[input.index];

                    const std::string idx_order = "INPUT" + buffer_idx_str + "_IDX_ORDER";
                    jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*4, y, x"));

                    const std::string operand_suffix = op_num_str + "_" + input_idx_str;
                    const std::string tmp_a = "tmp_a" + operand_suffix;

                    if (tensor.LogicalSize() == 1) {
                        // Single-element tensor: replicate element 0 into all four lanes.
                        const std::string vload_name = "DO_VLOAD" + operand_suffix;
                        const std::string vload_value = "\\\n\t" + vec4_type + " " + tmp_a + " = (" + vec4_type + ")" +
                                                        "(input" + buffer_idx_str + "[0])";
                        jit.AddConstant(MakeJitConstant(vload_name, vload_value));
                        jit.AddConstant(MakeJitConstant(name, tmp_a));
                    } else {
                        // Fix: the tensor examined here must be the one addressed by input.index.
                        // The original read params.inputs[input_idx], which inspects the wrong
                        // tensor (or reads out of range) when operand position and buffer index
                        // are not the same.
                        const bool feature_broadcasting = (tensor.Feature().v == 1 && params.output.Feature().v != 1);

                        if (feature_broadcasting) {
                            // One value per feature block: read it once and splat it across the vector.
                            // NOTE(review): the temporary is named after the operation only, so two
                            // feature-broadcast operands in one operation would clash -- confirm
                            // that this cannot happen.
                            const std::string tmp_b = "tmp_b" + op_num_str;
                            const std::string broadcast_name = "DO_FEATURE_BROADCAST" + operand_suffix;
                            const std::string broadcast_value = "\\\n\t" + vec4_type + " " + tmp_b + " = (" + vec4_type + ")" +
                                                                "(input" + buffer_idx_str +
                                                                "[GET_INDEX(INPUT, " + buffer_idx_str + ", " + idx_order + ")]);";

                            jit.AddConstant(MakeJitConstant(broadcast_name, broadcast_value));
                            jit.AddConstant(MakeJitConstant(name, tmp_b));
                        } else {
                            // Regular case: load four consecutive values with vload4 and convert
                            // them to the accumulator type.
                            const std::string vload_name = "DO_VLOAD" + operand_suffix;
                            const std::string vload_value = "\\\n\t" + vec4_type + " " + tmp_a +
                                                            " = TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " +
                                                            std::to_string(vec_size) + "), vload4(0, &input" + buffer_idx_str +
                                                            "[GET_INDEX(INPUT," + buffer_idx_str + ", " + idx_order + ")]));";

                            jit.AddConstant(MakeJitConstant(vload_name, vload_value));
                            jit.AddConstant(MakeJitConstant(name, tmp_a));
                        }
                    }
                    break;
                }
                case EltwiseInputMode::OUTPUT_BUFFER:
                    jit.AddConstant(MakeJitConstant(name, "output[off]"));
                    break;
                case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER:
                    jit.AddConstant(MakeJitConstant(
                        name,
                        "input" + std::to_string(input.index) + "[(size_t)tmp" + std::to_string(input.tmpIndex) + "]"));
                    break;
                case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX:
                    jit.AddConstant(MakeJitConstant(name, "tmp" + std::to_string(input.tmpIndex)));
                    break;
                default:
                    break;
            }
        }
    }

    return jit;
}

// Collects every JIT constant needed to compile the fsv4 eltwise kernel: base tensor
// definitions, accumulator type, block geometry, operand load macros, the per-operation macros,
// the DO_ELTWISE sequence that chains them, and the optional activation / fused-op /
// quantization / broadcast switches.
JitConstants EltwiseKernel_b_fs_yx_fsv4::GetJitConstants(const eltwise_params& params) const {
    JitConstants jit = MakeBaseParamsJitConstants(params);

    // The shared helpers take a vload8 switch; this kernel always passes false.
    const bool use_vload8 = false;
    const auto block_size = vec_size;
    const auto& out = params.output;

    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
    jit.AddConstant(MakeJitConstant("BLOCK_SIZE", block_size));
    jit.AddConstant(MakeJitConstant("BLOCKS_COUNT", CeilDiv(out.X().v, block_size)));

    jit.Merge(MakeInputDeclsJitConstants(params, use_vload8));
    jit.Merge(MakeLoadJitConstants(params, use_vload8));
    jit.Merge(GetOperationsJitConstants(params, use_vload8, vec_size));

    // DO_ELTWISE: for each operation, first the load macros of its buffer/scalar operands,
    // then the operation itself; finally the last intermediate result becomes `res`.
    const auto& ops = params.operations;
    std::string eltwise_body;
    for (size_t op = 0; op < ops.size(); ++op) {
        const std::string op_str = std::to_string(op);
        const auto& operands = ops[op].inputs;

        for (size_t arg = 0; arg < operands.size(); ++arg) {
            const auto mode = operands[arg].mode;
            const bool needs_load = mode == EltwiseInputMode::INPUT_BUFFER || mode == EltwiseInputMode::SCALAR;
            if (!needs_load)
                continue;

            const char* load_macro = InputHasFeatureBroadcast(params, op, arg) ? "DO_FEATURE_BROADCAST" : "DO_VLOAD";
            eltwise_body += std::string("\\\n\t") + load_macro + op_str + "_" + std::to_string(arg) + ";";
        }

        eltwise_body += "\\\n\tOPERATION" + op_str + ";";
    }
    eltwise_body += "\\\n\tres = tmp" + std::to_string(ops.size() - 1) + ";";

    jit.AddConstant(MakeJitConstant("DO_ELTWISE", eltwise_body));

    const bool tensor_friendly_groups = params.layoutBased || params.int8_quantization || params.broadcast;
    if (tensor_friendly_groups)
        jit.Merge(GetTensorFriendlyWorkGroupsJit(out));

    if (!params.stride.empty())
        jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1));

    jit.Merge(MakeActivationJitConstants(params.activations, out.GetDType(), "_TYPED"));

    // Feature count not divisible by 4: tell the kernel how many features the last block holds.
    const auto leftover_features = out.Feature().v % 4;
    if (leftover_features != 0)
        jit.AddConstant(MakeJitConstant("LEFTOVERS", leftover_features));

    if (!params.fused_ops.empty()) {
        // Fused operations consume the 4-wide `res` vector, vectorized along the feature axis.
        kernel_selector::Datatype fused_input_type = GetAccumulatorType(params);
        std::vector<std::string> fused_idx_order = {"b", "f_block*4", "y", "x"};
        FusedOpsConfiguration fused_conf = {"", fused_idx_order, "res", fused_input_type, static_cast<size_t>(vec_size)};
        fused_conf.vec_axis = Tensor::DataChannelName::FEATURE;
        jit.Merge(MakeFusedOpsJitConstants(params, {fused_conf}));
    }

    jit.AddConstant(MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization));
    jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size));

    if (params.broadcast) {
        // ELTWISE_BROADCAST is emitted only when no input is a single-element tensor.
        const bool has_single_element_input =
            std::any_of(params.inputs.begin(), params.inputs.end(),
                        [](const DataTensor& tensor) { return tensor.LogicalSize() == 1; });

        if (!has_single_element_input)
            jit.AddConstant(MakeJitConstant("ELTWISE_BROADCAST", params.broadcast));
    }

    return jit;
}

// Computes the dispatch sizes: global dimension 0 covers the spatial positions (X * Y),
// dimension 1 the feature blocks of four, dimension 2 the batches. The local size is taken from
// GetOptimalLocalWorkGroupSizes() and then forced to 1 along dimensions 1 and 2.
EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv4::SetDefault(const eltwise_params& params) const {
    const auto& out = params.output;

    DispatchData dispatch;
    dispatch.gws[0] = out.X().v * out.Y().v;
    dispatch.gws[1] = CeilDiv(out.Feature().v, 4);
    dispatch.gws[2] = out.Batch().v;

    dispatch.lws = GetOptimalLocalWorkGroupSizes(dispatch.gws, params.engineInfo);
    // Keep only the spatial dimension of the suggested local size.
    dispatch.lws[1] = 1;
    dispatch.lws[2] = 1;

    return dispatch;
}

// Local
// Tells whether operand `input_idx` of operation `op_num` has to be broadcast along the feature
// axis: it must be an INPUT_BUFFER operand whose tensor holds more than one logical element and
// has a single feature, while the output has more than one feature.
//
// params    - eltwise parameters holding the operations, the input tensors and the output.
// op_num    - index into params.operations.
// input_idx - position of the operand inside that operation.
// Returns true when the feature broadcast is required, false otherwise.
static inline bool InputHasFeatureBroadcast(const eltwise_params& params, const size_t op_num, const size_t input_idx) {
    const auto& input = params.operations[op_num].inputs[input_idx];
    if (input.mode != EltwiseInputMode::INPUT_BUFFER) {
        return false;
    }

    // Fix: the tensor is addressed by the operand's buffer index (input.index), not by the
    // operand's position in the operation. The original used params.inputs[input_idx], which
    // looks at the wrong tensor (or out of range) whenever the two differ.
    const auto& tensor = params.inputs[input.index];

    return tensor.LogicalSize() != 1 &&
           tensor.Feature().v == 1 &&
           params.output.Feature().v != 1;
}
} // namespace kernel_selector
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (c) 2021 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "eltwise_kernel_base.h"

namespace kernel_selector {
// Eltwise kernel implementation registered under the kernel name "eltwise_b_fs_yx_fsv4".
// It overrides the selection hooks (supported key, priority, validation) and the JIT/dispatch
// generation hooks of EltwiseKernelBase.
class EltwiseKernel_b_fs_yx_fsv4 : public EltwiseKernelBase {
public:
// Passes the kernel name to the base class.
EltwiseKernel_b_fs_yx_fsv4() : EltwiseKernelBase("eltwise_b_fs_yx_fsv4") {}
virtual ~EltwiseKernel_b_fs_yx_fsv4() {}

// Builds the kernel description for the given parameters.
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
// Priority reported for this kernel.
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
// Data types, layouts and features this kernel declares support for.
ParamsKey GetSupportedKey() const override;
// Fused operation kinds this kernel reports as supported: quantize, activation, scale, eltwise.
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return {
FusedOpType::QUANTIZE,
FusedOpType::ACTIVATION,
FusedOpType::SCALE,
FusedOpType::ELTWISE
};
}

protected:
// Returns true when the parameters can be handled by this kernel.
bool Validate(const Params& p, const optional_params& o) const override;
// Generates the per-operand load macros.
JitConstants MakeLoadJitConstants(const eltwise_params& params, bool useVload8) const override;
// Generates the complete set of JIT constants for the kernel.
JitConstants GetJitConstants(const eltwise_params& params) const override;
// Computes global/local work-group sizes.
DispatchData SetDefault(const eltwise_params& params) const override;
// NOTE(review): declared here, but no definition is visible next to the other members --
// presumably a leftover debugging helper; confirm it is defined somewhere or remove it.
void PrintWorkSize(const DispatchData& dis);

// Vector width used by the kernel (4); it is a per-instance constant, not a static one.
const int vec_size = 4;
};
} // namespace kernel_selector
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "eltwise_kernel_fs_b_yx_fsv32.h"
#include "eltwise_kernel_b_fs_yx_fsv16.h"
#include "eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.h"
#include "eltwise_kernel_b_fs_yx_fsv4.h"

namespace kernel_selector {
eltwise_kernel_selector::eltwise_kernel_selector() {
Expand All @@ -16,6 +17,7 @@ eltwise_kernel_selector::eltwise_kernel_selector() {
Attach<EltwiseKernel_fs_b_yx_fsv32>();
Attach<EltwiseKernel_mixed_byxf_and_fs_b_yx_fsv32>();
Attach<EltwiseKernel_b_fs_yx_fsv16>();
Attach<EltwiseKernel_b_fs_yx_fsv4>();
}

KernelsData eltwise_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@ bool EltwiseKernel_vload8::Validate(const Params& params, const optional_params&
for (size_t i = 0; i < ewParams.inputs.size(); i++) {
if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv4 && ewParams.inputs[i].Feature().v % 8 != 0) ||
ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;
}
if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv4 && ewParams.output.Feature().v % 8 != 0) ||
ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;

Expand Down
Loading

0 comments on commit a5b4afb

Please sign in to comment.