Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Optimize out Gather by converting to implicit crop #17743

Merged
merged 1 commit into from
Jun 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,72 @@ static bool can_reshape_be_optimized(const reshape_node& node) {
return node.is_in_place() && !node.has_fused_primitives();
}

// Returns true when the crop's implicit (in-place) optimization can represent
// the cropped region purely via feature-axis padding of its output layout.
// The check fails when the resulting lower/upper feature pads would violate a
// user's layout-alignment requirements, when the input already carries
// batch/spatial padding, or when any user runs on oneDNN (which does not
// support padded inputs).
static bool is_optimizable_padding_for_crop(const crop_node& node) {
    const auto& crop_layout = node.get_output_layout();
    auto input_layout = node.get_dependency(0).get_output_layout();
    auto crop_prim = node.get_primitive();
    auto opt_lower_pad = crop_prim->offsets.feature[0];
    auto opt_upper_pad = input_layout.feature() - crop_prim->offsets.feature[0] - crop_layout.get_tensor().feature[0];

    // Input padding check is loop-invariant (it does not depend on the user
    // node), so evaluate it once up front instead of once per user.
    if (input_layout.data_padding.lower_size().batch[0] != 0 || input_layout.data_padding.upper_size().batch[0] != 0 ||
        input_layout.data_padding.lower_size().spatial[0] != 0 || input_layout.data_padding.upper_size().spatial[0] != 0 ||
        input_layout.data_padding.lower_size().spatial[1] != 0 || input_layout.data_padding.upper_size().spatial[1] != 0)
        return false;

    // do not optimize crop if paddings are not properly aligned
    for (auto& usr : node.get_users()) {
        auto usr_layout = usr->get_output_layout();
        // b_fs_yx_fsv16 stores features in blocks of 16; the feature pads must
        // keep the data 16-aligned.
        if (usr_layout.format == format::b_fs_yx_fsv16 &&
            (opt_lower_pad % 16 != 0 || opt_upper_pad % 16 != 0))
            return false;

        // oneDNN doesn't support paddings
        if (usr->get_preferred_impl_type() == impl_types::onednn)
            return false;
    }

    return true;
}

static bool can_crop_be_optimized_along_feature(const crop_node& node) {
const auto& crop_layout = node.get_output_layout();
auto format = crop_layout.format;
auto input_layout = node.get_dependency(0).get_output_layout();
const auto& crop_size = crop_layout.get_tensor();
const auto& out_pad = crop_layout.data_padding;

if (format == format::bfyx && crop_size.batch[0] == input_layout.batch() &&
crop_size.spatial[0] == input_layout.spatial(0) &&
crop_size.spatial[1] == input_layout.spatial(1) && out_pad.lower_size().feature[0] == 0 &&
out_pad.upper_size().feature[0] == 0 && out_pad.lower_size().batch[0] == 0 &&
out_pad.upper_size().batch[0] == 0 && out_pad.lower_size().spatial[0] == 0 &&
out_pad.lower_size().spatial[1] == 0 && out_pad.upper_size().spatial[0] == 0 &&
out_pad.upper_size().spatial[1] == 0) {
return true;
}

return false;
}

static bool can_crop_be_optimized_along_batch(const crop_node& node) {
const auto& crop_layout = node.get_output_layout();
auto format = crop_layout.format;
auto input_layout = node.get_dependency(0).get_output_layout();
const auto crop_shape = crop_layout.get_ordered_dims();
const auto input_shape = input_layout.get_ordered_dims();
const auto& in_padding = input_layout.data_padding;
const auto& out_padding = crop_layout.data_padding;

// Check format's order is 'bxxx' and only batch size is different
if (format::is_simple_data_format(format) && format::traits(format)._order[0] == 0 &&
std::equal(input_shape.begin()+1, input_shape.end(), crop_shape.begin()+1) &&
!out_padding && !in_padding) {
return true;
}

return false;
}

static void propagate_padding_to_opt_out_users(program_node& node, cldnn::padding padding_data) {
if (padding_data == cldnn::padding())
return;
Expand Down Expand Up @@ -366,6 +432,7 @@ void prepare_buffer_fusing::run(program& p) {

if (!can_optimize(node))
continue;

// zero copy
program_helpers::do_for_types<crop>(*node, [&p](crop_node& node) {
// if the node is marked as network output, prevent optimizations which would affect a form of its output,
Expand All @@ -392,56 +459,38 @@ void prepare_buffer_fusing::run(program& p) {
if (p.is_loop_body() && node.get_dependency(0).is_type<lstm_elt>()) {
return;
}
// optimization is available for cropping across depth(features) only

// optimization is available for cropping across depth(features) or batch
// if output padding has defined padding across features already it wouldn't
// work because it expect to have zeros in the padded area.
if (!is_optimizable_padding_for_crop(node))
return;

const auto& crop_layout = node.get_output_layout();
auto format = crop_layout.format;
auto crop_prim = node.get_primitive();
auto input_layout = node.get_input_layout(0);
const auto& crop_size = crop_layout.get_tensor();
const auto& out_padd = crop_layout.data_padding;
auto opt_lower_pad = crop_prim->offsets.feature[0];
auto opt_upper_pad = input_layout.feature() - crop_prim->offsets.feature[0] - crop_size.feature[0];

// do not optimize crop if paddings are not properly aligned
for (auto& usr : node.get_users()) {
auto usr_layout = usr->get_output_layout();
if (usr_layout.format == format::b_fs_yx_fsv16 &&
(opt_lower_pad % 16 != 0 || opt_upper_pad % 16 != 0))
return;
if (input_layout.data_padding.lower_size().batch[0] != 0 || input_layout.data_padding.upper_size().batch[0] != 0 ||
input_layout.data_padding.lower_size().spatial[0] != 0 || input_layout.data_padding.upper_size().spatial[0] != 0 ||
input_layout.data_padding.lower_size().spatial[1] != 0 || input_layout.data_padding.upper_size().spatial[1] != 0)
return;
// oneDNN doesn't support paddings
if (usr->get_preferred_impl_type() == impl_types::onednn)
return;
}

if (format == format::bfyx && crop_size.batch[0] == input_layout.batch() &&
crop_size.spatial[0] == input_layout.spatial(0) &&
crop_size.spatial[1] == input_layout.spatial(1) && out_padd.lower_size().feature[0] == 0 &&
out_padd.upper_size().feature[0] == 0 && out_padd.lower_size().batch[0] == 0 &&
out_padd.upper_size().batch[0] == 0 && out_padd.lower_size().spatial[0] == 0 &&
out_padd.lower_size().spatial[1] == 0 && out_padd.upper_size().spatial[0] == 0 &&
out_padd.upper_size().spatial[1] == 0) {
// Regular crop
// crop input buffer
// |___________data____________|
//
// crop output buffer
// |-------->| offsets[f] |<--|
// |_____data____|
// <------------>
// reference size
//
// In-place crop
// crop output buffer
// |_low_pad_|__data_size__|___|<-upper pad
const auto& out_pad = crop_layout.data_padding;
auto input_layout = node.get_input_layout(0);
auto crop_prim = node.get_primitive();

// feature num of pad should be accumulated if dep has been optimized out.
// Regular crop
// crop input buffer
// |___________data____________|
//
// crop output buffer
// |-------->| offsets[f] |<--|
// |_____data____|
// <------------>
// reference size
//
// In-place crop
// crop output buffer
// |_low_pad_|__data_size__|___|<-upper pad
if (can_crop_be_optimized_along_feature(node)) {
auto crop_prim = node.get_primitive();
auto opt_lower_pad = crop_prim->offsets.feature[0];
auto opt_upper_pad = input_layout.feature() - crop_prim->offsets.feature[0] - crop_size.feature[0];
auto& dep = node.get_dependency(0);
// feature num of pad should be accumulated if dep has been optimized out.
if (dep.is_type<crop>() && dep.can_be_optimized()) {
auto dep_pad = dep.get_output_layout().data_padding;
OPENVINO_ASSERT(
Expand All @@ -454,18 +503,36 @@ void prepare_buffer_fusing::run(program& p) {
opt_upper_pad += dep_pad.upper_size().feature[0];
}

// set padding
node.set_output_padding(
padding({out_padd.lower_size().batch[0],
opt_lower_pad,
out_padd.lower_size().spatial[0],
out_padd.lower_size().spatial[1]},
{out_padd.upper_size().batch[0],
opt_upper_pad,
out_padd.upper_size().spatial[0],
out_padd.upper_size().spatial[1]}));
node.can_be_optimized(true);
propagate_padding_to_opt_out_users(node, node.get_output_layout().data_padding);
padding({out_pad.lower_size().batch[0],
opt_lower_pad,
out_pad.lower_size().spatial[0],
out_pad.lower_size().spatial[1]},
{out_pad.upper_size().batch[0],
opt_upper_pad,
out_pad.upper_size().spatial[0],
out_pad.upper_size().spatial[1]}));
} else if (can_crop_be_optimized_along_batch(node)) {
auto crop_prim = node.get_primitive();
auto opt_lower_pad = crop_prim->offsets.batch[0];
auto opt_upper_pad = input_layout.batch() - crop_prim->offsets.batch[0] - crop_size.batch[0];

auto new_padding = padding({opt_lower_pad,
out_pad.lower_size().feature[0],
out_pad.lower_size().spatial[0],
out_pad.lower_size().spatial[1]},
{opt_upper_pad,
out_pad.upper_size().feature[0],
out_pad.upper_size().spatial[0],
out_pad.upper_size().spatial[1]});
node.set_output_padding(new_padding);
} else {
return;
}

node.can_be_optimized(true);
propagate_padding_to_opt_out_users(node, node.get_output_layout().data_padding);
}
});
}
Expand Down
56 changes: 44 additions & 12 deletions src/plugins/intel_gpu/src/plugin/ops/gather.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

#include "intel_gpu/plugin/program.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "transformations/utils/utils.hpp"

#include "ngraph/op/gather.hpp"

#include "intel_gpu/primitives/gather.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/reshape.hpp"
#include "intel_gpu/primitives/crop.hpp"

using namespace InferenceEngine;
namespace ov {
Expand Down Expand Up @@ -44,12 +46,13 @@ void CreateGatherOpBase(Program& p, const std::shared_ptr<T>& op, const int64_t
}

// Dynamic path will do shape infer internally, so no need to pass valid out shape for that case
ov::Shape out_shape = op->get_output_partial_shape(0).is_static() ? op->get_output_shape(0) : ov::Shape{};
bool is_static = op->get_output_partial_shape(0).is_static();
ov::Shape out_shape = is_static ? op->get_output_shape(0) : ov::Shape{};

// Update output_shape in case of scalar indice
bool need_reshape = false;
auto out_shape_original = out_shape;
if (!p.use_new_shape_infer() && op->get_output_partial_shape(0).is_static()) {
if (!p.use_new_shape_infer() && is_static) {
auto input1_shape = op->get_input_shape(1);
if (input1_shape.size() == 0 && batch_dim == 0) {
need_reshape = true;
Expand Down Expand Up @@ -77,21 +80,50 @@ void CreateGatherOpBase(Program& p, const std::shared_ptr<T>& op, const int64_t
}
}

// gather
// Set layer name for Gather
auto reshapeName = layerName + "";
if (need_reshape) {
layerName = layerName + "_reshape_output";
}

auto gatherPrim = cldnn::gather(layerName,
reordered_inputs[0],
reordered_inputs[1],
axis,
out_shape,
batch_dim,
support_neg_ind);

p.add_primitive(*op, gatherPrim);
// Check if Gather could be converted to other primitive
const auto input_shape = op->get_input_partial_shape(0);
const auto input_rank = input_shape.rank().get_length();
const auto& indices = op->input_value(1);
if (is_static && axis == 0 && input_rank > 1 && indices.get_partial_shape().rank().get_length() == 0 &&
std::equal(input_shape.begin()+1, input_shape.end(), out_shape.begin()+1)) {
// Gather -> Crop
// this Gather simply divides an input tensor along Batch axis
auto get_crop_layer_name = [&](std::string name, size_t idx)->std::string {
return (name + "/crop_" + std::to_string(idx));
};

// Get indices info to calculate offset
const auto& indices_node = indices.get_node_shared_ptr();
auto indices_constant = std::dynamic_pointer_cast<ngraph::op::v0::Constant>(indices_node);
float result = 0.f;
ov::op::util::get_single_value(indices_constant, result);

// Set tensors for crop shape and offset
InferenceEngine::SizeVector start_offset(input_shape.size());
start_offset[0] = static_cast<size_t>(result);
auto offsetTensor = tensor_from_dims(start_offset, 0);
auto outTensor = tensor_from_dims(out_shape, 1);

// Create Crop
layerName = get_crop_layer_name(layerName, static_cast<size_t>(result));
auto cropPrim = cldnn::crop(layerName, reordered_inputs[0], outTensor, offsetTensor);
p.add_primitive(*op, cropPrim);
} else {
auto gatherPrim = cldnn::gather(layerName,
reordered_inputs[0],
reordered_inputs[1],
axis,
out_shape,
batch_dim,
support_neg_ind);
p.add_primitive(*op, gatherPrim);
}

// Add reorder and reshape for scalar indice
if (need_reshape) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -559,4 +559,23 @@ INSTANTIATE_TEST_SUITE_P(
GatherLayerTest::getTestCaseName
);

// Parameters for Gather with a scalar index along axis 0 (axis = 0,
// batch_dim = 0, empty indices shape): the case the GPU plugin converts into
// an implicit crop instead of a gather primitive.
const auto GatherAxes0Optimized = []() {
    return testing::Combine(testing::ValuesIn({std::vector<size_t>{4, 8, 2, 2}}),   // input shape
                            testing::ValuesIn({std::vector<size_t>{}}),             // scalar (rank-0) indices
                            testing::ValuesIn({std::tuple<int, int>{0, 0}}),        // {axis, batch_dim}
                            testing::ValuesIn(netPrecisionsFP32),
                            testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            testing::Values(InferenceEngine::Layout::ANY),
                            testing::Values(InferenceEngine::Layout::ANY),
                            testing::Values(CommonTestUtils::DEVICE_GPU));
};

// Smoke coverage for the Gather-to-crop conversion path on GPU.
INSTANTIATE_TEST_SUITE_P(
        smoke_Gather7Axes0Optimized,
        Gather8IndiceScalarLayerTest,
        GatherAxes0Optimized(),
        Gather8IndiceScalarLayerTest::getTestCaseName
);

} // namespace
Loading