[GPU] Fuse more 'type conversion only' reorders (openvinotoolkit#25270)
### Details:
 - A reorder that only converts the element data type can now be fused into the node that produces its input

### Tickets:
 - 144957
yeonbok authored Jul 3, 2024
1 parent 626966b commit 1a26e6f
Showing 15 changed files with 252 additions and 59 deletions.
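For context: in the GPU plugin graph, a `reorder` primitive can change format, layout, padding, or element data type. This commit targets reorders that change only the element type and folds that conversion into the producer node, starting with the `remove_redundant_reorders` pass in the first diff below. A minimal sketch of the predicate involved — a hypothetical standalone form, not the plugin's exact `reorder_node::is_type_conversion_only()` API:

```cpp
#include "intel_gpu/runtime/layout.hpp"

// Sketch: a reorder is "type conversion only" when the input and output
// layouts match in every respect except the element data type.
bool is_type_conversion_only(const cldnn::layout& in, const cldnn::layout& out) {
    return in.format == out.format &&
           in.get_partial_shape() == out.get_partial_shape() &&
           in.data_type != out.data_type;
}
```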
@@ -11,6 +11,10 @@
#include "convert_color_inst.h"
#include "one_hot_inst.h"
#include "shape_of_inst.h"
#include "gather_inst.h"
#include "select_inst.h"
#include "eltwise_inst.h"
#include "broadcast_inst.h"
#include "permute_inst.h"
#include "depth_to_space_inst.h"
#include "concatenation_inst.h"
@@ -410,8 +414,11 @@ void remove_redundant_reorders::run(program& p) {
continue;

bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
bool allowed_dt_conversion_fuse = (input.is_type<one_hot>() || input.is_type<permute>() || input.is_type<mvn>() || input.is_type<concatenation>() ||
input.is_type<depth_to_space>() || input.is_type<region_yolo>() || input.is_type<detection_output>());
bool allowed_dt_conversion_fuse =
(input.is_type<one_hot>() || input.is_type<permute>() || input.is_type<mvn>() ||
input.is_type<concatenation>() || input.is_type<depth_to_space>() || input.is_type<region_yolo>() ||
input.is_type<detection_output>() || input.is_type<gather>() || input.is_type<broadcast>() ||
input.is_type<select>() || input.is_type<eltwise>());
if (!same_data_type && !allowed_dt_conversion_fuse)
continue;

@@ -426,8 +433,10 @@ void remove_redundant_reorders::run(program& p) {
auto old_output_layout_of_input = input.get_output_layout();
input.set_output_layout(output_layout, false);
if (input.type()->does_possible_implementation_exist(input)) {
// Add fused_primitive_desc of reorder to the previous node which propagates original output layout during shape inference
if (input.is_type<mvn>() || input.is_type<concatenation>()) {
// Add fused_primitive_desc of reorder to the previous node which propagates original output layout
// during shape inference
if (input.is_type<mvn>() || input.is_type<concatenation>() || input.is_type<gather>() ||
input.is_type<broadcast>() || input.is_type<select>() || input.is_type<eltwise>()) {
fused_primitive_desc local_desc(node.get_primitive());
local_desc.f_param = node.get_fuse_params();
local_desc.total_num_deps = node.get_dependencies().size();
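The `fused_primitive_desc` built here records the reorder on the producer so that shape inference keeps propagating the producer's original output layout while the node reports the reorder's output type. A plausible continuation of the block — the fields and call beyond those shown in the diff are assumptions based on how fused descriptors are typically populated in this pass:

```cpp
// Sketch (assumed fields/API beyond what the diff shows): remember both
// layouts, then attach the descriptor to the producer node.
fused_primitive_desc local_desc(node.get_primitive());
local_desc.f_param = node.get_fuse_params();
local_desc.total_num_deps = node.get_dependencies().size();
local_desc.input_layout = old_output_layout_of_input;   // assumed field
local_desc.output_layout = output_layout;               // assumed field
input.add_fused_primitive(local_desc);                  // assumed API
```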
14 changes: 12 additions & 2 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -13,6 +13,7 @@
#include "reshape_inst.h"
#include "arg_max_min_inst.h"
#include "shape_of_inst.h"
#include "select_inst.h"
#include "condition_inst.h"
#include "strided_slice_inst.h"
#include <sstream>
@@ -33,6 +34,7 @@
#include "prior_box_inst.h"
#include "scatter_nd_update_inst.h"
#include "gather_inst.h"
#include "broadcast_inst.h"
#include "loop_inst.h"
#include "dft_inst.h"
#include "to_string_utils.h"
@@ -428,10 +430,18 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node
bool allow_new_shape_infer = node.get_program().is_new_shape_infer();
// Because the node types listed below have cross-layout-capable kernels, a reorder that only performs
// type conversion can be fused into the previous node even in a dynamic shape case
if ((prev.is_type<mvn>() || prev.is_type<concatenation>()) &&
if ((prev.is_type<mvn>() || prev.is_type<concatenation>() || prev.is_type<gather>() || prev.is_type<broadcast>() ||
prev.is_type<select>() || prev.is_type<eltwise>()) &&
!prev.is_in_shape_of_subgraph() && node.is_type_conversion_only() &&
(format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)))
(format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)) &&
// If the prev node is a backedge of the loop, fusing the reorder would change its type.
// We could avoid only that case if we could check whether the current node is a backedge of the network.
// However, no such handle exists yet. (To be done in the future, when we need to optimize out the
// type-converting reorders in the body network.)
!node.get_program().is_body_program() &&
prev.get_preferred_impl_type() != cldnn::impl_types::cpu) {
return true;
}

if (prev.is_dynamic() || (!node.get_users().empty() && node.get_users().front()->is_dynamic()))
return false;
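Condensing the new condition in `can_fuse_reorder_to_prev`, the fusion gate reads as follows — a restatement of the diff above, not new logic:

```cpp
// The producer list now includes gather/broadcast/select/eltwise; fusion is
// still refused inside shape_of subgraphs, for non-simple formats, inside a
// loop body program (backedge types must not change), and for CPU impls.
bool fuse_type_conversion_reorder =
    (prev.is_type<mvn>() || prev.is_type<concatenation>() || prev.is_type<gather>() ||
     prev.is_type<broadcast>() || prev.is_type<select>() || prev.is_type<eltwise>()) &&
    !prev.is_in_shape_of_subgraph() &&
    node.is_type_conversion_only() &&
    format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next) &&
    !node.get_program().is_body_program() &&
    prev.get_preferred_impl_type() != cldnn::impl_types::cpu;
```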
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/select.cpp
@@ -39,6 +39,9 @@ std::vector<layout> select_inst::calc_output_layouts(const select_node& /*node*/

auto desc = impl_param.typed_desc<select>();
auto dt = desc->output_data_types[0].value_or(input1_layout.data_type);
if (impl_param.has_fused_primitives()) {
dt = impl_param.get_output_element_type();
}

ov::op::v1::Select op;
op.set_auto_broadcast(desc->broadcast_spec);
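The same override shows up wherever a fused reorder can change a node's output type: `calc_output_layouts` must prefer the fused chain's element type over the primitive's declared one. As a generic sketch of the recurring pattern, assuming `ov::element::Type` is the data-type handle flowing through:

```cpp
// Sketch: fused post-ops (here, a type-converting reorder) dictate the final
// output element type during shape inference.
template <typename ImplParams>
ov::element::Type resolve_output_dtype(const ImplParams& impl_param, ov::element::Type own_dt) {
    return impl_param.has_fused_primitives() ? impl_param.get_output_element_type()
                                             : own_dt;
}
```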
@@ -253,6 +253,7 @@ inline uint FUNC(get_idx_pos)(OPTIONAL_SHAPE_INFO_ARG uint out_b, uint out_f, ui
#define VLOAD CAT(vload, VEC_SIZE)
#define VSTORE CAT(vstore,VEC_SIZE)
#define INPUT0_VTYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
#define OUTPUT_VTYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)

KERNEL(broadcast_gpu_ref)(
OPTIONAL_SHAPE_INFO_ARG
@@ -322,15 +323,18 @@ KERNEL(broadcast_gpu_ref)(
uint output_idx = out_pos;
unroll_for(uint j = 0; j < y_nums; j++) {
unroll_for(uint i = 0; i < x_stride; i++) {
output[output_idx + i] = input[idx_pos + i];
output[output_idx + i] = TO_OUTPUT_TYPE(input[idx_pos + i]);
}
output_idx += OUTPUT_SIZE_X;
}
} else {
uint output_idx = out_pos;
INPUT0_VTYPE input_vec = VLOAD(0, &input[idx_pos]);
unroll_for(uint i = 0; i < y_nums; i++) {
VSTORE(input_vec, 0, &output[output_idx]);
OUTPUT_VTYPE out_v;
for (int j = 0; j < VEC_SIZE; ++j)
out_v[j] = TO_OUTPUT_TYPE(input_vec[j]);
VSTORE(out_v, 0, &output[output_idx]);
output_idx += OUTPUT_SIZE_X;
}

@@ -339,7 +343,7 @@ KERNEL(broadcast_gpu_ref)(

output_idx = out_pos;
unroll_for(uint i = 0; i < y_nums; i++) {
output[output_idx + x_stride] = input_val;
output[output_idx + x_stride] = TO_OUTPUT_TYPE(input_val);
output_idx += OUTPUT_SIZE_X;
}
}
@@ -375,7 +379,7 @@ KERNEL(broadcast_gpu_ref)(
const uint out_pos = OUTPUT_GET_INDEX(out_b, out_f, out_y, out_x);
const uint idx_pos = FUNC_CALL(get_idx_pos)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_y, out_x);
#endif
output[out_pos] = input[idx_pos];
output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
}
}
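Since input and output element types can now differ, every store in the broadcast kernel goes through `TO_OUTPUT_TYPE`, including the vectorized path, where each lane is converted before `VSTORE`. A host-side C++ analogue of that lane-wise conversion — illustrative only, not kernel code:

```cpp
#include <array>
#include <cstddef>

// Analogue of the kernel change: convert each lane of a loaded vector to the
// output element type before storing (mirrors out_v[j] = TO_OUTPUT_TYPE(...)).
template <typename In, typename Out, std::size_t N>
std::array<Out, N> convert_lanes(const std::array<In, N>& v) {
    std::array<Out, N> out{};
    for (std::size_t j = 0; j < N; ++j)
        out[j] = static_cast<Out>(v[j]);
    return out;
}
```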

@@ -192,6 +192,6 @@ KERNEL(eltwise)(
#if QUANTIZATION_TERM && !OUTPUT_IS_FP
output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(out, ACTIVATION_PARAMS));
#else
output[output_offset] = ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED);
output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED));
#endif
}
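The eltwise kernel makes the same adjustment: the quantized path already saturate-casts, and the general path now applies a plain convert so a fused dtype change is honored. In C++ terms the two casts differ roughly like this — an analogue of the macros, not the kernel code itself:

```cpp
#include <algorithm>
#include <cstdint>

// ~ TO_OUTPUT_TYPE_SAT: clamp into the representable range, then convert.
inline int8_t to_i8_saturated(float v) {
    return static_cast<int8_t>(std::clamp(v, -128.0f, 127.0f));
}

// ~ TO_OUTPUT_TYPE: plain conversion; correctness relies on the value range.
template <typename Out>
inline Out to_output_type(float v) { return static_cast<Out>(v); }
```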
@@ -45,7 +45,7 @@ KERNEL(select)(
uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
#endif

const OUTPUT_TYPE res = select(INPUT_2, INPUT_1, MASK);
const OUTPUT_TYPE res = TO_OUTPUT_TYPE(select(INPUT_2, INPUT_1, MASK));

output[output_offset] = res;
}
@@ -25,6 +25,7 @@ ParamsKey BroadcastKernelRef::GetSupportedKey() const {
k.EnableAllInputLayout();
k.EnableAllOutputLayout();

k.EnableDifferentTypes();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
@@ -14,5 +14,8 @@ class BroadcastKernelRef : public BroadcastKernelBase {
KernelsData GetKernelsData(const Params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::REORDER };
}
};
} // namespace kernel_selector
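Each kernel opts into the new fusion by listing `FusedOpType::REORDER` in `GetSupportedFusedOps()`; the graph pass only fuses the reorder when the selected kernel advertises support, as the header diffs below repeat for eltwise, gather, and select. The pattern, using a hypothetical kernel name:

```cpp
// Sketch: advertising REORDER is what lets the remove_redundant_reorders pass
// attach a type-converting reorder to this kernel as a fused op.
std::vector<FusedOpType> SomeKernelRef::GetSupportedFusedOps() const {
    return { FusedOpType::QUANTIZE,
             FusedOpType::ACTIVATION,
             FusedOpType::ELTWISE,
             FusedOpType::REORDER };  // newly added
}
```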
@@ -19,7 +19,8 @@ class EltwiseKernelRef : public EltwiseKernelBase {
return {
FusedOpType::QUANTIZE,
FusedOpType::ACTIVATION,
FusedOpType::ELTWISE
FusedOpType::ELTWISE,
FusedOpType::REORDER
};
}

@@ -37,7 +37,8 @@ class GatherKernelRef : public KernelBaseOpenCL {
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE,
FusedOpType::ELTWISE,
FusedOpType::ACTIVATION };
FusedOpType::ACTIVATION,
FusedOpType::REORDER };
}

protected:
@@ -15,6 +15,10 @@ class SelectKernelRef : public SelectKernelBase {
KernelsData GetKernelsData(const Params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::REORDER };
}


protected:
bool Validate(const Params& p) const override;
48 changes: 35 additions & 13 deletions src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp
@@ -30,7 +30,7 @@ struct eltwise_test_params {

class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
public:
void execute(eltwise_test_params& p) {
void execute(eltwise_test_params& p, bool count_reorder = false) {
auto input_prim = get_mem(get_input_layout(p));
auto input_prim2 = get_mem(get_input_layout2(p));

@@ -45,7 +45,7 @@ class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
network_not_fused.set_input_data("input2", input_prim2);
}

compare(network_not_fused, network_fused, p);
compare(network_not_fused, network_fused, p, count_reorder);
}

layout get_input_layout(eltwise_test_params& p) {
@@ -545,21 +545,27 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_no_pitches_same_dims_quantize, ::t
eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 },
}));

class eltwise_activation : public EltwiseFusingTest {};
TEST_P(eltwise_activation, basic) {
class eltwise_activation_reorder : public EltwiseFusingTest {};
TEST_P(eltwise_activation_reorder, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
input_layout("input2", get_input_layout2(p)),
eltwise("eltwise", { input_info("input"), input_info("input2") }, p.mode, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu, { 6.0f, 0.0f }),
reorder("out", input_info("activation"), p.default_format, data_types::f32)
);
create_topologies(input_layout("input", get_input_layout(p)),
input_layout("input2", get_input_layout2(p)),
eltwise("eltwise", {input_info("input"), input_info("input2")}, p.mode, p.default_type),
activation("activation", input_info("eltwise"), activation_func::relu, {6.0f, 0.0f}),
reorder("out",
input_info("activation"),
p.default_format,
data_types::f32,
std::vector<float>(),
cldnn::reorder_mean_mode::subtract,
cldnn::padding(),
true));

tolerance = default_tolerance(p.input_type);
execute(p);
execute(p, true);
}

class eltwise_activation : public EltwiseFusingTest {};
TEST_P(eltwise_activation, fp16_out) {
auto p = GetParam();
create_topologies(
@@ -574,6 +580,21 @@ TEST_P(eltwise_activation, fp16_out) {
execute(p);
}

INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation_reorder, ::testing::ValuesIn(std::vector<eltwise_test_params>{
eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_3, 4, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 5 },
eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 4, 5 }
}));
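In these rows the two trailing integers are the expected primitive counts for the fused and unfused networks, and with `count_reorder = true` the trailing reorder is included in the tally — which is why the reorder variants expect one more primitive than the matching `eltwise_activation` rows below. A sketch of what `compare()` checks, with assumed member and helper names following the `BaseFusingTest` pattern:

```cpp
// Sketch (assumed names): both networks run, then executed-primitive counts
// are compared against the per-case expectations.
void compare(network& not_fused, network& fused, eltwise_test_params& p, bool count_reorder) {
    auto cnt = [&](network& net) {
        return count_executed_primitives(net, count_reorder);  // hypothetical helper
    };
    ASSERT_EQ(cnt(fused), p.expected_fused_primitives);          // assumed field
    ASSERT_EQ(cnt(not_fused), p.expected_not_fused_primitives);  // assumed field
    // an element-wise output comparison within `tolerance` follows
}
```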

INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(std::vector<eltwise_test_params>{
eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 },
eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 },
@@ -590,6 +611,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(st
}));



class eltwise_quantize_fs_b_yx_fsv32 : public EltwiseFusingTest {};
TEST_P(eltwise_quantize_fs_b_yx_fsv32, fusing_eltwise_quantize_layout) {
auto p = GetParam();
@@ -649,4 +671,4 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_quantize_fs_b_yx_fsv32_exception,
eltwise_test_params{ CASE_ELTWISE_FP16_B_FS_YX, 6, 6 },
eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_FS_B, 6, 6 },
eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_B_FS, 6, 6 },
}));
15 changes: 9 additions & 6 deletions src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp
@@ -32,7 +32,7 @@ struct gather_test_params {

class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
public:
void execute(gather_test_params& p, bool is_dynamic = false) {
void execute(gather_test_params& p, bool is_dynamic = false, bool count_reorder = false) {
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
cfg_fused.set_property(ov::intel_gpu::optimize_data(true));
@@ -50,7 +50,7 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
network_not_fused.set_input_data("eltwise_data", elt_input_prim);
}

compare(network_not_fused, network_fused, p);
compare(network_not_fused, network_fused, p, count_reorder);
}

layout get_input_layout(gather_test_params& p, bool is_dynamic = false) {
@@ -119,6 +119,8 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
#define CASE_GATHER_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 3, 1, 1 }, { 3, 2, 2, 3, 2 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx
#define CASE_GATHER_5D_FP16_5 { 1, 1, 2, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 1, 1, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx

#define CASE_GATHER_INT8_1 { 2, 3, 4, 1 }, { 4 }, { 4, 3, 4, 1 }, 0, data_types::i8, format::bfyx, data_types::f32, format::bfyx

class gather_quantize : public GatherPrimitiveFusingTest {};
TEST_P(gather_quantize, basic) {
auto p = GetParam();
@@ -223,14 +225,15 @@ TEST_P(gather_eltwise_activation_dynamic, basic) {
gather("gather_prim", input_info("input"), input_info("gather_indices"), p.axis, p.dictionary_shape.size(), p.out_shape),
activation("activation", input_info("gather_prim"), activation_func::abs),
eltwise("eltwise", { input_info("activation"), input_info("eltwise_data") }, eltwise_mode::prod),
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32, std::vector<float>(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true)
);

tolerance = 1e-5f;
execute(p, true);
execute(p, true, true);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_eltwise_activation_dynamic, ::testing::ValuesIn(std::vector<gather_test_params>{
gather_test_params{ CASE_GATHER_FP32_6, 4, 6 },
gather_test_params{ CASE_GATHER_FP16_6, 4, 6 },
gather_test_params{ CASE_GATHER_FP16_7, 4, 6 },
gather_test_params{ CASE_GATHER_FP16_6, 4, 7 },
gather_test_params{ CASE_GATHER_FP16_7, 5, 8 },
gather_test_params{ CASE_GATHER_INT8_1, 4, 7 },
}));
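The raised expectations reflect the extra reorder in the count, and the new `CASE_GATHER_INT8_1` exercises the dtype-converting path directly: reading the macro against its neighbors, it appears to define an i8 `bfyx` input with an f32 default type, so the trailing reorder is a pure i8-to-f32 conversion.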