diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index 1e5f943600fc05..ac7810c6e9154c 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -295,6 +295,12 @@ void remove_redundant_reorders::run(program& p) {
         auto o_layout = r_node.get_output_layout();
         const auto& i_layout = r_node.get_input_layout(0);
 
+        auto is_r_node_rank_changed = r_node.get_output_layout().get_rank() != r_node.get_dependency(0).get_output_layout().get_rank();
+        if (is_r_node_rank_changed &&
+            ((!update_implementations && r_node.get_dependency(0).is_type<crop>()) ||
+             (r_node.get_dependency(0).is_type<crop>() && r_node.get_dependency(0).can_be_optimized())))
+            continue;
+
         // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer,
         // but pads need to be handled correctly.
         if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() &&
diff --git a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
index 9a4cb71450a53c..0eb425b4dc1119 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp
@@ -192,9 +192,9 @@ TEST(add_required_reorders, skip_adding_reorder_batch_axis_padding) {
     crop_prim = network.get_primitive("crop2");
     ASSERT_EQ(crop_prim->can_be_optimized(), true);
     auto reorder_prim = network.get_primitive("crop1_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     reorder_prim = network.get_primitive("crop2_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     auto concate = network.get_primitive("concat");
     ASSERT_EQ(concate->can_be_optimized(), false);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index 456fab4ae0286a..1eb11c662608e0 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -1224,7 +1224,7 @@ TEST(prepare_buffer_fusing, test_implicit_crop_and_outerpadding) {
     auto reorder_prim = network.get_primitive("gather1_reorder");
     ASSERT_EQ(reorder_prim->can_be_optimized(), true);
     reorder_prim = network.get_primitive("gather2_reorder");
-    ASSERT_EQ(reorder_prim->can_be_optimized(), true);
+    ASSERT_EQ(reorder_prim->can_be_optimized(), false);
     auto reshape_prim = network.get_primitive("reshape1");
     ASSERT_EQ(reshape_prim->can_be_optimized(), true);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
index 8ade3b6c8e0f31..0f9f119f275a78 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
@@ -2467,6 +2467,99 @@ TEST(reorder_gpu_f32, bfzyx_to_bsv16_fsv16_padded)
     }
 }
 
+TEST(reorder_gpu_f32, bfzyx_to_bfyx_padded) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+
+    const int32_t b_in = 1024;
+    const int32_t f_in = 64;
+    const int32_t x_in = 72;
+    const int32_t y_in = 2;
+    const int32_t z_in = 3;
+
+    const int32_t b_crop = 1024;
+    const int32_t f_crop = 64;
+    const int32_t x_crop = 72;
+    const int32_t y_crop = 2;
+    const int32_t z_crop = 1;
+
+    const int32_t z0_off = 0;
+    const int32_t z1_off = 1;
+    const int32_t z2_off = 2;
+
+    auto input = engine.allocate_memory({ data_types::f32,format::bfzyx,{ b_in, f_in, x_in, y_in, z_in } });
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(crop("crop0", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z0_off }));
+    topology.add(crop("crop1", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z1_off }));
+    topology.add(crop("crop2", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z2_off }));
+    topology.add(reorder("reorder0", input_info("crop0"), format::bfyx, data_types::f32));
+    topology.add(reorder("reorder1", input_info("crop1"), format::bfyx, data_types::f32));
+    topology.add(reorder("reorder2", input_info("crop2"), format::bfyx, data_types::f32));
+    topology.add(reshape("reshape0", input_info("reorder0"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+    topology.add(reshape("reshape1", input_info("reorder1"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+    topology.add(reshape("reshape2", input_info("reorder2"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in))));
+
+    std::vector<float> input_vec = rg.generate_random_1d<float>(input->count(), -10, 10);
+    set_values(input, input_vec);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+    auto output0 = outputs.at("reshape0").get_memory();
+    auto output1 = outputs.at("reshape1").get_memory();
+    auto output2 = outputs.at("reshape2").get_memory();
+
+    cldnn::mem_lock<float> output_ptr0(output0, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z0_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr0[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+
+    cldnn::mem_lock<float> output_ptr1(output1, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z1_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr1[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+
+    cldnn::mem_lock<float> output_ptr2(output2, get_test_stream());
+    for (int b = 0; b < b_crop; ++b) {
+        for (int f = 0; f < f_crop; ++f) {
+            for (int z = 0; z < z_crop; ++z) {
+                for (int y = 0; y < y_crop; ++y) {
+                    for (int x = 0; x < x_crop; ++x) {
+                        int linear_id = x + x_in * (y + y_in * (z + z2_off + z_in * (f + f_in * b)));
+                        int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b)));
+                        ASSERT_EQ(output_ptr2[output_linear_id], input_vec[linear_id]);
+                    }
+                }
+            }
+        }
+    }
+}
+
 TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed) {
     auto& engine = get_test_engine();
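
For context, not part of the patch: the new guard in remove_redundant_reorders keeps a rank-changing reorder (here bfzyx -> bfyx) from being marked as optimized when its crop dependency is executed in place, so the reorder still performs a real copy of the cropped plane; that is why the can_be_optimized expectations above flip to false. Below is a minimal standalone C++ sketch of why a plain buffer reinterpretation would read the wrong elements in that situation. The small shapes, the variable names, and the "dense reinterpretation" model of a removed reorder are illustrative assumptions only, not code from the repository.

// Standalone sketch (not part of the patch). Shapes are hypothetical and small.
#include <cstdio>
#include <vector>

int main() {
    // bfzyx parent shape and a single-z-plane crop starting at z_off,
    // mirroring the structure of the bfzyx_to_bfyx_padded test.
    const int b_in = 2, f_in = 3, z_in = 3, y_in = 2, x_in = 4;
    const int z_off = 1;
    const int b_c = b_in, f_c = f_in, z_c = 1, y_c = y_in, x_c = x_in;

    // Fill the parent buffer with its own linear index so values identify positions.
    std::vector<int> parent(b_in * f_in * z_in * y_in * x_in);
    for (size_t i = 0; i < parent.size(); ++i)
        parent[i] = static_cast<int>(i);

    // An executed reorder copies the cropped plane densely; the unit test asserts
    // exactly the src -> dst mapping computed below. A removed ("reinterpret only")
    // reorder would instead read the parent buffer densely from the crop's base
    // offset, ignoring the z planes that the in-place crop leaves between feature slices.
    int mismatches = 0;
    const int base = x_in * y_in * z_off;  // offset of the cropped plane within one (b, f) slice
    for (int b = 0; b < b_c; ++b)
        for (int f = 0; f < f_c; ++f)
            for (int y = 0; y < y_c; ++y)
                for (int x = 0; x < x_c; ++x) {
                    int src = x + x_in * (y + y_in * (z_off + z_in * (f + f_in * b)));  // correct source element
                    int dst = x + x_c * (y + y_c * (z_c * (f + f_c * b)));              // dense bfyx position
                    int naive = base + dst;  // dense reinterpretation of the parent buffer
                    if (parent[naive] != parent[src])
                        ++mismatches;
                }

    // Non-zero for every feature slice after the first: the dense reinterpretation
    // reads the wrong elements, so the reorder must stay a real copy whenever its
    // crop dependency stays in place.
    std::printf("mismatching elements: %d\n", mismatches);
    return 0;
}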