From e110479442bf22c8292d722f07d405a0e38ac58f Mon Sep 17 00:00:00 2001 From: Wilson Seok Date: Thu, 25 Jul 2024 18:05:05 -0700 Subject: [PATCH] [GPU] Add condition for dynamic shape split_lengths for in place crop buffer fusing (#25595) ### Details: - Add condition for dynamic shape split_lengths for in place crop buffer fusing ### Tickets: - 146739 --- src/plugins/intel_gpu/src/graph/crop.cpp | 2 +- .../graph_optimizer/prepare_buffer_fusing.cpp | 8 ++ .../passes/prepare_buffer_fusing_test.cpp | 85 +++++++++++++++++++ 3 files changed, 94 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/crop.cpp b/src/plugins/intel_gpu/src/graph/crop.cpp index 146a1fa89b400b..09c5f01f216e57 100644 --- a/src/plugins/intel_gpu/src/graph/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/crop.cpp @@ -250,7 +250,7 @@ crop_inst::typed_primitive_inst(network& network, crop_node const& node) : paren "Invalid Batch offset: exceeds data for output!"); } - if (node.can_be_optimized()) { + if (!node.is_dynamic() && node.can_be_optimized()) { update_output_memory(); } } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index e3471b37c05bd9..7f1fb69446edb9 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -500,6 +500,14 @@ bool crop_in_place_optimization::match(const program_node& node, if (node.is_constant()) return false; + // Do not optimize a variadic_split crop when either input1 or input2 is not constant. + // VariadicSplit ngraph shape inference requires the values of axis (input1) and split_lengths (input2). + // Non-constant input1/input2 makes runtime buffer fusing risky to execute. 
+ auto& crop_node = node.as(); + if ((crop_node.get_primitive()->op_mode == cldnn::crop_ngraph_op_mode::variadic_split) && + (!crop_node.get_dependency(1).is_constant() || !crop_node.get_dependency(2).is_constant())) + return false; + if (node.get_users().size() > 0) { if (node.get_program().is_body_program() && node.get_dependency(0).is_type()) { return false; diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp index e4a077594c7a7e..e5506388eba273 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp @@ -777,6 +777,91 @@ TEST(prepare_buffer_fusing, in_place_crop_dynamic) { ASSERT_EQ(output_ptr_3[i], out3[i]); } +TEST(prepare_buffer_fusing, in_place_crop_dynamic_split_lengths) { + auto& engine = get_test_engine(); + + auto in_layout = layout{ ov::PartialShape{-1, -1, -1}, data_types::f32, format::bfyx}; + auto in2_layout = layout{ ov::PartialShape{-1, -1}, data_types::f32, format::bfyx}; + auto input_mem = engine.allocate_memory({ {1, 2, 4}, data_types::f32, format::bfyx }); + auto weights_mem = engine.allocate_memory({ {8, 4}, data_types::u8, format::bfyx }); + auto bias_mem = engine.allocate_memory({ {1, 1, 8}, data_types::f32, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {8, 1}, data_types::f32, format::bfyx }); + auto zp_mem = engine.allocate_memory({ {8, 1}, data_types::f32, format::bfyx }); + auto axis_mem = engine.allocate_memory({ {}, data_types::i64, format::bfyx }); + auto shapeof_mem = engine.allocate_memory({ {2, 6}, data_types::f32, format::bfyx }); + + int64_t axis = 2; + set_values(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f, + 0.5f, -2.0f, -0.5f, -1.0f }); + set_values(axis_mem, {axis}); + set_values(shapeof_mem, { 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f}); + set_values(weights_mem, { 1, 2, 3, 
4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 0, + 15, 14, 13, 12, + 11, 10, 9, 8, + 7, 6, 5, 4, + 3, 2, 1, 0}); + set_values(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f }); + set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f }); + set_values(zp_mem, { 1.0f, 2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 2.0f }); + + std::vector out1 = { 13.f, 58.f, -11.f, -62.f }; + std::vector out2 = { -51.f, -108.f, 18.5f, -18.f, 1.f, -4.f, 57.f, 100.f, -8.5f, 6.f, 13.f, 8.f }; + std::vector out3 = { 13.f, 58.f, -51.f, -108.f, 18.5f, -18.f, 1.f, -4.f, -11.f, -62.f, 57.f, 100.f, -8.5f, 6.f, 13.f, 8.f }; + + cldnn::crop_ngraph_op_mode op_mode = cldnn::crop_ngraph_op_mode::variadic_split; + topology topology( + input_layout("input", in_layout), + input_layout("input_shapeof", in2_layout), + data("axis", axis_mem), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("zp", zp_mem), + fully_connected("fc", input_info("input"), "weights", "bias", "scale", "zp", data_types::f32, 3, 2), + shape_of("shapeof", input_info("input_shapeof"), cldnn::data_types::i64), + crop("crop1", { input_info("fc"), input_info("axis"), input_info("shapeof") }, cldnn::tensor(1), cldnn::tensor(0), op_mode, 0, axis), + reorder("output1", input_info("crop1"), format::bfyx, data_types::f32), + crop("crop2", { input_info("fc"), input_info("axis"), input_info("shapeof") }, cldnn::tensor(1), cldnn::tensor(0), op_mode, 1, axis), + reshape("reshape", input_info("crop2"), true, std::vector{0, 0, 3, 2}, ov::PartialShape{-1, -1, 3, 2}, cldnn::reshape::reshape_mode::base), + reorder("output2", input_info("reshape"), format::bfyx, data_types::f32, std::vector(), reorder_mean_mode::subtract, padding(), true), + reorder("output3", input_info("fc"), format::bfyx, data_types::f32) + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + 
config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + + network.set_input_data("input", input_mem); + network.set_input_data("input_shapeof", shapeof_mem); + + std::map outputs; + EXPECT_NO_THROW(outputs = network.execute()); + + auto output = outputs.at("output1").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + for (size_t i = 0; i < out1.size(); i++) + ASSERT_EQ(output_ptr[i], out1[i]); + + auto output_2 = outputs.at("output2").get_memory(); + cldnn::mem_lock output_ptr_2(output_2, get_test_stream()); + + for (size_t i = 0; i < out2.size(); i++) + ASSERT_EQ(output_ptr_2[i], out2[i]); + + auto output_3 = outputs.at("output3").get_memory(); + cldnn::mem_lock output_ptr_3(output_3, get_test_stream()); + + for (size_t i = 0; i < out3.size(); i++) + ASSERT_EQ(output_ptr_3[i], out3[i]); +} + // Testing for implicit crop along batch axis and outer padding optimzing. // Outer padding opt includes opt out of reshape and reorder which has padded input only in batch axis // This optimzing also includes offset(outer axis padded input) handling of oneDNN primitive.