From 3c6b8ca84b1dc8806e51e4cd6b060b34318efeeb Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 1 Feb 2024 19:57:25 +0400 Subject: [PATCH] [GPU] Exclude fused dependencies from GEMM's output shape canonicalization func (#22543) Fix zp/scales data types conversion in FC bf tiled kernel --- src/plugins/intel_gpu/src/graph/gemm.cpp | 2 +- .../fully_connected_gpu_bf_tiled.cl | 5 +- .../fully_connected_gpu_bf_tiled_common.cl | 4 +- .../tests/unit/fusions/gemm_fusion_test.cpp | 48 ++++++++++++ .../test_cases/fully_connected_gpu_test.cpp | 75 ++++++++++++++++++- 5 files changed, 128 insertions(+), 6 deletions(-) /* Reviewer notes (patch notes area, not part of the commit message): (1) gemm.cpp: the output-shape merge loop is bounded by primitive->input_size() instead of input_layouts.size(). (2) fully_connected_gpu_bf_tiled.cl and fully_connected_gpu_bf_tiled_common.cl: the block reads of decompression_scale / decompression_zp use DECOMPRESSION_SCALE_TYPE / DECOMPRESSION_ZP_TYPE and the result is converted with TO_ACCUMULATOR_VEC_TYPE. (3) Tests: adds gemm_2in_dynamic_add and compressed_int8_scale_zp_bias. In test_compressed_int8_scale_zp_bias the input is filled with two values, matching its {1, 1, 2} layout; the listed expected results depend on those two values only, and hunk line counts are unaffected. NOTE(review): template argument lists appear to be missing in this copy (for example, the -/+ lines of the last fully_connected_gpu_test.cpp hunk are textually identical) - confirm against the upstream commit before applying. */ diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index 99e8fd7d674a29..51f984bb6666ea 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -202,7 +202,7 @@ layout gemm_inst::transform_output_layout(const std::shared_ptr prim auto N = input1_pshape[n_idx]; auto output_pshape = input_layouts[0].get_partial_shape(); - for (size_t i = 0; i != input_layouts.size(); ++i) { + for (size_t i = 0; i != primitive->input_size(); ++i) { auto input_pshape = input_layouts[i].get_partial_shape(); for (size_t j = 0; j != input_pshape.size(); ++j) { ov::Dimension::merge(output_pshape[j], output_pshape[j], input_pshape[j]); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index fe23ffb9d90a32..766fdfb554cb33 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -55,6 +55,7 @@ #define TO_OUTPUT_VEC_TYPE(x) CAT(convert_, OUTPUT_VEC_TYPE)(x) #define TO_ACTIVATION_VEC_TYPE(x) CAT(convert_, ACTIVATION_VEC_TYPE)(x) #define TO_FILTER_VEC_TYPE(x) CAT(convert_, FILTER_VEC_TYPE)(x) +#define 
TO_ACCUMULATOR_VEC_TYPE(x) CAT(convert_, ACCUMULATOR_VEC_TYPE)(x) #define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, TILE_IFM, ptr, offset) #define FILTER_BLOCK_READ(ptr, offset) BLOCK_READN(FILTER_TYPE, TILE_K_OFM_PACKED, ptr, offset) @@ -184,7 +185,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f); + ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f)); #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 ACCUMULATOR_VEC_TYPE d_scale = 0; unroll_for(uint of = 0; of < TILE_OFM; ++of) { @@ -201,7 +202,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f); + ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f)); #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 ACCUMULATOR_VEC_TYPE d_zp = 0; unroll_for(uint of = 0; of < TILE_OFM; ++of) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl index 98fe1d1082d3c8..962ea0fb39806c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl @@ 
-52,7 +52,7 @@ inline void (FUNC_NAME)( #if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f); + ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f)); #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 ACCUMULATOR_VEC_TYPE d_scale = 0; unroll_for(uint of = 0; of < TILE_OFM; ++of) { @@ -69,7 +69,7 @@ inline void (FUNC_NAME)( #if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f); + ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f)); #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 ACCUMULATOR_VEC_TYPE d_zp = 0; unroll_for(uint of = 0; of < TILE_OFM; ++of) { diff --git a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp index 68d444094f331b..6780b65f67c3be 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp @@ -118,6 +118,7 @@ class GemmFusingTest : public ::BaseFusingTest { #define CASE_GEMM_2IN_FP16_3 { { 1, 1, 64, 64 }, { 1, 1, 64, 64 } }, { 1, 1, 64, 64 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx #define CASE_GEMM_2IN_FP16_4 { { 1, 2, 128, 64 }, { 1, 2, 64, 256 } }, { 1, 2, 128, 256 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx #define 
CASE_GEMM_2IN_FP16_5 { { 2, 3, 2, 2 }, { 2, 3, 2, 2 } }, { 2, 3, 2, 2 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_2IN_FP16_3D_1 { { 16, 8, 64 }, { 16, 64, 8 }, { 16, 1, 8 } }, { 16, 8, 8 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx #define CASE_GEMM_2IN_FP16_5D_1 { { 2, 3, 5, 6, 4 }, { 2, 3, 5, 4, 6} }, { 2, 3, 5, 6, 6 }, data_types::f16, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx #define CASE_GEMM_2IN_FP16_6D_1 { { 2, 3, 2, 3, 5, 7 }, { 2, 3, 2, 3, 7, 5 } }, { 2, 3, 2, 3, 5, 5 }, data_types::f16, data_types::f16, data_types::f16, format::bfwzyx, data_types::f16, format::bfwzyx @@ -406,6 +407,53 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_add, ::testing::ValuesIn(std::vec gemm_test_params{ CASE_GEMM_2IN_FP16_6D_1, 3, 4, "", broadcast_kinds::feature, eltwise_mode::sub }, })); +class gemm_2in_dynamic_add : public gemm_2in_add {}; +TEST_P(gemm_2in_dynamic_add, add) { + auto p = GetParam(); + + if (engine.get_device_info().supports_immad) + p.expected_fused_primitives++; + + cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + auto eltwise_layout = get_output_layout(p); + auto eltwise_shape = ov::PartialShape::dynamic(eltwise_layout.get_partial_shape().size()); + if (p.broadcast_kind == broadcast_kinds::batch) + eltwise_shape[0] = 1; + else if (p.broadcast_kind == broadcast_kinds::feature) + eltwise_shape[1] = 1; + eltwise_layout.set_partial_shape(eltwise_shape); + + auto in_layout0 = get_input_layout(p, 0); + auto in_layout1 = get_input_layout(p, 1); + + auto in0_pshape = ov::PartialShape::dynamic(p.in_shapes[0].size()); + in0_pshape[2] = p.in_shapes[0][2]; + auto in1_pshape = ov::PartialShape::dynamic(p.in_shapes[1].size()); + in1_pshape[1] = p.in_shapes[1][1]; + + in_layout0.set_partial_shape(in0_pshape); + 
in_layout1.set_partial_shape(in1_pshape); + + create_topologies( + input_layout("input0", in_layout0), + input_layout("input1", in_layout1), + input_layout("input2", eltwise_layout), + gemm("gemm_prim", { input_info("input0"), input_info("input1") }, data_types::f32, false, false, 1.f, 0.f, in0_pshape.size(), in1_pshape.size()), + eltwise("add_prim", { input_info("gemm_prim"), input_info("input2") }, p.eltwise_m, p.default_type), + reorder("reorder_bfyx", input_info("add_prim"), p.default_format, data_types::f32) + ); + + tolerance = default_tolerance(p.default_type); + execute(p, true); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_dynamic_add, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::batch, eltwise_mode::sum }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::feature, eltwise_mode::sum }, +})); + class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; TEST_P(gemm_2in_act_scale_quantize_i8, basic) { // TODO: Fix me, refer PR(#15873) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 3962a9f1f6ec16..4c3d741a499bf4 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1131,6 +1131,75 @@ class fully_connected_gpu_tests: public ::testing::Test { ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 5.0) << "i = " << i; } + void test_compressed_int8_scale_zp_bias(bool is_caching_test) { + auto& engine = get_test_engine(); + + auto input_mem = engine.allocate_memory({ {1, 1, 2}, data_types::f16, format::bfyx }); + auto weights_mem = engine.allocate_memory({ {32, 2}, data_types::u8, format::bfyx }); + auto bias_mem = engine.allocate_memory({ {1, 1, 32}, data_types::f16, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {32, 1}, 
data_types::f32, format::bfyx }); + auto zp_mem = engine.allocate_memory({ {32, 1}, data_types::f32, format::bfyx }); + + set_values(input_mem, { -0.5f, 2.0f }); /* two values: input_mem is allocated above with shape {1, 1, 2} */ + set_values(weights_mem, { 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 0, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 0, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 }); + + set_values(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f, + 2.0f, -1.0f, 4.0f, -3.0f, 6.0f, -5.0f, 8.0f, 3.0f, + 0.0f, -3.0f, 2.0f, -5.0f, 4.0f, -5.0f, 6.0f, 1.0f, + 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f }); + set_values(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f, + 3.0f, 5.0f, -1.0f, -3.0f, 1.5f, 0.5f, 3.0f, 3.0f, + 1.0f, 5.0f, -1.0f, -3.0f, -0.5f, -1.5f, 1.0f, 1.0f, + 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f }); + set_values(zp_mem, { 1.0f, 2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 2.0f, + 2.0f, 3.0f, 1.0f, 0.0f, 3.0f, 1.0f, 2.0f, 2.0f, + 1.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 6.0f, 1.0f, + 1.0f, 1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 5.0f, 2.0f }); + + topology topology( + input_layout("input", input_mem->get_layout()), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("zp", zp_mem), + fully_connected("fc_prim", input_info("input"), "weights", "bias", "scale", "zp", data_types::f16, padding(), 3, 2) + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = 
outputs.begin()->second.get_memory(); + + cldnn::mem_lock 
output_ptr(output_mem, get_test_stream()); + + ov::PartialShape expected_shape{1, 1, 32}; + ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape()); + + std::vector expected_result = { 5.f, 12.f, -10.f, -48.f, 9.75f, -14.5f, 32.f, -19.f, + 54.5f, 64.f, -9.f, -37.5f, 12.f, -3.f, 6.5f, -7.5f, + 2.f, 14.5f, -4.5f, -38.f, -3.f, -30.5f, 18.5f, -8.f, + 39.f, 62.f, -20.f, -32.f, 6.25f, -8.f, -3.f, -5.f, }; + + for (size_t i = 0; i < output_ptr.size(); i++) { + ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i; + } + } + void test_compressed_scale_bias(bool is_caching_test) { auto& engine = get_test_engine(); @@ -1229,7 +1298,7 @@ class fully_connected_gpu_tests: public ::testing::Test { ov::PartialShape expected_shape{2, 8}; ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape()); - std::vector expected_result = { + std::vector expected_result = { ov::float16(18), ov::float16(84), ov::float16(-66), ov::float16(-116), ov::float16(19.5), ov::float16(-13.5), ov::float16(30), ov::float16(6), ov::float16(-18), ov::float16(-84), ov::float16(66), ov::float16(116), ov::float16(-19.5), ov::float16(13.5), ov::float16(-30), ov::float16(-6) }; @@ -2679,6 +2748,10 @@ TEST_F(fully_connected_gpu_tests, compressed_scale_bias) { this->test_compressed_scale_bias(false); } +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_zp_bias) { + this->test_compressed_int8_scale_zp_bias(false); +} + TEST_F(fully_connected_gpu_tests, compressed_scale_bias_cached) { this->test_compressed_scale_bias(true); }