From 46e504d89d9c9f06ce820779b83b08605ae8e454 Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Mon, 1 Apr 2024 19:21:07 +0400
Subject: [PATCH] [GPU] Fix gemm_tiled_opt kernel accuracy for the dynamic
 case with TILE_N=32 and transposed output shape

---
 .../cl_kernels/gemm_tiled_opt.cl              | 27 ++++++++----
 .../kernels/gemm/gemm_kernel_base.cpp         |  8 +++-
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    |  6 +++
 .../tests/unit/test_cases/gemm_gpu_test.cpp   | 42 ++++++++++++++++---
 4 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index c05ca859964ebb..f1e59ed335bd5d 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -162,10 +162,15 @@ KERNEL(gemm_tiled_opt)(
 #ifdef BIAS_TERM
     const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
 #endif // BIAS_TERM
-    uint write_id = 0;
+    uint y_write_id = 0;
+    uint x_write_id = 0;
     const uint batch_offset_output = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X);
-    write_id = 1;
-    const uint batch_offset_output_diff = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X) - batch_offset_output;
+    y_write_id = 1;
+    x_write_id = 0;
+    const uint output_y_pitch = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X) - batch_offset_output;
+    y_write_id = 0;
+    x_write_id = 1;
+    const uint output_x_pitch = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X) - batch_offset_output;
 
     // Start pointers offsets
 #if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
@@ -424,7 +429,7 @@ KERNEL(gemm_tiled_opt)(
 #endif // TILE_K > SIMD_WIDTH
             }
         }
-    #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING 
+    #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
         // Read A for next dot_id
     #if TILE_K_NOT_DIVISIBLE
         a_read = (dot_id + 1 < tile_m_iterations) ? TILE_K_NOT_DIVISIBLE_CALC ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0) : 0;
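Note on the first hunk above: the old code derived a single batch_offset_output_diff, which implicitly assumed output rows advance contiguously, i.e. a non-transposed output layout. The new code probes get_output_index() twice, once with (y_write_id, x_write_id) = (1, 0) and once with (0, 1), and subtracts the base offset, recovering an output_y_pitch and output_x_pitch that stay correct for any output dimension order. The standalone C++ sketch below (hypothetical names, not part of the patch) models why differencing linear indices yields the right pitches for both a plain and a transposed output:

    // Standalone model of deriving output pitches by differencing linear
    // indices, as the patch does via get_output_index() with toggled
    // y_write_id / x_write_id. All names here are illustrative.
    #include <array>
    #include <cstddef>
    #include <iostream>

    // Linear offset into a 2D output stored with arbitrary per-axis strides.
    static std::size_t linear_index(std::size_t y, std::size_t x,
                                    const std::array<std::size_t, 2>& strides) {
        return y * strides[0] + x * strides[1];
    }

    int main() {
        const std::size_t M = 4, N = 32;
        const std::array<std::size_t, 2> row_major{N, 1};  // plain MxN output
        const std::array<std::size_t, 2> transposed{1, M}; // output with X/Y swapped

        for (const auto& strides : {row_major, transposed}) {
            const std::size_t base    = linear_index(0, 0, strides);
            const std::size_t y_pitch = linear_index(1, 0, strides) - base; // like output_y_pitch
            const std::size_t x_pitch = linear_index(0, 1, strides) - base; // like output_x_pitch
            std::cout << "y_pitch=" << y_pitch << " x_pitch=" << x_pitch << "\n";
        }
        return 0;
    }

For the row-major case this prints y_pitch=32 x_pitch=1; for the transposed case it prints y_pitch=1 x_pitch=4, which is exactly the scaling the x_pitch-aware stores in the next hunks rely on.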
@@ -732,7 +737,13 @@ KERNEL(gemm_tiled_opt)(
 #endif // HAS_FUSED_OPS
                 }
 #else
-                OUTPUT_TYPE* d_ptr_tmp = d_ptr + sglid;
+#if TRANSPOSE_OUTPUT == TRANSPOSE_X_LAST
+                const uint x_pitch = 1;
+#else
+                const uint x_pitch = output_x_pitch;
+#endif
+                OUTPUT_TYPE* d_ptr_tmp = d_ptr + sglid * x_pitch;
+
 #ifdef BIAS_TERM
                 ACCUMULATOR_TYPE_VEC dequantized = (ACCUMULATOR_TYPE_VEC)(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
 #else // BIAS_TERM
@@ -743,13 +754,13 @@ KERNEL(gemm_tiled_opt)(
                 OUTPUT_TYPE_VEC result = FUSED_OPS_RESULT_VEC;
                 unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
                     if (b_raw_global_id + SIMD_WIDTH * n_elem < N) {
-                        *(d_ptr_tmp + SIMD_WIDTH * n_elem) = result[n_elem];
+                        *(d_ptr_tmp + SIMD_WIDTH * n_elem * x_pitch) = result[n_elem];
                     }
                 }
 #else
                 unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
                     if (b_raw_global_id + SIMD_WIDTH * n_elem < N) {
-                        *(d_ptr_tmp + SIMD_WIDTH * n_elem) = dequantized[n_elem];
+                        *(d_ptr_tmp + SIMD_WIDTH * n_elem * x_pitch) = dequantized[n_elem];
                     }
                 }
 #endif // HAS_FUSED_OPS
@@ -796,7 +807,7 @@ KERNEL(gemm_tiled_opt)(
 #endif // HAS_FUSED_OPS
 #endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
 #endif // IS_DYNAMIC
-        d_ptr += batch_offset_output_diff;
+        d_ptr += output_y_pitch;
 #ifdef BIAS_TERM
         c_ptr += N;
 #endif // BIAS_TERM
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp
index cb59cfee015e96..44eba6cfbc59eb 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_base.cpp
@@ -114,13 +114,17 @@ std::vector<int64_t> GemmKernelBase::GetTransposedDims(const std::vector<int64_t>&
+    bool transpose_output = params.output_order.size() > 0 && (params.output_order.back() != (static_cast<int64_t>(params.output_order.size()) - 1));
+    if (transpose_output)
+        jit.AddConstant(MakeJitConstant("TRANSPOSE_OUTPUT", 2 /* set as TRANSPOSE_OTHER */));
+    else
+        jit.AddConstant(MakeJitConstant("TRANSPOSE_OUTPUT", 0 /* set as TRANSPOSE_X_LAST */));
+
     bool has_dynamic_k_padding = params.transpose_input0 ? params.inputs[0].Y().pad.is_dynamic : params.inputs[0].X().pad.is_dynamic;
     bool has_dynamic_n_padding = params.transpose_input1 ? params.inputs[1].Y().pad.is_dynamic : params.inputs[1].X().pad.is_dynamic;
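The host-side change above drives the new kernel path: the output is treated as transposed whenever the last entry of output_order no longer names the innermost dimension, and the kernel is compiled with TRANSPOSE_OUTPUT set accordingly. A minimal sketch of that predicate follows; the function name is illustrative, and the constants assume the same 0/2 encoding the patch uses for TRANSPOSE_X_LAST and TRANSPOSE_OTHER:

    // Minimal model of the host-side output-order classification.
    #include <cstdint>
    #include <vector>

    enum : std::int64_t { TRANSPOSE_X_LAST = 0, TRANSPOSE_OTHER = 2 };

    std::int64_t classify_output_order(const std::vector<std::int64_t>& output_order) {
        // Transposed output: the innermost (last) dimension has been permuted away.
        const bool transposed = !output_order.empty() &&
            output_order.back() != static_cast<std::int64_t>(output_order.size()) - 1;
        return transposed ? TRANSPOSE_OTHER : TRANSPOSE_X_LAST;
    }

Under this model, classify_output_order({0, 1, 3, 2}) returns TRANSPOSE_OTHER, matching the new test added below, while an empty order or {0, 1, 2, 3} keeps the existing TRANSPOSE_X_LAST store path with x_pitch = 1.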
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
index b06a95f06ad295..de1f67ce0d677d 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
@@ -927,7 +927,7 @@ class gemm_gpu_tests: public ::testing::Test {
         ov::Shape ref_input1_broadcasted_shape;
         ov::Shape ref_input1_shape;
         ov::Shape ref_output_shape;
-        
+
         ref_input0_shape = { BATCH_SIZE, 16, M_SIZE, K_SIZE };
         ref_input1_broadcasted_shape = { N_SIZE, BATCH_SIZE, 16, K_SIZE };
         ref_input1_shape = { BATCH_SIZE, 16, K_SIZE, N_SIZE };
@@ -1063,7 +1063,7 @@ class gemm_gpu_tests: public ::testing::Test {
         ov::Shape ref_input1_reshaped_shape;
         ov::Shape ref_input1_shape;
         ov::Shape ref_output_shape;
-        
+
         ref_input0_shape = { BATCH_SIZE, 32, M_SIZE, K_SIZE };
         ref_input1_broadcasted_shape = { N_SIZE, BATCH_SIZE, 2, 16, K_SIZE };
         ref_input1_reshaped_shape = { N_SIZE, BATCH_SIZE, 32, K_SIZE };
@@ -1313,16 +1313,22 @@ class gemm_gpu_tests: public ::testing::Test {
             output_shape_default = { M_SIZE, N_SIZE };
         } else if (num_dims == 3) {
             input0_shape_default = { BATCH_SIZE, M_SIZE, K_SIZE };
-            input1_shape_default = { BATCH_SIZE, K_SIZE, N_SIZE }; 
+            input1_shape_default = { BATCH_SIZE, K_SIZE, N_SIZE };
             output_shape_default = { BATCH_SIZE, M_SIZE, N_SIZE };
         } else if (num_dims == 4) {
             input0_shape_default = { BATCH_SIZE, 1, M_SIZE, K_SIZE};
-            input1_shape_default = { BATCH_SIZE, 1, K_SIZE, N_SIZE}; 
+            input1_shape_default = { BATCH_SIZE, 1, K_SIZE, N_SIZE};
             output_shape_default = { BATCH_SIZE, 1, M_SIZE, N_SIZE };
         }
     }
 
-    void test_transpose_matmul_f32(size_t num_dims, bool is_input_dynamic, bool is_caching_test, std::vector<size_t> BMKN, std::vector<int64_t> input0_order, std::vector<int64_t> input1_order) {
+    void test_transpose_matmul_f32(size_t num_dims,
+                                   bool is_input_dynamic,
+                                   bool is_caching_test,
+                                   std::vector<size_t> BMKN,
+                                   std::vector<int64_t> input0_order,
+                                   std::vector<int64_t> input1_order,
+                                   std::vector<int64_t> output_order = {}) {
         tests::random_generator rg;
         rg.set_seed(GET_SUITE_NAME);
@@ -1337,6 +1343,7 @@ class gemm_gpu_tests: public ::testing::Test {
         set_default_shapes(num_dims, BMKN, input0_shape_default, input1_shape_default, output_shape_default);
         ov::Shape input0_shape(input0_shape_default.size());
         ov::Shape input1_shape(input1_shape_default.size());
+        ov::Shape output_shape(output_shape_default.size());
 
         for (size_t dim = 0; dim < input0_shape_default.size(); ++dim) {
             input0_shape[input0_order[dim]] = input0_shape_default[dim];
@@ -1346,6 +1353,12 @@ class gemm_gpu_tests: public ::testing::Test {
             input1_shape[input1_order[dim]] = input1_shape_default[dim];
         }
 
+        if (!output_order.empty()) {
+            for (size_t dim = 0; dim < output_shape_default.size(); ++dim) {
+                output_shape[output_order[dim]] = output_shape_default[dim];
+            }
+        }
+
         if (is_input_dynamic) {
             input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f32, format::bfyx};
             input1_layout = layout{ov::PartialShape::dynamic(input1_shape.size()), data_types::f32, format::bfyx};
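The helper scatters each canonical dimension to the position given by the order vector, for the inputs as before and now for the output as well. A hedged standalone sketch of that mapping (the function name is illustrative; the convention is the one used by the test code above):

    // Stand-in for the shape permutation in the test helper:
    // canonical dimension d lands at position order[d] of the permuted shape.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::size_t> permute_shape(const std::vector<std::size_t>& shape_default,
                                           const std::vector<std::int64_t>& order) {
        std::vector<std::size_t> shape(shape_default.size());
        for (std::size_t dim = 0; dim < shape_default.size(); ++dim)
            shape[order[dim]] = shape_default[dim];
        return shape;
    }

For the new test's canonical 4D output {1, 1, 128, 9} and output_order {0, 1, 3, 2}, this yields the transposed shape {1, 1, 9, 128}.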
@@ -1366,7 +1379,7 @@ class gemm_gpu_tests: public ::testing::Test {
         topology topology;
         topology.add(input_layout("input0", input0_layout),
                      input_layout("input1", input1_layout),
-                     gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, {}, {}, {}, input0_order, input1_order)
+                     gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, {}, {}, {}, input0_order, input1_order, output_order)
         );
 
         ExecutionConfig config = get_test_default_config(engine);
@@ -1415,6 +1428,19 @@ class gemm_gpu_tests: public ::testing::Test {
                                       false,
                                       false);
 
+        if (!output_order.empty()) {
+            std::vector<float> out_data_transposed(ov::shape_size(output_shape_default));
+
+            ov::reference::transpose((const char *)(ref_out_data.data()),
+                                     (char *)(out_data_transposed.data()),
+                                     output_shape_default,
+                                     sizeof(float),
+                                     output_order,
+                                     output_shape);
+
+            ref_out_data = out_data_transposed;
+        }
+
         ASSERT_EQ(output_ptr.size(), ref_out_data.size());
 
         const auto abs_error = 0.0001;
@@ -1614,6 +1640,10 @@ class gemm_gpu_tests: public ::testing::Test {
 TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f32) {
     this->test_transpose_matmul_f32(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
 }
 
+TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f32_n_tile_32_output_ylast) {
+    this->test_transpose_matmul_f32(4, true, false, /*BMKN*/{1, 128, 1, 9}, /*input0_order*/{0, 1, 2, 3}, /*input1_order*/{0, 1, 2, 3}, /*output_order*/{0, 1, 3, 2});
+}
+
 TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f16) {
     this->test_transpose_matmul_f16(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
 }
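Because the GPU now produces the output in the transposed layout, the test transposes the dense reference result with ov::reference::transpose before the element-wise comparison. The sketch below is a simplified stand-in for that call (not OpenVINO code), using the same convention as the shape helper: source dimension d maps to destination dimension order[d].

    // Simplified reference transpose; assumes rank >= 1 and row-major storage.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void transpose_reference(const std::vector<float>& src, std::vector<float>& dst,
                             const std::vector<std::size_t>& src_shape,
                             const std::vector<std::int64_t>& order) {
        const std::size_t rank = src_shape.size();
        if (rank == 0) return;
        dst.resize(src.size());

        // Shape and row-major strides of the permuted destination.
        std::vector<std::size_t> dst_shape(rank);
        for (std::size_t d = 0; d < rank; ++d)
            dst_shape[order[d]] = src_shape[d];
        std::vector<std::size_t> dst_strides(rank, 1);
        for (std::size_t d = rank - 1; d > 0; --d)
            dst_strides[d - 1] = dst_strides[d] * dst_shape[d];

        std::vector<std::size_t> idx(rank, 0); // row-major multi-index into src
        for (std::size_t linear = 0; linear < src.size(); ++linear) {
            std::size_t dst_linear = 0;
            for (std::size_t d = 0; d < rank; ++d)
                dst_linear += idx[d] * dst_strides[order[d]]; // scatter source axis d
            dst[dst_linear] = src[linear];
            for (std::size_t d = rank; d-- > 0;) { // advance the multi-index
                if (++idx[d] < src_shape[d]) break;
                idx[d] = 0;
            }
        }
    }

With src_shape = {1, 1, 128, 9} and order = {0, 1, 3, 2}, the transposed reference lines up element for element with the kernel's Y-last output, so the existing ASSERT_NEAR loop can be reused unchanged.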