From b1e142dfbae11f3102b122e30dd28ec2cdd447c9 Mon Sep 17 00:00:00 2001
From: hyunback kim
Date: Thu, 21 Mar 2024 15:47:35 +0900
Subject: [PATCH] [GPU] In gemm_tiled kernel, use block read when the N and K
 byte sizes are aligned to 4. (#23400)

### Details:
 - *Element-by-element reads are the bottleneck in the gemm_tiled kernel.
   Enable block reads when the N and K byte sizes are aligned to 4, including
   the N and K leftover paths.*
 - *Increasing tile_n_size improves performance when m_size and n_size are
   not shallow and n_size is a multiple of 32.*
 - *Add GEMM_TILE_M/N/K/SIMD environment variables for convenience.*

### Tickets:
 - *134279*

---------

Signed-off-by: hyunback
---
 .../intel_gpu/runtime/debug_configuration.hpp |  2 +-
 .../cl_kernels/gemm_tiled_opt.cl              | 34 ++++++++++++-------
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    | 15 ++++++++
 3 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index a8f8b34b1c5a7f..15450eb6b842bb 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -102,7 +102,7 @@ class debug_configuration {
     int verbose_color; // Print verbose color
     int list_layers; // Print list layers
     int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitimive
-    int print_input_data_shapes;  // Print the input data_shape for benchmark_app.
+    int print_input_data_shapes; // Print the input data_shape for benchmark_app.
     int disable_usm; // Disable usm usage
     int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
     int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index 10f61b1dd15393..e9079c6fb395f3 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -275,10 +275,10 @@ KERNEL(gemm_tiled_opt)(
         else
 #endif // INDIRECT_INPUT1
         {
-    #if TILE_N_NOT_DIVISIBLE
-            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
-    #else
+    #if N_IS_ALIGNED_4BYTE
             b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+    #else
+            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
     #endif
             b_ptr += input1_offset;
         }
@@ -340,11 +340,11 @@ KERNEL(gemm_tiled_opt)(
 #if INDIRECT_INPUT0
             uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
             A_FLOATN a_read = input0[a_idx];
-#elif TILE_K_NOT_DIVISIBLE
-            A_FLOATN a_read = a_ptr[sglid];
-#else // TILE_K_NOT_DIVISIBLE
+#elif K_IS_ALIGNED_4BYTE
             A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);
-#endif // TILE_K_NOT_DIVISIBLE
+#else // K_IS_ALIGNED_4BYTE
+            A_FLOATN a_read = a_ptr[sglid];
+#endif // K_IS_ALIGNED_4BYTE
 #endif // IS_DYNAMIC

             a_ptr += input0_offset;
@@ -486,11 +486,11 @@ KERNEL(gemm_tiled_opt)(
         else
 #endif
         {
-    #if TILE_N_NOT_DIVISIBLE
-            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
-    #else // TILE_N_NOT_DIVISIBLE
+    #if N_IS_ALIGNED_4BYTE
             b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-    #endif // TILE_N_NOT_DIVISIBLE
+    #else // N_IS_ALIGNED_4BYTE
+            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
+    #endif // N_IS_ALIGNED_4BYTE
             b_ptr += input1_offset;
         }
 #elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0
@@ -529,15 +529,23 @@ KERNEL(gemm_tiled_opt)(
         }
 #endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST

+#if !INDIRECT_INPUT0 && K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+        a_ptr = input0 + FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (K_FULL_ITERATIONS * TILE_K));
+#endif
         // Loading leftovers of the matrix A and tile C calculation
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if INDIRECT_INPUT0
             uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table);
+            INPUT0_TYPE a_read = input0[a_idx];
+#else // INDIRECT_INPUT0
+#if K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+            INPUT0_TYPE a_read = BLOCK_READ_A(a_ptr, 0);
+            a_ptr += input0_offset;
 #else
             uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid));
-#endif
             INPUT0_TYPE a_read = input0[a_idx];
-
+#endif
+#endif // INDIRECT_INPUT0
             unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
             }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
index 56a97df8d0d0ab..eb3b9d2d8a1787 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -52,6 +52,7 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
                              (GetOuputSize(params.output_order, output, 'X') * GetOuputSize(params.output_order, output, 'Y'));
         std::vector<size_t> global = { GetOuputSize(params.output_order, output, 'X'), GetOuputSize(params.output_order, output, 'Y'), total_batches };
+        GPU_DEBUG_TRACE_DETAIL << "Draft for global work item size: [" << global[0] << ", " << global[1] << ", " << global[2] << "], " << std::endl;

         dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
         dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
@@ -94,6 +95,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
             tuning_data.tile_k_size = tuning_data.simd_size;
             tuning_data.tile_m_size = tuning_data.simd_size;
         }
+        // Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
+        if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
+            tuning_data.tile_n_size = 32;
+
+        GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
     } else {
         // In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
         // so the tile sizes must be the same as simd_size
@@ -103,6 +109,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
         tuning_data.tile_m_size = tuning_data.simd_size;
     }

+    GPU_DEBUG_LOG << params.layerID << ": tile_m_size: " << tuning_data.tile_m_size
+                  << ", tile_n_size: " << tuning_data.tile_n_size
+                  << ", tile_k_size: " << tuning_data.tile_k_size
+                  << ", simd_size: " << tuning_data.simd_size << std::endl;
+
     return tuning_data;
 }
@@ -212,6 +223,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         auto leftover_m = m_size % tuning_data.tile_m_size;
         auto leftover_n = n_size % tuning_data.tile_n_size;
         auto leftover_k = k_size % tuning_data.tile_k_size;
+        auto n_aligned_4byte = (n_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
+        auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;

         jit.AddConstants({
             MakeJitConstant("M", m_size),
@@ -219,6 +232,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
             MakeJitConstant("N", n_size),
             MakeJitConstant("K_PADDED_IN0", k_size),
             MakeJitConstant("N_PADDED", n_size),
+            MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
+            MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
             MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
             MakeJitConstant("TILE_M", tuning_data.tile_m_size),
             MakeJitConstant("TILE_K", tuning_data.tile_k_size),
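
For readers skimming the diff: the host-side gating this patch adds boils down to two checks, (1) whether the N/K row byte size is a multiple of 4, which drives the `K_IS_ALIGNED_4BYTE`/`N_IS_ALIGNED_4BYTE` JIT constants and lets the kernel take the `BLOCK_READ_A`/`BLOCK_READ_B` paths instead of per-element reads, and (2) whether the GEMM is large enough to widen `tile_n_size` to 32. The sketch below restates that logic as a standalone C++ program. It is illustrative only, not part of the patch; the helper names `is_aligned_4byte`, `pick_tile_n`, and `fp16_bytes` are invented for the example, and the thresholds simply mirror the values added to `SetTuningParams`.

```cpp
#include <cstddef>
#include <cstdio>

// Sketch of the check behind K_IS_ALIGNED_4BYTE / N_IS_ALIGNED_4BYTE: a row of
// `elem_count` elements qualifies for sub-group block reads only if its byte
// size is a multiple of 4 (assumed rationale: block reads move dword-aligned data).
static bool is_aligned_4byte(std::size_t elem_count, std::size_t bytes_per_element) {
    return (elem_count * bytes_per_element) % 4 == 0;
}

// Mirrors the tile_n widening heuristic added to SetTuningParams in this patch.
static std::size_t pick_tile_n(std::size_t m_size, std::size_t n_size,
                               std::size_t simd_size, bool has_fused_ops) {
    if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) &&
        simd_size == 16 && !has_fused_ops)
        return 32;
    return simd_size;  // default: one tile column per sub-group lane
}

int main() {
    const std::size_t fp16_bytes = 2;  // example element size (half precision)
    std::printf("N=32 (fp16) 4-byte aligned: %d\n", is_aligned_4byte(32, fp16_bytes));  // 64 B -> 1
    std::printf("N=33 (fp16) 4-byte aligned: %d\n", is_aligned_4byte(33, fp16_bytes));  // 66 B -> 0
    std::printf("tile_n for 256x256 GEMM: %zu\n", pick_tile_n(256, 256, 16, false));    // widened to 32
    std::printf("tile_n for 64x64 GEMM:   %zu\n", pick_tile_n(64, 64, 16, false));      // stays at 16
    return 0;
}
```

In the kernel itself the same decision is made at compile time through the `K_IS_ALIGNED_4BYTE`/`N_IS_ALIGNED_4BYTE` JIT constants, so the non-aligned fallback paths keep the original per-element (`a_ptr[sglid]` / `b_ptr[sglid]`) reads.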