From 7f4e76694ba501edf2ba69907ec11e52eccb838d Mon Sep 17 00:00:00 2001 From: hyunback kim Date: Wed, 22 May 2024 16:24:13 +0900 Subject: [PATCH] [GPU] gemm_tile supports block read when leftover with 4byte-size align in dynamic. (#24535) * Use block read in the 4-byte-aligned leftover case. This was already done for the static case (https://github.com/openvinotoolkit/openvino/pull/23400); this PR applies it to the dynamic case as well. ### Tickets: - *141032* --------- Signed-off-by: hyunback --- .../cl_kernels/gemm_tiled_opt.cl | 20 +++++++++---------- .../kernels/gemm/gemm_kernel_tiled_opt.cpp | 16 +++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl index 8dc2103fdca5a3..fa30466de60c8c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl @@ -281,13 +281,13 @@ KERNEL(gemm_tiled_opt)( #if B_VEC_SIZE == 1 b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; #else // B_VEC_SIZE == 1 - #if TILE_N_NOT_DIVISIBLE + if (TILE_N_NOT_DIVISIBLE == 0 || N_IS_ALIGNED_4BYTE) + b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); + else { unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) { b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem]; } - #else // TILE_N_NOT_DIVISIBLE - b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); - #endif // TILE_N_NOT_DIVISIBLE + } #endif // B_VEC_SIZE == 1 b_ptr += input1_offset; } @@ -387,7 +387,7 @@ KERNEL(gemm_tiled_opt)( // Loading A tile and tile C calculation #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST - A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0); + A_FLOATN a_read = (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE) ? 
BLOCK_READ_A(a_ptr, 0): a_ptr[sglid]; #endif unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) { #if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST @@ -433,7 +433,7 @@ KERNEL(gemm_tiled_opt)( } #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING // Read A for next dot_id - a_read = (dot_id + 1 < tile_m_iterations) ? TILE_K_NOT_DIVISIBLE ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0) : 0; + a_read = (dot_id + 1 < tile_m_iterations) ? (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE) ? BLOCK_READ_A(a_ptr, 0) : a_ptr[sglid] : 0; #endif #elif TRANSPOSE_INPUT0 == TRANSPOSE_OTHER // TRANSPOSE_INPUT0 #if INDIRECT_INPUT0 @@ -516,13 +516,13 @@ KERNEL(gemm_tiled_opt)( #if B_VEC_SIZE == 1 b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]; #else // B_VEC_SIZE == 1 - #if TILE_N_NOT_DIVISIBLE + if (TILE_N_NOT_DIVISIBLE == 0 || N_IS_ALIGNED_4BYTE) + b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); + else { unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) { b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem]; } - #else - b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0); - #endif // TILE_N_NOT_DIVISIBLE + } #endif // B_VEC_SIZE == 1 b_ptr += input1_offset; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp index b367e40308104d..2e804085939732 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp @@ -155,6 +155,20 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons const std::string not_divisible_n = "(" + leftover_n + "!=0)"; const std::string not_divisible_k = "(" + leftover_k + "!=0)"; const std::string full_iteration_k = "(" + k_size + "/" + std::to_string(tuning_data.tile_k_size) + ")"; + std::string n_aligned_4byte = "0"; + std::string k_aligned_4byte = "0"; + if 
(BytesPerElement(params.inputs[0].GetDType()) == 4 || BytesPerElement(params.inputs[0].GetDType()) == 8) { + n_aligned_4byte = "1"; + k_aligned_4byte = "1"; + } else { + auto bytes_per_element = std::to_string(BytesPerElement(params.inputs[0].GetDType())); + if (n_size.find("shape_info") == std::string::npos) { + n_aligned_4byte = "(" + n_size + "*" + bytes_per_element + " % 4 == 0)"; + } + if (k_size.find("shape_info") == std::string::npos) { + k_aligned_4byte = "(" + k_size + "*" + bytes_per_element + " % 4 == 0)"; + } + } jit.AddConstants({ MakeJitConstant("M", m_size), @@ -162,6 +176,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons MakeJitConstant("N", n_size), MakeJitConstant("K_PADDED_IN0", k_padded_size_in0), MakeJitConstant("N_PADDED", n_padded_size), + MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte), + MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte), MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size), MakeJitConstant("TILE_M", tuning_data.tile_m_size), MakeJitConstant("TILE_K", tuning_data.tile_k_size),