From 7497c4de2d08a45df251454df48c3a7c72554224 Mon Sep 17 00:00:00 2001
From: hyunback
Date: Thu, 16 May 2024 11:51:35 +0900
Subject: [PATCH 1/4] [GPU] gemm_tiled_opt supports block reads when the N
 and K sizes are 4-byte aligned in dynamic shapes.

Signed-off-by: hyunback
---
 .../cl_kernels/gemm_tiled_opt.cl              | 20 +++++++++----------
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    |  5 +++++
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index 8dc2103fdca5a3..f805052ee711cc 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -281,13 +281,13 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
                 b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-    #if TILE_N_NOT_DIVISIBLE
+    #if N_IS_ALIGNED_4BYTE
+                b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+    #else // !N_IS_ALIGNED_4BYTE
                 unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
                     b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
                 }
-    #else // TILE_N_NOT_DIVISIBLE
-                b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-    #endif // TILE_N_NOT_DIVISIBLE
+    #endif // !N_IS_ALIGNED_4BYTE
 #endif // B_VEC_SIZE == 1
                 b_ptr += input1_offset;
             }
@@ -387,7 +387,7 @@ KERNEL(gemm_tiled_opt)(
 
         // Loading A tile and tile C calculation
 #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
-        A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0);
+        A_FLOATN a_read = K_IS_ALIGNED_4BYTE ? BLOCK_READ_A(a_ptr, 0): a_ptr[sglid];
 #endif
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
@@ -433,7 +433,7 @@ KERNEL(gemm_tiled_opt)(
             }
 #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
             // Read A for next dot_id
-            a_read = (dot_id + 1 < tile_m_iterations) ? TILE_K_NOT_DIVISIBLE ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0) : 0;
+            a_read = (dot_id + 1 < tile_m_iterations) ? K_IS_ALIGNED_4BYTE ? BLOCK_READ_A(a_ptr, 0) : a_ptr[sglid] : 0;
 #endif
 #elif TRANSPOSE_INPUT0 == TRANSPOSE_OTHER // TRANSPOSE_INPUT0
 #if INDIRECT_INPUT0
@@ -516,13 +516,13 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
             b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-    #if TILE_N_NOT_DIVISIBLE
+    #if N_IS_ALIGNED_4BYTE
+            b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+    #else
             unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
                 b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
             }
-    #else
-            b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-    #endif // TILE_N_NOT_DIVISIBLE
+    #endif // N_IS_ALIGNED_4BYTE
 #endif // B_VEC_SIZE == 1
             b_ptr += input1_offset;
         }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
index b367e40308104d..5bbc2e357478d1 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -155,6 +155,9 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         const std::string not_divisible_n = "(" + leftover_n + "!=0)";
         const std::string not_divisible_k = "(" + leftover_k + "!=0)";
         const std::string full_iteration_k = "(" + k_size + "/" + std::to_string(tuning_data.tile_k_size) + ")";
+        auto bytes_per_element = std::to_string(BytesPerElement(params.inputs[0].GetDType()));
+        auto n_aligned_4byte = "(" + n_size + "*" + bytes_per_element + " % 4 == 0)";
+        auto k_aligned_4byte = "(" + k_size + "*" + bytes_per_element + " % 4 == 0)";
 
         jit.AddConstants({
             MakeJitConstant("M", m_size),
@@ -162,6 +165,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
             MakeJitConstant("N", n_size),
             MakeJitConstant("K_PADDED_IN0", k_padded_size_in0),
             MakeJitConstant("N_PADDED", n_padded_size),
+            MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
+            MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
             MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
             MakeJitConstant("TILE_M", tuning_data.tile_m_size),
             MakeJitConstant("TILE_K", tuning_data.tile_k_size),
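The K_IS_ALIGNED_4BYTE / N_IS_ALIGNED_4BYTE constants introduced above reduce to "dimension size in bytes is a multiple of 4", which is the condition under which the BLOCK_READ_A / BLOCK_READ_B sub-group block reads stay dword-aligned for every row even when the shape is only known at runtime. A minimal stand-alone C++ sketch of that predicate (the helper name is illustrative, not part of the kernel selector):

#include <cstdio>

// A row of `elems` elements of `bytes_per_element` each can be fetched
// with dword-based sub-group block reads only when its byte size is a
// multiple of 4; otherwise consecutive rows drift off dword boundaries.
static bool aligned_4byte(long elems, long bytes_per_element) {
    return (elems * bytes_per_element) % 4 == 0;
}

int main() {
    std::printf("N=17 fp16 -> %d\n", aligned_4byte(17, 2)); // 0: per-lane fallback
    std::printf("N=16 fp16 -> %d\n", aligned_4byte(16, 2)); // 1: block read
    std::printf("N=17 fp32 -> %d\n", aligned_4byte(17, 4)); // 1: 4-byte types always pass
    return 0;
}

For 4- and 8-byte element types the predicate is trivially true, which PATCH 4/4 below exploits to avoid emitting the check at all.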
From 65f83183591baa2ab40781998bfa294db4d74c98 Mon Sep 17 00:00:00 2001
From: hyunback
Date: Thu, 16 May 2024 20:21:47 +0900
Subject: [PATCH 2/4] Change #if to an if statement on N_IS_ALIGNED_4BYTE for
 the perf check.

Signed-off-by: hyunback
---
 .../cl_kernels/gemm_tiled_opt.cl              | 26 ++++++++++++++-----
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    |  1 +
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index f805052ee711cc..3a35465518154c 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -281,13 +281,20 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
                 b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-    #if N_IS_ALIGNED_4BYTE
+                // #if N_IS_ALIGNED_4BYTE
+                //     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+                // #else // !N_IS_ALIGNED_4BYTE
+                //     unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
+                //         b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
+                //     }
+                // #endif // !N_IS_ALIGNED_4BYTE
+                if (N_IS_ALIGNED_4BYTE)
                     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-    #else // !N_IS_ALIGNED_4BYTE
+                else {
                     unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
                         b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
                     }
-    #endif // !N_IS_ALIGNED_4BYTE
+                }
 #endif // B_VEC_SIZE == 1
                 b_ptr += input1_offset;
             }
@@ -516,13 +523,20 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
             b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-    #if N_IS_ALIGNED_4BYTE
+            // #if N_IS_ALIGNED_4BYTE
+            //     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+            // #else
+            //     unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
+            //         b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
+            //     }
+            // #endif // N_IS_ALIGNED_4BYTE
+            if (N_IS_ALIGNED_4BYTE)
                 b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-    #else
+            else {
                 unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
                     b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
                 }
-    #endif // N_IS_ALIGNED_4BYTE
+            }
 #endif // B_VEC_SIZE == 1
             b_ptr += input1_offset;
         }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
index 5bbc2e357478d1..d1ae2c5ee67b3b 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -147,6 +147,7 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         auto n_size = dims1.dims_sizes[input1_dims[7]];
         auto n_padded_size = "(" + dims1_padded.dims_sizes[input1_dims[7]] + ")";
         auto k_size = dims0.dims_sizes[input0_dims[7]];
+        // GPU_DEBUG_COUT << "m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
         auto k_padded_size_in0 = "(" + dims0_padded.dims_sizes[input0_dims[7]] + ")";
         const std::string leftover_m = "(" + m_size + "%" + std::to_string(tuning_data.tile_m_size) + ")";
         const std::string leftover_n = "(" + n_size + "%" + std::to_string(tuning_data.tile_n_size) + ")";
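The #if-to-if change above matters because, with dynamic shapes, N_IS_ALIGNED_4BYTE is JIT-defined as an expression over runtime shape data, which the OpenCL preprocessor cannot evaluate; a runtime if covers both the static and dynamic definitions, and when the macro does expand to a literal the compiler still prunes the dead branch. A stand-alone C++ sketch of the idea, where n_runtime and the 2-byte fp16 element size are invented stand-ins for the real shape_info plumbing:

#include <cstdio>

// Stand-in for the JIT-defined macro: an expression over a runtime
// value, so "#if N_IS_ALIGNED_4BYTE" could not be evaluated at
// preprocessing time -- only a runtime branch can decide it.
static int n_runtime = 17;
#define N_IS_ALIGNED_4BYTE ((n_runtime * 2) % 4 == 0)

int main() {
    if (N_IS_ALIGNED_4BYTE)
        std::printf("block-read path\n");
    else
        std::printf("per-lane fallback\n"); // taken here: 17 * 2 = 34 bytes, not dword-aligned
    return 0;
}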
From 47ff9037c1c57fd940fe0244532db8628a09aa9d Mon Sep 17 00:00:00 2001
From: hyunback
Date: Thu, 16 May 2024 20:57:53 +0900
Subject: [PATCH 3/4] Remove commented-out code and debug print.

Signed-off-by: hyunback
---
 .../kernel_selector/cl_kernels/gemm_tiled_opt.cl   | 14 --------------
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp         |  1 -
 2 files changed, 15 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index 3a35465518154c..9585b9665b99cb 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -281,13 +281,6 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
                 b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-                // #if N_IS_ALIGNED_4BYTE
-                //     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-                // #else // !N_IS_ALIGNED_4BYTE
-                //     unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
-                //         b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
-                //     }
-                // #endif // !N_IS_ALIGNED_4BYTE
                 if (N_IS_ALIGNED_4BYTE)
                     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
                 else {
@@ -516,13 +516,6 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
             b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-            // #if N_IS_ALIGNED_4BYTE
-            //     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-            // #else
-            //     unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
-            //         b_tile[b_load_id][b_elem] = b_ptr[sglid + SIMD_WIDTH * b_elem];
-            //     }
-            // #endif // N_IS_ALIGNED_4BYTE
             if (N_IS_ALIGNED_4BYTE)
                 b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
             else {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
index d1ae2c5ee67b3b..5bbc2e357478d1 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -147,7 +147,6 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         auto n_size = dims1.dims_sizes[input1_dims[7]];
         auto n_padded_size = "(" + dims1_padded.dims_sizes[input1_dims[7]] + ")";
         auto k_size = dims0.dims_sizes[input0_dims[7]];
-        // GPU_DEBUG_COUT << "m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
         auto k_padded_size_in0 = "(" + dims0_padded.dims_sizes[input0_dims[7]] + ")";
         const std::string leftover_m = "(" + m_size + "%" + std::to_string(tuning_data.tile_m_size) + ")";
         const std::string leftover_n = "(" + n_size + "%" + std::to_string(tuning_data.tile_n_size) + ")";
From 503bc3e1850ad3b34a3ec3956850860f22628ed5 Mon Sep 17 00:00:00 2001
From: hyunback
Date: Wed, 22 May 2024 08:36:38 +0900
Subject: [PATCH 4/4] Fix LLM perf regression.

The additional mul op in the alignment check caused a regression in some
LLMs. Minimize the cost of the 4-byte-alignment check by reusing the
leftover constants.

Signed-off-by: hyunback
---
 .../cl_kernels/gemm_tiled_opt.cl              |  8 ++++----
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    | 17 ++++++++++++++---
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index 9585b9665b99cb..fa30466de60c8c 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -281,7 +281,7 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
                 b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-                if (N_IS_ALIGNED_4BYTE)
+                if (TILE_N_NOT_DIVISIBLE == 0 || N_IS_ALIGNED_4BYTE)
                     b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
                 else {
                     unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
@@ -387,7 +387,7 @@ KERNEL(gemm_tiled_opt)(
 
         // Loading A tile and tile C calculation
 #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
-        A_FLOATN a_read = K_IS_ALIGNED_4BYTE ? BLOCK_READ_A(a_ptr, 0): a_ptr[sglid];
+        A_FLOATN a_read = (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE) ? BLOCK_READ_A(a_ptr, 0): a_ptr[sglid];
 #endif
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
@@ -433,7 +433,7 @@ KERNEL(gemm_tiled_opt)(
             }
 #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
             // Read A for next dot_id
-            a_read = (dot_id + 1 < tile_m_iterations) ? K_IS_ALIGNED_4BYTE ? BLOCK_READ_A(a_ptr, 0) : a_ptr[sglid] : 0;
+            a_read = (dot_id + 1 < tile_m_iterations) ? (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE) ? BLOCK_READ_A(a_ptr, 0) : a_ptr[sglid] : 0;
 #endif
 #elif TRANSPOSE_INPUT0 == TRANSPOSE_OTHER // TRANSPOSE_INPUT0
 #if INDIRECT_INPUT0
@@ -516,7 +516,7 @@ KERNEL(gemm_tiled_opt)(
 #if B_VEC_SIZE == 1
             b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
 #else // B_VEC_SIZE == 1
-            if (N_IS_ALIGNED_4BYTE)
+            if (TILE_N_NOT_DIVISIBLE == 0 || N_IS_ALIGNED_4BYTE)
                 b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
             else {
                 unroll_for (uint b_elem = 0; b_elem < B_VEC_SIZE; ++b_elem) {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
index 5bbc2e357478d1..2e804085939732 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -155,9 +155,20 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         const std::string not_divisible_n = "(" + leftover_n + "!=0)";
         const std::string not_divisible_k = "(" + leftover_k + "!=0)";
         const std::string full_iteration_k = "(" + k_size + "/" + std::to_string(tuning_data.tile_k_size) + ")";
-        auto bytes_per_element = std::to_string(BytesPerElement(params.inputs[0].GetDType()));
-        auto n_aligned_4byte = "(" + n_size + "*" + bytes_per_element + " % 4 == 0)";
-        auto k_aligned_4byte = "(" + k_size + "*" + bytes_per_element + " % 4 == 0)";
+        std::string n_aligned_4byte = "0";
+        std::string k_aligned_4byte = "0";
+        if (BytesPerElement(params.inputs[0].GetDType()) == 4 || BytesPerElement(params.inputs[0].GetDType()) == 8) {
+            n_aligned_4byte = "1";
+            k_aligned_4byte = "1";
+        } else {
+            auto bytes_per_element = std::to_string(BytesPerElement(params.inputs[0].GetDType()));
+            if (n_size.find("shape_info") == std::string::npos) {
+                n_aligned_4byte = "(" + n_size + "*" + bytes_per_element + " % 4 == 0)";
+            }
+            if (k_size.find("shape_info") == std::string::npos) {
+                k_aligned_4byte = "(" + k_size + "*" + bytes_per_element + " % 4 == 0)";
+            }
+        }
 
         jit.AddConstants({
             MakeJitConstant("M", m_size),
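Summing up the final host-side logic: the kernel selector now emits the cheapest predicate that is still safe. A simplified C++ re-statement (not the exact kernel-selector code; the helper name is invented):

#include <cstddef>
#include <iostream>
#include <string>

// Emit "1" for 4/8-byte element types (rows are always dword-aligned),
// "0" for dynamic dims whose size string references shape_info (so the
// JIT-ed kernel never pays the extra runtime multiply that caused the
// regression), and the real alignment expression only for static dims,
// where the OpenCL compiler folds it to a constant.
static std::string aligned_4byte_expr(const std::string& size, std::size_t bytes) {
    if (bytes == 4 || bytes == 8)
        return "1";
    if (size.find("shape_info") != std::string::npos)
        return "0";
    return "(" + size + "*" + std::to_string(bytes) + " % 4 == 0)";
}

int main() {
    std::cout << aligned_4byte_expr("128", 2) << "\n";             // (128*2 % 4 == 0)
    std::cout << aligned_4byte_expr("(shape_info[7])", 2) << "\n"; // 0
    std::cout << aligned_4byte_expr("64", 4) << "\n";              // 1
    return 0;
}

Emitting "0" for dynamic dims does not give up the fast path: the kernel-side conditions were widened to (TILE_N_NOT_DIVISIBLE == 0 || N_IS_ALIGNED_4BYTE) and (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE), so divisible shapes still reach the block-read route through the leftover constants the kernel selector already defines.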