From 0924d66d48086d997341be4035956a44c0e00306 Mon Sep 17 00:00:00 2001
From: hyunback kim
Date: Thu, 21 Mar 2024 15:47:35 +0900
Subject: [PATCH] [GPU] In gemm_tile_kernel, use block read when the N and K
 byte sizes are aligned to 4. (#23400)

### Details:
- *Element-by-element reads are the bottleneck in the gemm_tiled kernel.
  Enable block reads when the N and K sizes are 4-byte aligned, including the
  N and K leftover paths.*
- *Increasing tile_n_size improves performance when m_size and n_size are not
  shallow and n_size is aligned at 32.*
- *Add GEMM_TILE_M/N/K/SIMD environment variables for convenience.*

### Tickets:
- *134279*

---------

Signed-off-by: hyunback
---
 .../intel_gpu/runtime/debug_configuration.hpp |  2 +-
 .../cl_kernels/gemm_tiled_opt.cl              | 34 ++++++++++++-------
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    | 15 ++++++++
 3 files changed, 37 insertions(+), 14 deletions(-)
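The alignment condition behind the first bullet reduces to checking whether N (or K) elements span a whole number of 4-byte words, the same `(size * BytesPerElement(...)) % 4 == 0` expression added to `GetJitConstants` in the diff below. A minimal standalone sketch of that predicate; the helper name `is_aligned_4byte` and the hard-coded element sizes are illustrative, not part of the patch:

```cpp
#include <cstddef>
#include <cstdio>

// Hedged sketch: true when `count` elements of `element_size_bytes` bytes
// occupy a multiple of 4 bytes, mirroring the N_IS_ALIGNED_4BYTE /
// K_IS_ALIGNED_4BYTE conditions emitted by the kernel selector.
static bool is_aligned_4byte(std::size_t count, std::size_t element_size_bytes) {
    return (count * element_size_bytes) % 4 == 0;
}

int main() {
    // FP16 (2 bytes): any even N or K qualifies for the block-read path.
    std::printf("N=1024 fp16 -> %d\n", is_aligned_4byte(1024, 2)); // 1 (aligned)
    std::printf("N=1023 fp16 -> %d\n", is_aligned_4byte(1023, 2)); // 0 (per-element fallback)
    // FP32 (4 bytes): the condition always holds.
    std::printf("K=77   fp32 -> %d\n", is_aligned_4byte(77, 4));   // 1
    return 0;
}
```

In practice this means FP16 inputs need even N and K, FP32 inputs always qualify, and 1-byte types need sizes that are multiples of 4.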
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index a8f8b34b1c5a7f..15450eb6b842bb 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -102,7 +102,7 @@ class debug_configuration {
     int verbose_color;                          // Print verbose color
     int list_layers;                            // Print list layers
     int print_multi_kernel_perf;                // Print execution time of each kernel in multi-kernel primitimive
-    int print_input_data_shapes;              // Print the input data_shape for benchmark_app.
+    int print_input_data_shapes;                // Print the input data_shape for benchmark_app.
     int disable_usm;                            // Disable usm usage
     int disable_onednn;                         // Disable onednn for discrete GPU (no effect for integrated GPU)
     int disable_onednn_opt_post_ops;            // Disable onednn optimize post operators
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index 10f61b1dd15393..e9079c6fb395f3 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -275,10 +275,10 @@ KERNEL(gemm_tiled_opt)(
         else
 #endif // INDIRECT_INPUT1
         {
-    #if TILE_N_NOT_DIVISIBLE
-            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
-    #else
+    #if N_IS_ALIGNED_4BYTE
             b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+    #else
+            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
     #endif
             b_ptr += input1_offset;
         }
@@ -340,11 +340,11 @@ KERNEL(gemm_tiled_opt)(
 #if INDIRECT_INPUT0
             uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
             A_FLOATN a_read = input0[a_idx];
-#elif TILE_K_NOT_DIVISIBLE
-            A_FLOATN a_read = a_ptr[sglid];
-#else // TILE_K_NOT_DIVISIBLE
+#elif K_IS_ALIGNED_4BYTE
             A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);
-#endif // TILE_K_NOT_DIVISIBLE
+#else // K_IS_ALIGNED_4BYTE
+            A_FLOATN a_read = a_ptr[sglid];
+#endif // K_IS_ALIGNED_4BYTE
 #endif // IS_DYNAMIC

             a_ptr += input0_offset;
@@ -486,11 +486,11 @@ KERNEL(gemm_tiled_opt)(
             else
 #endif
             {
-    #if TILE_N_NOT_DIVISIBLE
-                b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
-    #else // TILE_N_NOT_DIVISIBLE
+    #if N_IS_ALIGNED_4BYTE
                 b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
-    #endif // TILE_N_NOT_DIVISIBLE
+    #else // N_IS_ALIGNED_4BYTE
+                b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
+    #endif // N_IS_ALIGNED_4BYTE
                 b_ptr += input1_offset;
             }
 #elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0
@@ -529,15 +529,23 @@ KERNEL(gemm_tiled_opt)(
         }
 #endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST

+#if !INDIRECT_INPUT0 && K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+    a_ptr = input0 + FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (K_FULL_ITERATIONS * TILE_K));
+#endif
     // Loading leftovers of the matrix A and tile C calculation
     unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if INDIRECT_INPUT0
         uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table);
+        INPUT0_TYPE a_read = input0[a_idx];
+#else // INDIRECT_INPUT0
+#if K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+        INPUT0_TYPE a_read = BLOCK_READ_A(a_ptr, 0);
+        a_ptr += input0_offset;
 #else
         uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid));
-#endif
         INPUT0_TYPE a_read = input0[a_idx];
-
+#endif
+#endif // INDIRECT_INPUT0
         unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
             c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
         }
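In the kernel hunks above, the aligned path replaces per-lane scalar loads with sub-group block reads (`BLOCK_READ_A` / `BLOCK_READ_B`), while the unaligned path keeps the bounds-checked per-element load. Below is a rough scalar model of the two access patterns, written in C++ purely for illustration; the SIMD width, helper names, and host-side containers are assumptions, not the OpenCL built-ins used by the kernel:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

constexpr std::size_t SIMD = 16;  // sub-group size this kernel typically uses

// Model of a sub-group block read: the sub-group fetches SIMD contiguous
// elements starting at `base` in one coalesced transaction; lane i gets base[i].
// The real block-read built-ins need a suitably aligned address, hence the
// K_IS_ALIGNED_4BYTE / N_IS_ALIGNED_4BYTE guards in the kernel.
template <typename T>
std::array<T, SIMD> block_read_model(const T* base) {
    std::array<T, SIMD> out{};
    for (std::size_t lane = 0; lane < SIMD; ++lane)
        out[lane] = base[lane];
    return out;
}

// Model of the fallback path: each lane issues its own bounds-checked scalar
// load, mirroring `b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]`.
template <typename T>
std::array<T, SIMD> element_read_model(const std::vector<T>& row,
                                       std::size_t col_offset, std::size_t n) {
    std::array<T, SIMD> out{};
    for (std::size_t lane = 0; lane < SIMD; ++lane) {
        const std::size_t col = col_offset + lane;
        out[lane] = (col < n) ? row[col] : T(0);  // leftover lanes are zeroed
    }
    return out;
}

int main() {
    std::vector<float> row(20);
    std::iota(row.begin(), row.end(), 0.0f);
    auto blocked  = block_read_model(row.data());     // one contiguous fetch of 16 values
    auto fallback = element_read_model(row, 16, 20);  // lanes 4..15 fall past N and read 0
    std::printf("blocked[0]=%.0f fallback[3]=%.0f fallback[4]=%.0f\n",
                blocked[0], fallback[3], fallback[4]);
    return 0;
}
```

On the aligned path the whole sub-group issues a single contiguous fetch; on the fallback path every lane loads independently and out-of-range lanes are zero-filled, which is the element-by-element bottleneck the description refers to.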
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
index 56a97df8d0d0ab..eb3b9d2d8a1787 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -52,6 +52,7 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
                              (GetOuputSize(params.output_order, output, 'X') * GetOuputSize(params.output_order, output, 'Y'));
         std::vector<size_t> global = { GetOuputSize(params.output_order, output, 'X'), GetOuputSize(params.output_order, output, 'Y'), total_batches };
+        GPU_DEBUG_TRACE_DETAIL << "Draft for global work item size: [" << global[0] << ", " << global[1] << ", " << global[2] << "], " << std::endl;

         dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
         dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
@@ -94,6 +95,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
             tuning_data.tile_k_size = tuning_data.simd_size;
             tuning_data.tile_m_size = tuning_data.simd_size;
         }
+        // Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
+        if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
+            tuning_data.tile_n_size = 32;
+
+        GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
     } else {
         // In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
         // so the tile sizes must be the same as simd_size
@@ -103,6 +109,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
         tuning_data.tile_m_size = tuning_data.simd_size;
     }

+    GPU_DEBUG_LOG << params.layerID << ": tile_m_size: " << tuning_data.tile_m_size
+                  << ", tile_n_size: " << tuning_data.tile_n_size
+                  << ", tile_k_size: " << tuning_data.tile_k_size
+                  << ", simd_size: " << tuning_data.simd_size << std::endl;
+
     return tuning_data;
 }

@@ -212,6 +223,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         auto leftover_m = m_size % tuning_data.tile_m_size;
         auto leftover_n = n_size % tuning_data.tile_n_size;
         auto leftover_k = k_size % tuning_data.tile_k_size;
+        auto n_aligned_4byte = (n_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
+        auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;

         jit.AddConstants({
             MakeJitConstant("M", m_size),
@@ -219,6 +232,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
             MakeJitConstant("N", n_size),
             MakeJitConstant("K_PADDED_IN0", k_size),
             MakeJitConstant("N_PADDED", n_size),
+            MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
+            MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
             MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
             MakeJitConstant("TILE_M", tuning_data.tile_m_size),
             MakeJitConstant("TILE_K", tuning_data.tile_k_size),
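The tuning change above only triggers for large, 32-aligned GEMMs, and its effect on the dispatch size follows from the `gws[0]` formula in `SetDefault`. A small sketch that replays that arithmetic; `align_up` and `gws_x` are illustrative stand-ins for the kernel selector's `Align()` and the gws computation, and the sample sizes are made up:

```cpp
#include <cstddef>
#include <cstdio>

// Round v up to the next multiple of a, like the kernel selector's Align().
static std::size_t align_up(std::size_t v, std::size_t a) { return (v + a - 1) / a * a; }

// gws[0] from GemmKernelTiledOpt::SetDefault: Align(N, tile_n) / (tile_n / simd).
static std::size_t gws_x(std::size_t n, std::size_t tile_n, std::size_t simd) {
    return align_up(n, tile_n) / (tile_n / simd);
}

int main() {
    const std::size_t n = 4096, simd = 16;
    // Default tiling: tile_n == simd == 16, one work-item per output column.
    std::printf("tile_n=16 -> gws[0]=%zu\n", gws_x(n, 16, simd)); // 4096
    // Heuristic from this patch: m,n >= 128 and n aligned at 32 -> tile_n = 32.
    std::printf("tile_n=32 -> gws[0]=%zu\n", gws_x(n, 32, simd)); // 2048
    return 0;
}
```

With tile_n_size = 32 each 16-wide sub-group covers 32 output columns, so the X dimension of the dispatch halves while the per-work-item work doubles.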