[GPU] In gemm_tile_kernel, use block read when the N and K byte sizes are 4-byte aligned. #23400

Merged · 9 commits · Mar 21, 2024
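For context: a sub-group block read transfers data in 4-byte (dword) units, so it is legal whenever a row's total byte size is a multiple of 4, even when N or K is not divisible by the tile size. This PR therefore gates the vectorized load on byte alignment (new N_IS_ALIGNED_4BYTE / K_IS_ALIGNED_4BYTE macros) instead of on tile divisibility. A minimal C++ sketch of the predicate, mirroring the n_aligned_4byte / k_aligned_4byte checks added in GetJitConstants below (the helper name is ours):

#include <cstddef>

// A block read moves dwords, so a row qualifies only when its byte size is a
// multiple of 4: f32 rows always qualify, f16 rows need an even element count.
static bool is_aligned_4byte(std::size_t elem_count, std::size_t bytes_per_elem) {
    return (elem_count * bytes_per_elem) % 4 == 0;
}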
@@ -102,7 +102,7 @@ class debug_configuration {
int verbose_color; // Print verbose color
int list_layers; // Print list layers
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
- int print_input_data_shapes; // Print the input data_shape for benchmark_app.
+ int print_input_data_shapes;          // Print the input data_shape for benchmark_app.
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
@@ -134,6 +134,7 @@ class debug_configuration {
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
int disable_runtime_skip_reorder; // Disable runtime skip reorder
int disable_primitive_fusing; // Disable primitive fusing
+ int env_var; // Enable environment variable
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
static const debug_configuration *get_instance();
@@ -143,6 +144,7 @@ class debug_configuration {
bool is_layer_for_dumping(const std::string& layerName, bool is_output = false, bool is_input = false) const;
bool is_target_iteration(int64_t iteration) const;
std::string get_matched_from_filelist(const std::vector<std::string>& file_names, std::string pattern) const;
+ bool get_env(std::string key, int &val) const;

struct memory_preallocation_params {
bool is_initialized = false;
@@ -275,10 +275,10 @@ KERNEL(gemm_tiled_opt)(
else
#endif // INDIRECT_INPUT1
{
- #if TILE_N_NOT_DIVISIBLE
- b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
- #else
+ #if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+ #else
+ b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#endif
b_ptr += input1_offset;
}
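The restructured guard above takes the block-read path whenever N is 4-byte aligned, and the per-lane load with its N-bounds check becomes the fallback. A hedged host-side emulation of the two paths for one 16-lane sub-group (plain C++, not the kernel code; BLOCK_READ_B is modeled as a contiguous copy):

#include <cstring>

constexpr unsigned kSimd = 16;  // sub-group width assumed by this sketch

// out[lane] mirrors b_tile[b_load_id] for each lane of the sub-group.
void load_b_row(const float* b_ptr, unsigned base_col, unsigned N,
                bool n_is_aligned_4byte, float out[kSimd]) {
    if (n_is_aligned_4byte) {
        std::memcpy(out, b_ptr, kSimd * sizeof(float));   // BLOCK_READ_B analogue
    } else {
        for (unsigned sglid = 0; sglid < kSimd; ++sglid)  // guarded scalar fallback
            out[sglid] = (base_col + sglid > N - 1) ? 0.0f : b_ptr[sglid];
    }
}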
@@ -340,11 +340,11 @@ KERNEL(gemm_tiled_opt)(
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
A_FLOATN a_read = input0[a_idx];
- #elif TILE_K_NOT_DIVISIBLE
- A_FLOATN a_read = a_ptr[sglid];
- #else // TILE_K_NOT_DIVISIBLE
+ #elif K_IS_ALIGNED_4BYTE
A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);
- #endif // TILE_K_NOT_DIVISIBLE
+ #else // K_IS_ALIGNED_4BYTE
+ A_FLOATN a_read = a_ptr[sglid];
+ #endif // K_IS_ALIGNED_4BYTE
#endif // IS_DYNAMIC
a_ptr += input0_offset;

@@ -486,11 +486,11 @@ KERNEL(gemm_tiled_opt)(
else
#endif
{
- #if TILE_N_NOT_DIVISIBLE
- b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
- #else // TILE_N_NOT_DIVISIBLE
+ #if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
- #endif // TILE_N_NOT_DIVISIBLE
+ #else // N_IS_ALIGNED_4BYTE
+ b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
+ #endif // N_IS_ALIGNED_4BYTE
b_ptr += input1_offset;
}
#elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0
@@ -529,15 +529,23 @@ KERNEL(gemm_tiled_opt)(
}
#endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST

+ #if !INDIRECT_INPUT0 && K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+ a_ptr = input0 + FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (K_FULL_ITERATIONS * TILE_K));
+ #endif
// Loading leftovers of the matrix A and tile C calculation
unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table);
+ INPUT0_TYPE a_read = input0[a_idx];
#else // INDIRECT_INPUT0
+ #if K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+ INPUT0_TYPE a_read = BLOCK_READ_A(a_ptr, 0);
+ a_ptr += input0_offset;
+ #else
uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid));
- #endif
INPUT0_TYPE a_read = input0[a_idx];
-
+ #endif
+ #endif // INDIRECT_INPUT0
unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
}
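One subtlety in the leftover hunk above: the new block-read branch consumes A through a moving pointer rather than a per-lane computed index, so a_ptr is first re-pointed at column K_FULL_ITERATIONS * TILE_K (the first leftover column) and then advanced by input0_offset per row. A small sketch of that bookkeeping under an assumed plain row-major layout (the kernel's real indexing goes through get_input0_index):

#include <cstddef>

// First element the leftover loop must read for a given row, after
// k_full_iterations full TILE_K-wide steps have been consumed.
std::size_t leftover_start(std::size_t row, std::size_t K,
                           std::size_t tile_k, std::size_t k_full_iterations) {
    return row * K + k_full_iterations * tile_k;
}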
@@ -52,6 +52,7 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
(GetOuputSize(params.output_order, output, 'X') * GetOuputSize(params.output_order, output, 'Y'));
std::vector<size_t> global = { GetOuputSize(params.output_order, output, 'X'), GetOuputSize(params.output_order, output, 'Y'),
total_batches };
+ GPU_DEBUG_LOG << "[" << global[0] << ", " << global[1] << ", " << global[2] << "], " << std::endl;
Contributor:
I think this is too internal information. If you need it, it would be better to use a more detailed log level with a proper explanation, e.g.
GPU_DEBUG_TRACE_DETAIL << "Draft for global work item size: " << ....

Contributor (Author):
Applied


dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
@@ -60,6 +61,9 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
dispatchData.lws[0] = td.simd_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;

+ GPU_DEBUG_LOG << "gws: [" << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << "], "
+               << "lws: [" << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << "] " << std::endl;
}
return dispatchData;
}
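To make the new gws/lws log line concrete, here is the same arithmetic as a standalone C++ sketch with hypothetical sizes (numbers ours, not from the PR):

#include <cstddef>
#include <iostream>

// Same role as the Align() helper used above: round x up to a multiple of a.
static std::size_t align_up(std::size_t x, std::size_t a) { return (x + a - 1) / a * a; }

int main() {
    std::size_t n = 320, m = 200;                     // hypothetical output size
    std::size_t tile_n = 32, tile_m = 16, simd = 16;  // hypothetical tuning data
    std::size_t gws0 = align_up(n, tile_n) / (tile_n / simd);  // 320 / 2  = 160
    std::size_t gws1 = align_up(m, tile_m) / tile_m;           // 208 / 16 = 13
    std::cout << "gws: [" << gws0 << ", " << gws1 << ", 1], "
              << "lws: [" << simd << ", 1, 1]" << std::endl;
}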
@@ -94,6 +98,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
}
+ // Increasing tile_n_size improves performance when m_size and n_size are not shallow and n_size is aligned to 32.
+ if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
+     tuning_data.tile_n_size = 32;
+
+ GPU_DEBUG_LOG << "m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
} else {
// In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
// so the tile sizes must be the same as simd_size
@@ -103,6 +112,24 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_m_size = tuning_data.simd_size;
}

+ GPU_DEBUG_GET_INSTANCE(debug_config);
+ GPU_DEBUG_IF(debug_config->env_var) {
+     int val;
+     if (debug_config->get_env("GEMM_TILE_M", val))
+         tuning_data.tile_m_size = val;
+     if (debug_config->get_env("GEMM_TILE_N", val))
+         tuning_data.tile_n_size = val;
+     if (debug_config->get_env("GEMM_TILE_K", val))
+         tuning_data.tile_k_size = val;
+     if (debug_config->get_env("GEMM_TILE_SIMD", val))
+         tuning_data.simd_size = val;
+ }
+
+ GPU_DEBUG_LOG << "tile_m_size: " << tuning_data.tile_m_size
+               << ", tile_n_size: " << tuning_data.tile_n_size
+               << ", tile_k_size: " << tuning_data.tile_k_size
+               << ", simd_size: " << tuning_data.simd_size << std::endl;
return tuning_data;
}
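The override block above means that, when the OV_GPU_EnvVar debug option is enabled, any of GEMM_TILE_M / GEMM_TILE_N / GEMM_TILE_K / GEMM_TILE_SIMD present in the environment replaces the tuned value. A hedged standalone emulation of that flow (get_env_int stands in for debug_configuration::get_env):

#include <cstdlib>
#include <iostream>

// Emulates get_env(): parse an int from the environment; leave val unchanged
// when the variable is absent. Note std::atoi yields 0 for non-numeric input.
static bool get_env_int(const char* key, int& val) {
    if (const char* s = std::getenv(key)) {
        val = std::atoi(s);
        return true;
    }
    return false;
}

int main() {
    int tile_m = 16, tile_n = 16, tile_k = 16, simd = 16;  // hypothetical tuned defaults
    get_env_int("GEMM_TILE_M", tile_m);
    get_env_int("GEMM_TILE_N", tile_n);
    get_env_int("GEMM_TILE_K", tile_k);
    get_env_int("GEMM_TILE_SIMD", simd);
    std::cout << "tile_m=" << tile_m << ", tile_n=" << tile_n
              << ", tile_k=" << tile_k << ", simd=" << simd << std::endl;
}

Running it with GEMM_TILE_N=32 in the environment prints tile_n=32 while the other values keep their defaults.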

@@ -212,13 +239,17 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
auto leftover_m = m_size % tuning_data.tile_m_size;
auto leftover_n = n_size % tuning_data.tile_n_size;
auto leftover_k = k_size % tuning_data.tile_k_size;
+ auto n_aligned_4byte = (n_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
+ auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;

jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("K_PADDED_IN0", k_size),
MakeJitConstant("N_PADDED", n_size),
MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),
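A worked instance of the two new JIT constants (values hypothetical): for an f16 GEMM (2 bytes per element) with K = 64 and N = 33, K_IS_ALIGNED_4BYTE comes out 1 (128 bytes) while N_IS_ALIGNED_4BYTE comes out 0 (66 bytes), so the kernel block-reads A but keeps the guarded scalar path for B. Note that both checks use inputs[0]'s element type; since the two GEMM inputs normally share a data type, this also covers the N check on input1.

#include <cstddef>
#include <iostream>

int main() {
    std::size_t bytes_per_elem = 2;       // f16 (hypothetical)
    std::size_t k_size = 64, n_size = 33;
    bool k_aligned_4byte = (k_size * bytes_per_elem) % 4 == 0;  // 128 % 4 == 0 -> true
    bool n_aligned_4byte = (n_size * bytes_per_elem) % 4 == 0;  // 66 % 4 == 2  -> false
    std::cout << "K_IS_ALIGNED_4BYTE=" << k_aligned_4byte
              << ", N_IS_ALIGNED_4BYTE=" << n_aligned_4byte << std::endl;
}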
17 changes: 16 additions & 1 deletion src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -162,6 +162,7 @@ static void print_help_messages() {
" Currently, other layers except input-layer('parameter' type) are loading binaries for only input."
" Different input or output tensors are seperated by ','. Different layers are separated by space. For example, "
" \"[input_layer_name1]:[binary_dumped_file1],[binary_dump_file2] [input_layer_name2]:[binary_dump_1],[binary_dump_2]\"");
message_list.emplace_back("OV_GPU_EnvVar", "Enable environment variable");

auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
[](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@@ -211,7 +212,8 @@ debug_configuration::debug_configuration()
, disable_memory_reuse(0)
, disable_build_time_weight_reorder_for_dynamic_nodes(0)
, disable_runtime_skip_reorder(0)
- , disable_primitive_fusing(0) {
+ , disable_primitive_fusing(0)
+ , env_var(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@@ -261,6 +263,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("MemPreallocationOptions", mem_preallocation_params_str);
std::string load_dump_raw_bin_str;
get_gpu_debug_env_var("LoadDumpRawBinary", load_dump_raw_bin_str);
get_gpu_debug_env_var("EnvVar", env_var);

if (help > 0) {
print_help_messages();
@@ -519,4 +522,16 @@ bool debug_configuration::is_target_iteration(int64_t iteration) const {
return false;
#endif
}

+ bool debug_configuration::get_env(std::string key, int &val) const {
+ #ifdef GPU_DEBUG_CONFIG
+     if (const auto env_var = std::getenv(key.c_str())) {
+         val = std::atoi(env_var);
+         return true;
+     }
+     return false;
+ #else
+     return false;
+ #endif
+ }
} // namespace cldnn