[GPU] In gemm_tile_kernel, use block read when the N and K byte sizes are 4-byte aligned. #23400

Merged · 9 commits · Mar 21, 2024
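For context: a sub-group block read transfers data in 4-byte (dword) units, so it is legal whenever a row's total byte size is a multiple of 4, even when N or K is not divisible by the tile size. This PR therefore gates the vectorized load on byte alignment (new N_IS_ALIGNED_4BYTE / K_IS_ALIGNED_4BYTE macros) instead of on tile divisibility. A minimal C++ sketch of the predicate, mirroring the n_aligned_4byte / k_aligned_4byte checks added in GetJitConstants below (the helper name is ours):

#include <cstddef>

// A block read moves dwords, so a row qualifies only when its byte size is a
// multiple of 4: f32 rows always qualify, f16 rows need an even element count.
static bool is_aligned_4byte(std::size_t elem_count, std::size_t bytes_per_elem) {
    return (elem_count * bytes_per_elem) % 4 == 0;
}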
@@ -102,7 +102,7 @@ class debug_configuration {
int verbose_color; // Print verbose color
int list_layers; // Print list layers
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
- int print_input_data_shapes; // Print the input data_shape for benchmark_app.
+ int print_input_data_shapes;          // Print the input data_shape for benchmark_app.
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
@@ -134,6 +134,7 @@ class debug_configuration {
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
int disable_runtime_skip_reorder; // Disable runtime skip reorder
int disable_primitive_fusing; // Disable primitive fusing
+ int env_var; // Enable environment variable
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
static const debug_configuration *get_instance();
@@ -143,6 +144,7 @@ class debug_configuration {
bool is_layer_for_dumping(const std::string& layerName, bool is_output = false, bool is_input = false) const;
bool is_target_iteration(int64_t iteration) const;
std::string get_matched_from_filelist(const std::vector<std::string>& file_names, std::string pattern) const;
+ bool get_env(std::string key, int &val) const;

struct memory_preallocation_params {
bool is_initialized = false;
@@ -275,10 +275,10 @@ KERNEL(gemm_tiled_opt)(
else
#endif // INDIRECT_INPUT1
{
- #if TILE_N_NOT_DIVISIBLE
- b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
- #else
+ #if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
+ #else
+ b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#endif
b_ptr += input1_offset;
}
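The restructured guard above takes the block-read path whenever N is 4-byte aligned, and the per-lane load with its N-bounds check becomes the fallback. A hedged host-side emulation of the two paths for one 16-lane sub-group (plain C++, not the kernel code; BLOCK_READ_B is modeled as a contiguous copy):

#include <cstring>

constexpr unsigned kSimd = 16;  // sub-group width assumed by this sketch

// out[lane] mirrors b_tile[b_load_id] for each lane of the sub-group.
void load_b_row(const float* b_ptr, unsigned base_col, unsigned N,
                bool n_is_aligned_4byte, float out[kSimd]) {
    if (n_is_aligned_4byte) {
        std::memcpy(out, b_ptr, kSimd * sizeof(float));   // BLOCK_READ_B analogue
    } else {
        for (unsigned sglid = 0; sglid < kSimd; ++sglid)  // guarded scalar fallback
            out[sglid] = (base_col + sglid > N - 1) ? 0.0f : b_ptr[sglid];
    }
}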
@@ -340,11 +340,11 @@ KERNEL(gemm_tiled_opt)(
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
A_FLOATN a_read = input0[a_idx];
- #elif TILE_K_NOT_DIVISIBLE
- A_FLOATN a_read = a_ptr[sglid];
- #else // TILE_K_NOT_DIVISIBLE
+ #elif K_IS_ALIGNED_4BYTE
A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);
- #endif // TILE_K_NOT_DIVISIBLE
+ #else // K_IS_ALIGNED_4BYTE
+ A_FLOATN a_read = a_ptr[sglid];
+ #endif // K_IS_ALIGNED_4BYTE
#endif // IS_DYNAMIC
a_ptr += input0_offset;

@@ -486,11 +486,11 @@ KERNEL(gemm_tiled_opt)(
else
#endif
{
- #if TILE_N_NOT_DIVISIBLE
- b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
- #else // TILE_N_NOT_DIVISIBLE
+ #if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
- #endif // TILE_N_NOT_DIVISIBLE
+ #else // N_IS_ALIGNED_4BYTE
+ b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
+ #endif // N_IS_ALIGNED_4BYTE
b_ptr += input1_offset;
}
#elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0
@@ -529,15 +529,23 @@ KERNEL(gemm_tiled_opt)(
}
#endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST

+ #if !INDIRECT_INPUT0 && K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+ a_ptr = input0 + FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (K_FULL_ITERATIONS * TILE_K));
+ #endif
// Loading leftovers of the matrix A and tile C calculation
unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table);
+ INPUT0_TYPE a_read = input0[a_idx];
#else // INDIRECT_INPUT0
+ #if K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
+ INPUT0_TYPE a_read = BLOCK_READ_A(a_ptr, 0);
+ a_ptr += input0_offset;
+ #else
uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid));
- #endif
INPUT0_TYPE a_read = input0[a_idx];
-
+ #endif
+ #endif // INDIRECT_INPUT0
unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
}
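One subtlety in the leftover hunk above: the new block-read branch consumes A through a moving pointer rather than a per-lane computed index, so a_ptr is first re-pointed at column K_FULL_ITERATIONS * TILE_K (the first leftover column) and then advanced by input0_offset per row. A small sketch of that bookkeeping under an assumed plain row-major layout (the kernel's real indexing goes through get_input0_index):

#include <cstddef>

// First element the leftover loop must read for a given row, after
// k_full_iterations full TILE_K-wide steps have been consumed.
std::size_t leftover_start(std::size_t row, std::size_t K,
                           std::size_t tile_k, std::size_t k_full_iterations) {
    return row * K + k_full_iterations * tile_k;
}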
@@ -52,6 +52,7 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
(GetOuputSize(params.output_order, output, 'X') * GetOuputSize(params.output_order, output, 'Y'));
std::vector<size_t> global = { GetOuputSize(params.output_order, output, 'X'), GetOuputSize(params.output_order, output, 'Y'),
total_batches };
+ GPU_DEBUG_LOG << "[" << global[0] << ", " << global[1] << ", " << global[2] << "], " << std::endl;
Contributor:
I think this is too internal information. If you need it, it would be better to use a more detailed log level with a proper explanation, e.g.
GPU_DEBUG_TRACE_DETAIL << "Draft for global work item size: " << ....

Contributor (Author):
Applied


dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
@@ -60,6 +61,9 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
dispatchData.lws[0] = td.simd_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;

+ GPU_DEBUG_LOG << "gws: [" << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << "], "
+               << "lws: [" << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << "] " << std::endl;
}
return dispatchData;
}
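To make the new gws/lws log line concrete, here is the same arithmetic as a standalone C++ sketch with hypothetical sizes (numbers ours, not from the PR):

#include <cstddef>
#include <iostream>

// Same role as the Align() helper used above: round x up to a multiple of a.
static std::size_t align_up(std::size_t x, std::size_t a) { return (x + a - 1) / a * a; }

int main() {
    std::size_t n = 320, m = 200;                     // hypothetical output size
    std::size_t tile_n = 32, tile_m = 16, simd = 16;  // hypothetical tuning data
    std::size_t gws0 = align_up(n, tile_n) / (tile_n / simd);  // 320 / 2  = 160
    std::size_t gws1 = align_up(m, tile_m) / tile_m;           // 208 / 16 = 13
    std::cout << "gws: [" << gws0 << ", " << gws1 << ", 1], "
              << "lws: [" << simd << ", 1, 1]" << std::endl;
}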
@@ -94,6 +98,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
}
+ // Increasing tile_n_size improves performance when m_size and n_size are not shallow and n_size is aligned to 32.
+ if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
+     tuning_data.tile_n_size = 32;
+
+ GPU_DEBUG_LOG << "m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
} else {
// In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
// so the tile sizes must be the same as simd_size
@@ -103,6 +112,24 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_m_size = tuning_data.simd_size;
}

+ GPU_DEBUG_GET_INSTANCE(debug_config);
+ GPU_DEBUG_IF(debug_config->env_var) {
+     int val;
+     if (debug_config->get_env("GEMM_TILE_M", val))
+         tuning_data.tile_m_size = val;
+     if (debug_config->get_env("GEMM_TILE_N", val))
+         tuning_data.tile_n_size = val;
+     if (debug_config->get_env("GEMM_TILE_K", val))
+         tuning_data.tile_k_size = val;
+     if (debug_config->get_env("GEMM_TILE_SIMD", val))
+         tuning_data.simd_size = val;
+ }
+
+ GPU_DEBUG_LOG << "tile_m_size: " << tuning_data.tile_m_size
+               << ", tile_n_size: " << tuning_data.tile_n_size
+               << ", tile_k_size: " << tuning_data.tile_k_size
+               << ", simd_size: " << tuning_data.simd_size << std::endl;
return tuning_data;
}
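The override block above means that, when the OV_GPU_EnvVar debug option is enabled, any of GEMM_TILE_M / GEMM_TILE_N / GEMM_TILE_K / GEMM_TILE_SIMD present in the environment replaces the tuned value. A hedged standalone emulation of that flow (get_env_int stands in for debug_configuration::get_env):

#include <cstdlib>
#include <iostream>

// Emulates get_env(): parse an int from the environment; leave val unchanged
// when the variable is absent. Note std::atoi yields 0 for non-numeric input.
static bool get_env_int(const char* key, int& val) {
    if (const char* s = std::getenv(key)) {
        val = std::atoi(s);
        return true;
    }
    return false;
}

int main() {
    int tile_m = 16, tile_n = 16, tile_k = 16, simd = 16;  // hypothetical tuned defaults
    get_env_int("GEMM_TILE_M", tile_m);
    get_env_int("GEMM_TILE_N", tile_n);
    get_env_int("GEMM_TILE_K", tile_k);
    get_env_int("GEMM_TILE_SIMD", simd);
    std::cout << "tile_m=" << tile_m << ", tile_n=" << tile_n
              << ", tile_k=" << tile_k << ", simd=" << simd << std::endl;
}

Running it with GEMM_TILE_N=32 in the environment prints tile_n=32 while the other values keep their defaults.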

@@ -212,13 +239,17 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
auto leftover_m = m_size % tuning_data.tile_m_size;
auto leftover_n = n_size % tuning_data.tile_n_size;
auto leftover_k = k_size % tuning_data.tile_k_size;
+ auto n_aligned_4byte = (n_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
+ auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;

jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("K_PADDED_IN0", k_size),
MakeJitConstant("N_PADDED", n_size),
MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),
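A worked instance of the two new JIT constants (values hypothetical): for an f16 GEMM (2 bytes per element) with K = 64 and N = 33, K_IS_ALIGNED_4BYTE comes out 1 (128 bytes) while N_IS_ALIGNED_4BYTE comes out 0 (66 bytes), so the kernel block-reads A but keeps the guarded scalar path for B. Note that both checks use inputs[0]'s element type; since the two GEMM inputs normally share a data type, this also covers the N check on input1.

#include <cstddef>
#include <iostream>

int main() {
    std::size_t bytes_per_elem = 2;       // f16 (hypothetical)
    std::size_t k_size = 64, n_size = 33;
    bool k_aligned_4byte = (k_size * bytes_per_elem) % 4 == 0;  // 128 % 4 == 0 -> true
    bool n_aligned_4byte = (n_size * bytes_per_elem) % 4 == 0;  // 66 % 4 == 2  -> false
    std::cout << "K_IS_ALIGNED_4BYTE=" << k_aligned_4byte
              << ", N_IS_ALIGNED_4BYTE=" << n_aligned_4byte << std::endl;
}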
17 changes: 16 additions & 1 deletion src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -162,6 +162,7 @@ static void print_help_messages() {
" Currently, other layers except input-layer('parameter' type) are loading binaries for only input."
" Different input or output tensors are seperated by ','. Different layers are separated by space. For example, "
" \"[input_layer_name1]:[binary_dumped_file1],[binary_dump_file2] [input_layer_name2]:[binary_dump_1],[binary_dump_2]\"");
message_list.emplace_back("OV_GPU_EnvVar", "Enable environment variable");

auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
[](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@@ -211,7 +212,8 @@ debug_configuration::debug_configuration()
, disable_memory_reuse(0)
, disable_build_time_weight_reorder_for_dynamic_nodes(0)
, disable_runtime_skip_reorder(0)
- , disable_primitive_fusing(0) {
+ , disable_primitive_fusing(0)
+ , env_var(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@@ -261,6 +263,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("MemPreallocationOptions", mem_preallocation_params_str);
std::string load_dump_raw_bin_str;
get_gpu_debug_env_var("LoadDumpRawBinary", load_dump_raw_bin_str);
get_gpu_debug_env_var("EnvVar", env_var);

if (help > 0) {
print_help_messages();
@@ -519,4 +522,16 @@ bool debug_configuration::is_target_iteration(int64_t iteration) const {
return false;
#endif
}

+ bool debug_configuration::get_env(std::string key, int &val) const {
+ #ifdef GPU_DEBUG_CONFIG
+     if (const auto env_var = std::getenv(key.c_str())) {
+         val = std::atoi(env_var);
+         return true;
+     }
+     return false;
+ #else
+     return false;
+ #endif
+ }
} // namespace cldnn