[GPU] gemm_tile_kernel: use block reads when the N and K byte sizes are 4-byte aligned #23400

Merged: 9 commits, Mar 21, 2024
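At a glance (editor's summary, not part of the diff): the tiled GEMM kernel previously chose between a sub-group block read and a bounds-checked per-element load based on whether the tile size divides N/K (TILE_N_NOT_DIVISIBLE / TILE_K_NOT_DIVISIBLE). This PR switches that guard to a byte-alignment check: the kernel selector computes whether the N and K rows span a multiple of 4 bytes, emits the result as the JIT constants N_IS_ALIGNED_4BYTE and K_IS_ALIGNED_4BYTE, and the kernel takes the block-read path only when the flag is set. A minimal sketch of the pattern, assembled from the hunks below (all names are taken from the diff; "host side" / "device side" labels are the editor's):

// Host side (kernel selector): 4-byte alignment of the row byte-size gates the block read.
auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
jit.AddConstants({ MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte) });

// Device side (OpenCL kernel): block read when aligned, otherwise the bounds-checked scalar load.
#if K_IS_ALIGNED_4BYTE
    A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);   // sub-group block read
#else
    A_FLOATN a_read = a_ptr[sglid];             // per-work-item load
#endif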
@@ -102,7 +102,7 @@ class debug_configuration {
int verbose_color; // Print verbose color
int list_layers; // Print list layers
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitimive
int print_input_data_shapes; // Print the input data_shape for benchmark_app.
int print_input_data_shapes; // Print the input data_shape for benchmark_app.
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
@@ -275,10 +275,10 @@ KERNEL(gemm_tiled_opt)(
else
#endif // INDIRECT_INPUT1
{
#if TILE_N_NOT_DIVISIBLE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#else
#if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
#else
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#endif
b_ptr += input1_offset;
}
@@ -340,11 +340,11 @@ KERNEL(gemm_tiled_opt)(
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
A_FLOATN a_read = input0[a_idx];
#elif TILE_K_NOT_DIVISIBLE
A_FLOATN a_read = a_ptr[sglid];
#else // TILE_K_NOT_DIVISIBLE
#elif K_IS_ALIGNED_4BYTE
A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);
#endif // TILE_K_NOT_DIVISIBLE
#else // K_IS_ALIGNED_4BYTE
A_FLOATN a_read = a_ptr[sglid];
#endif // K_IS_ALIGNED_4BYTE
#endif // IS_DYNAMIC
a_ptr += input0_offset;

@@ -486,11 +486,11 @@ KERNEL(gemm_tiled_opt)(
else
#endif
{
#if TILE_N_NOT_DIVISIBLE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#else // TILE_N_NOT_DIVISIBLE
#if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
#endif // TILE_N_NOT_DIVISIBLE
#else // N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#endif // N_IS_ALIGNED_4BYTE
b_ptr += input1_offset;
}
#elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0
@@ -529,15 +529,23 @@ KERNEL(gemm_tiled_opt)(
}
#endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST

#if !INDIRECT_INPUT0 && K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
a_ptr = input0 + FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (K_FULL_ITERATIONS * TILE_K));
#endif
// Loading leftovers of the matrix A and tile C calculation
unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table);
INPUT0_TYPE a_read = input0[a_idx];
#else // INDIRECT_INPUT0
#if K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
INPUT0_TYPE a_read = BLOCK_READ_A(a_ptr, 0);
a_ptr += input0_offset;
#else
uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid));
#endif
INPUT0_TYPE a_read = input0[a_idx];

#endif
#endif // INDIRECT_INPUT0
unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
}
@@ -52,6 +52,7 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
(GetOuputSize(params.output_order, output, 'X') * GetOuputSize(params.output_order, output, 'Y'));
std::vector<size_t> global = { GetOuputSize(params.output_order, output, 'X'), GetOuputSize(params.output_order, output, 'Y'),
total_batches };
GPU_DEBUG_TRACE_DETAIL << "Draft for global work item size: [" << global[0] << ", " << global[1] << ", " << global[2] << "], " << std::endl;

dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
@@ -94,6 +95,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
}
// Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
tuning_data.tile_n_size = 32;

GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
} else {
// In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
// so the tile sizes must be the same as simd_size
@@ -103,6 +109,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_m_size = tuning_data.simd_size;
}

GPU_DEBUG_LOG << params.layerID << ": tile_m_size: " << tuning_data.tile_m_size
<< ", tile_n_size: " << tuning_data.tile_n_size
<< ", tile_k_size: " << tuning_data.tile_k_size
<< ", simd_size: " << tuning_data.simd_size << std::endl;

return tuning_data;
}
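Worked example (editor's illustration, not part of the diff): with the new heuristic in SetTuningParams, a static GEMM with m_size = 256 and n_size = 512 (both >= 128, n_size % 32 == 0, simd_size = 16, no fused ops) now gets tile_n_size = 32 instead of 16. Plugging that into the gws[0] formula from SetDefault above halves the X dimension of the global work size:

// gws[0] = Align(global[0], tile_n_size) / (tile_n_size / simd_size)
// tile_n_size = 16:  Align(512, 16) / (16 / 16) = 512
// tile_n_size = 32:  Align(512, 32) / (32 / 16) = 256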

@@ -212,13 +223,17 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
auto leftover_m = m_size % tuning_data.tile_m_size;
auto leftover_n = n_size % tuning_data.tile_n_size;
auto leftover_k = k_size % tuning_data.tile_k_size;
auto n_aligned_4byte = (n_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;

jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("K_PADDED_IN0", k_size),
MakeJitConstant("N_PADDED", n_size),
MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),