[GPU] In gemm_tile_kernel, use block read when the N and K byte sizes are aligned to 4. (openvinotoolkit#23400)

### Details:
- *Element-by-element reads are the bottleneck in the gemm_tiled kernel. Enable block reads when the N and K sizes are 4-byte aligned, including when N and K have leftovers (see the sketch below the list).*
- *Increasing tile_n_size improves performance when m_size and n_size are not shallow and n_size is a multiple of 32.*
- *Add GEMM_TILE_M/N/K/SIMD environment variables for convenience.*
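
The gist of that gating, as a minimal standalone C++ sketch (`is_aligned_4byte` and `bytes_per_element` are illustrative names, not the kernel-selector API; the actual checks appear in the GetJitConstants hunk further down):

```cpp
#include <cstdio>

// A tile row can be fetched with sub-group block reads only when its total
// byte size is a multiple of 4; otherwise the bounds-checked element-by-element
// path has to stay so leftover work-items do not read out of range.
static bool is_aligned_4byte(unsigned elements, unsigned bytes_per_element) {
    return (elements * bytes_per_element) % 4 == 0;
}

int main() {
    // f16 input: 2 bytes per element.
    std::printf("N=1024: block read = %d\n", is_aligned_4byte(1024, 2)); // 1 (2048 % 4 == 0)
    std::printf("K=771:  block read = %d\n", is_aligned_4byte(771, 2));  // 0 (1542 % 4 == 2)
    return 0;
}
```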

### Tickets:
- *134279*

---------

Signed-off-by: hyunback <[email protected]>
hyunback authored and alvoron committed Apr 29, 2024
1 parent 441aebb commit 0924d66
Showing 3 changed files with 37 additions and 14 deletions.
@@ -102,7 +102,7 @@ class debug_configuration {
int verbose_color; // Print verbose color
int list_layers; // Print list layers
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitimive
int print_input_data_shapes; // Print the input data_shape for benchmark_app.
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
@@ -275,10 +275,10 @@ KERNEL(gemm_tiled_opt)(
else
#endif // INDIRECT_INPUT1
{
#if TILE_N_NOT_DIVISIBLE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#else
#if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
#else
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#endif
b_ptr += input1_offset;
}
@@ -340,11 +340,11 @@ KERNEL(gemm_tiled_opt)(
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
A_FLOATN a_read = input0[a_idx];
#elif TILE_K_NOT_DIVISIBLE
A_FLOATN a_read = a_ptr[sglid];
#else // TILE_K_NOT_DIVISIBLE
#elif K_IS_ALIGNED_4BYTE
A_FLOATN a_read = BLOCK_READ_A(a_ptr, 0);
#endif // TILE_K_NOT_DIVISIBLE
#else // K_IS_ALIGNED_4BYTE
A_FLOATN a_read = a_ptr[sglid];
#endif // K_IS_ALIGNED_4BYTE
#endif // IS_DYNAMIC
a_ptr += input0_offset;

@@ -486,11 +486,11 @@ KERNEL(gemm_tiled_opt)(
else
#endif
{
#if TILE_N_NOT_DIVISIBLE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#else // TILE_N_NOT_DIVISIBLE
#if N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = BLOCK_READ_B(b_ptr, 0);
#endif // TILE_N_NOT_DIVISIBLE
#else // N_IS_ALIGNED_4BYTE
b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
#endif // N_IS_ALIGNED_4BYTE
b_ptr += input1_offset;
}
#elif TRANSPOSE_INPUT1 == TRANSPOSE_OTHER // TRANSPOSE_INPUT1 == 0
@@ -529,15 +529,23 @@ KERNEL(gemm_tiled_opt)(
}
#endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST

#if !INDIRECT_INPUT0 && K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
a_ptr = input0 + FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (K_FULL_ITERATIONS * TILE_K));
#endif
// Loading leftovers of the matrix A and tile C calculation
unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
#if INDIRECT_INPUT0
uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid), beam_table);
INPUT0_TYPE a_read = input0[a_idx];
#else // INDIRECT_INPUT0
#if K_IS_ALIGNED_4BYTE && (TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST)
INPUT0_TYPE a_read = BLOCK_READ_A(a_ptr, 0);
a_ptr += input0_offset;
#else
uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (K_FULL_ITERATIONS * TILE_K + sglid));
#endif
INPUT0_TYPE a_read = input0[a_idx];

#endif
#endif // INDIRECT_INPUT0
unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
}
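For orientation, here is a rough host-side C++ emulation of the two access patterns the hunks above switch between: the bounds-checked per-lane gather that was always taken before, and the contiguous sub-group block load now used on the aligned path. It only illustrates the access pattern; `SIMD`, the function names, and the scalar loops are stand-ins for the `BLOCK_READ_A`/`BLOCK_READ_B` sub-group built-ins in the real kernel.

```cpp
#include <cstddef>
#include <vector>

constexpr int SIMD = 16;  // sub-group size assumed for this sketch

// Element-by-element path: each lane checks its own global column and either
// loads one value or zero-fills. Safe for N/K leftovers, but scalar loads.
void load_tile_elementwise(const std::vector<float>& src, std::size_t base,
                           std::size_t n, std::size_t col0, float tile[SIMD]) {
    for (int lane = 0; lane < SIMD; ++lane) {
        const std::size_t col = col0 + lane;
        tile[lane] = (col < n) ? src[base + lane] : 0.0f;
    }
}

// Block-read path: the whole sub-group pulls SIMD consecutive elements at once.
// Only legal when the row byte size is aligned, which *_IS_ALIGNED_4BYTE guards.
void load_tile_blockread(const std::vector<float>& src, std::size_t base, float tile[SIMD]) {
    for (int lane = 0; lane < SIMD; ++lane)
        tile[lane] = src[base + lane];  // contiguous, no per-lane branching
}
```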
@@ -52,6 +52,7 @@ GemmKernelBase::DispatchData GemmKernelTiledOpt::SetDefault(const gemm_params& p
(GetOuputSize(params.output_order, output, 'X') * GetOuputSize(params.output_order, output, 'Y'));
std::vector<size_t> global = { GetOuputSize(params.output_order, output, 'X'), GetOuputSize(params.output_order, output, 'Y'),
total_batches };
GPU_DEBUG_TRACE_DETAIL << "Draft for global work item size: [" << global[0] << ", " << global[1] << ", " << global[2] << "], " << std::endl;

dispatchData.gws[0] = Align(global[0], td.tile_n_size) / (td.tile_n_size / td.simd_size);
dispatchData.gws[1] = Align(global[1], td.tile_m_size) / td.tile_m_size;
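A worked example of the dispatch math above, as a standalone sketch (`align_up` stands in for the kernel selector's `Align` helper, assumed to round up to the nearest multiple; the shape and tile numbers are made up):

```cpp
#include <cstddef>
#include <cstdio>

// Round x up to the nearest multiple of a.
static std::size_t align_up(std::size_t x, std::size_t a) { return (x + a - 1) / a * a; }

int main() {
    const std::size_t n = 1000, m = 300;                   // global[0], global[1]
    const std::size_t tile_n = 32, tile_m = 16, simd = 16;

    // Each sub-group of `simd` work-items covers tile_n output columns.
    const std::size_t gws0 = align_up(n, tile_n) / (tile_n / simd);  // 1024 / 2 = 512
    // One work-item per tile_m output rows.
    const std::size_t gws1 = align_up(m, tile_m) / tile_m;           // 304 / 16 = 19

    std::printf("gws = [%zu, %zu, batches]\n", gws0, gws1);
    return 0;
}
```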
@@ -94,6 +95,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_k_size = tuning_data.simd_size;
tuning_data.tile_m_size = tuning_data.simd_size;
}
// Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
tuning_data.tile_n_size = 32;

GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;
} else {
// In shape agnostic kernel case, the vector size of FusedOpsConfiguration cannot be specified at build time,
// so the tile sizes must be the same as simd_size
@@ -103,6 +109,11 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
tuning_data.tile_m_size = tuning_data.simd_size;
}

GPU_DEBUG_LOG << params.layerID << ": tile_m_size: " << tuning_data.tile_m_size
<< ", tile_n_size: " << tuning_data.tile_n_size
<< ", tile_k_size: " << tuning_data.tile_k_size
<< ", simd_size: " << tuning_data.simd_size << std::endl;

return tuning_data;
}
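Condensed, the tuning change in this function boils down to the rule below (a sketch; the default tile values here are placeholders, not the kernel's actual defaults, and in the shape-agnostic branch all tiles stay equal to the SIMD size as the comment above explains):

```cpp
#include <cstddef>

struct TuningData { std::size_t tile_m = 8, tile_n = 16, tile_k = 16, simd = 16; };

// Widen the N tile only for GEMMs that are deep enough in both M and N, whose
// N dimension is a multiple of 32, and that have no fused ops constraining the
// per-element vector width used by FusedOpsConfiguration.
TuningData pick_tiles(std::size_t m_size, std::size_t n_size, bool has_fused_ops) {
    TuningData td;  // placeholder defaults
    if (m_size >= 128 && n_size >= 128 && n_size % 32 == 0 && td.simd == 16 && !has_fused_ops)
        td.tile_n = 32;
    return td;
}
```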

@@ -212,13 +223,17 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
auto leftover_m = m_size % tuning_data.tile_m_size;
auto leftover_n = n_size % tuning_data.tile_n_size;
auto leftover_k = k_size % tuning_data.tile_k_size;
auto n_aligned_4byte = (n_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;
auto k_aligned_4byte = (k_size * BytesPerElement(params.inputs[0].GetDType())) % 4 == 0;

jit.AddConstants({
MakeJitConstant("M", m_size),
MakeJitConstant("K", k_size),
MakeJitConstant("N", n_size),
MakeJitConstant("K_PADDED_IN0", k_size),
MakeJitConstant("N_PADDED", n_size),
MakeJitConstant("K_IS_ALIGNED_4BYTE", k_aligned_4byte),
MakeJitConstant("N_IS_ALIGNED_4BYTE", n_aligned_4byte),
MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size),
MakeJitConstant("TILE_M", tuning_data.tile_m_size),
MakeJitConstant("TILE_K", tuning_data.tile_k_size),
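To see how the new flags come out for a concrete shape, a small sketch is below; the `-D...` string only illustrates that the booleans end up as 0/1 preprocessor values tested by the kernel's `#if` guards, and the exact build-option syntax is an assumption, not copied from the kernel selector.

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Example: f16 GEMM with N = 1023, K = 768 (2 bytes per element).
    const std::size_t bytes_per_element = 2;
    const std::size_t n_size = 1023, k_size = 768;

    const bool n_aligned = (n_size * bytes_per_element) % 4 == 0;  // 2046 % 4 != 0 -> false
    const bool k_aligned = (k_size * bytes_per_element) % 4 == 0;  // 1536 % 4 == 0 -> true

    // With these values the kernel keeps the bounds-checked element reads for B
    // but can switch the A loads over to BLOCK_READ_A.
    std::printf("-DN_IS_ALIGNED_4BYTE=%d -DK_IS_ALIGNED_4BYTE=%d\n", n_aligned, k_aligned);
    return 0;
}
```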
