Fix unaligned IFM leftovers processing in case of compressed weights and add decompression scale post op support
sshlyapn committed Dec 5, 2023
Commit 2eec4fd (parent 9f0c13f)
Showing 2 changed files with 24 additions and 20 deletions.
@@ -313,7 +313,7 @@ inline void FUNC(fc_bf_tiled_kernel_tile_b1)(
     uint offset_ofm = out_f + fi*SIMD + get_sub_group_local_id();
 #if DECOMPRESSION_SCALE_GROUPS_NUM > 1
     const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
-                              ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
+                              ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
     ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
 #else
     ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH];
@@ -324,7 +324,7 @@ inline void FUNC(fc_bf_tiled_kernel_tile_b1)(
     ACCUMULATOR_TYPE dzp = DECOMPRESSION_ZP_VALUE;
 #elif DECOMPRESSION_ZP_GROUPS_NUM > 1
     const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
-                           ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
+                           ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
     ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset];
 #else
     ACCUMULATOR_TYPE dzp = d_zps[fi % DECOMPRESSION_ZP_LENGTH];
@@ -612,12 +612,16 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
     const uint w_idx = kii * TILE_OFM + fi;
     const uint offset_ofm = out_f + fi*SIMD + sglid;
     const uint offset_ifm = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE + load_iter * FILTER_LOAD_BLOCK_SIZE + kii;
-#if DECOMPRESSION_SCALE_GROUPS_NUM > 1
-    const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
-                              (offset_ifm / DECOMPRESSION_SCALE_GROUP_SIZE) * DECOMPRESSION_SCALE_FEATURE_PITCH;
-    ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
+#if !DECOMPRESSION_SCALE_POST_OP
+    #if DECOMPRESSION_SCALE_GROUPS_NUM > 1
+        const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
+                                  (offset_ifm / DECOMPRESSION_SCALE_GROUP_SIZE) * DECOMPRESSION_SCALE_FEATURE_PITCH;
+        ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
+    #else
+        ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH];
+    #endif
 #else
-    ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH];
+    ACCUMULATOR_TYPE ds = ACCUMULATOR_VAL_ONE;
 #endif
 
 #if DECOMPRESSION_ZP_TERM
@@ -756,7 +760,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
 
 #if DECOMPRESSION_SCALE_GROUPS_NUM > 1
     const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
-                              ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
+                              ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
     ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
 #else
     ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH];
@@ -800,7 +804,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
     uint offset_ofm = out_f + fi*SIMD + get_sub_group_local_id();
 #if DECOMPRESSION_SCALE_GROUPS_NUM > 1
     const uint scale_offset = (offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) * DECOMPRESSION_SCALE_BATCH_PITCH +
-                              ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
+                              ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
     ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
 #else
     ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH];
@@ -811,7 +815,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
     ACCUMULATOR_TYPE dzp = DECOMPRESSION_ZP_VALUE;
 #elif DECOMPRESSION_ZP_GROUPS_NUM > 1
     const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
-                           ((kii + ki*TILE_K + ni*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
+                           ((kii + ki*TILE_K + iterations*TILE_IFM*SIMD) / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
     ACCUMULATOR_TYPE dzp = decompression_zp[zp_offset];
 #else
     ACCUMULATOR_TYPE dzp = d_zps[fi % DECOMPRESSION_ZP_LENGTH];
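
Taken together, the kernel-side hunks make one fix: in the IFM-leftover paths of both fc_bf_tiled_kernel_tile_b1 and fc_bf_tiled_kernel_default, the group index for the decompression scale and zero-point is now derived from `iterations` (the number of full TILE_IFM*SIMD blocks already consumed by the main loop, i.e. the absolute IFM base of the leftovers) rather than from the loop counter `ni`. A minimal standalone C++ sketch of the indexing; the constants are illustrative assumptions, not the kernel's actual macro values:

    #include <cstdio>

    // Simplified stand-ins for the kernel's compile-time macros (assumed values).
    constexpr unsigned TILE_IFM = 2, SIMD = 16, TILE_K = 4;
    constexpr unsigned DECOMPRESSION_SCALE_GROUP_SIZE = 32;
    constexpr unsigned DECOMPRESSION_SCALE_FEATURE_PITCH = 1;

    // Scale row selected for one weight element in the leftover pass.
    // 'iterations' is how many full TILE_IFM*SIMD blocks the main loop consumed,
    // so iterations*TILE_IFM*SIMD is the absolute IFM base of the leftovers.
    unsigned leftover_scale_row(unsigned iterations, unsigned ki, unsigned kii) {
        unsigned ifm_idx = kii + ki * TILE_K + iterations * TILE_IFM * SIMD;
        return (ifm_idx / DECOMPRESSION_SCALE_GROUP_SIZE) * DECOMPRESSION_SCALE_FEATURE_PITCH;
    }

    int main() {
        // With IFM = 100 and a 32-element tile, the main loop runs 3 full blocks
        // (96 elements); leftovers start at absolute index 96 -> scale group 3.
        printf("group = %u\n", leftover_scale_row(/*iterations=*/3, /*ki=*/0, /*kii=*/0));
        // A leftover-local counter (e.g. ni = 0) would instead give group 0,
        // which is the misindexing this commit fixes.
    }

With an unaligned IFM such as 100, a leftover-local counter always lands in group 0, so compressed weights past the last full tile were dequantized with the wrong group's scale and zero-point.
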
@@ -384,7 +384,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
 
         jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", weights_dt, tile_k_ofm));
         const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v;
-        // Do not use SCALE_POST_OP for SLM kernel
+        // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance
         if (scale_group_size % simd == 0 && !dispatchData.use_slm)
             jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
     }
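
For context on the new post-op path: as the kernel hunks above suggest, with DECOMPRESSION_SCALE_POST_OP the kernel sets ds to ACCUMULATOR_VAL_ONE during accumulation and applies the group scale to the partial sum afterwards, one multiply per scale group instead of one per weight. That is only exact when group boundaries line up with the accumulation blocks, which is what the `scale_group_size % simd == 0` guard ensures. A scalar C++ sketch of the equivalence (helper names are hypothetical, not the kernel code):

    #include <vector>
    #include <cstdio>

    // Dot product with per-group weight scales, computed two ways.
    // group_size is assumed to divide x.size(), mirroring the kernel's guard.
    float per_element(const std::vector<float>& x, const std::vector<float>& qw,
                      const std::vector<float>& scales, size_t group_size) {
        float acc = 0.f;
        for (size_t i = 0; i < x.size(); ++i)
            acc += x[i] * (qw[i] * scales[i / group_size]);  // scale every weight
        return acc;
    }

    float post_op(const std::vector<float>& x, const std::vector<float>& qw,
                  const std::vector<float>& scales, size_t group_size) {
        float acc = 0.f;
        for (size_t g = 0; g < x.size() / group_size; ++g) {
            float partial = 0.f;
            for (size_t i = g * group_size; i < (g + 1) * group_size; ++i)
                partial += x[i] * qw[i];                     // accumulate unscaled
            acc += partial * scales[g];                      // one multiply per group
        }
        return acc;
    }

    int main() {
        std::vector<float> x{1, 2, 3, 4}, qw{5, 6, 7, 8}, s{0.5f, 0.25f};
        printf("%f %f\n", per_element(x, qw, s, 2), post_op(x, qw, s, 2));  // equal
    }

The added !dispatchData.use_slm condition simply keeps the SLM variant on the per-element path, per the updated comment.
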
@@ -509,7 +509,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
                                        tparams.exec_options,
                                        autoTuneIndex);
 
-    // In case of dynamic params try to compile optimized SLM kernel for large batches
+    // In case of dynamic params try to configure additional optimized SLM kernel for large batches
     if (params.is_shape_agnostic) {
         auto tparams = GetAutoTuneParams(fc_params, KernelType::SLM, autoTuneIndex);
         auto can_select_slm_kernel = tparams.kernel_type == KernelType::SLM;
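
The comment above describes the shape-agnostic setup: both the default and the SLM variant are compiled once up front, and each shape update merely toggles which one executes. A generic sketch of that pattern; the struct and function names are hypothetical, not OpenVINO API:

    #include <array>
    #include <cstddef>
    #include <cstdio>

    // Hypothetical stand-in for one compiled kernel plus its runtime toggle.
    struct KernelEntry {
        const char* name;
        bool skip_execution = false;
    };

    // Per-shape update: no recompilation, just flip skip_execution flags.
    // Index 0 = default kernel, index 1 = SLM kernel (as in kd.kernels).
    void update_for_shape(std::array<KernelEntry, 2>& kernels, size_t output_batch) {
        constexpr size_t default_alignment = 16;  // value taken from the diff below
        const size_t skip_idx = (output_batch + default_alignment > 256) ? 0 : 1;
        kernels[skip_idx].skip_execution = true;
        kernels[1 - skip_idx].skip_execution = false;
    }

    int main() {
        std::array<KernelEntry, 2> kernels{{ {"default"}, {"slm"} }};
        update_for_shape(kernels, 512);  // large batch -> SLM variant runs
        printf("%s kernel will run\n",
               kernels[0].skip_execution ? kernels[1].name : kernels[0].name);
    }
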
@@ -541,22 +541,22 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
         output_batch *= prim_params.outputs[0].Feature().v;
 
         // Choose one of the two shape agnostic kernels:
-        // - kd.kernels[0] for batches < 256 (default version)
+        // - kd.kernels[0] for batches <= 240 (default version)
         // - kd.kernels[1] for batches >= 256 (slm version)
         const auto default_alignment = 16;
-        // We can use SLM version if `output_batch + default_alignment > 256` because memory is aligned (whether 16 or 64 elements)
+        // We can use SLM version if `output_batch + default_alignment > 256` because memory and batch are aligned (whether 16 or 64 elements)
         const auto skip_kernel_idx = output_batch + default_alignment > 256 ? 0 : 1;
-        const auto selected_kernel_idx = 1 - skip_kernel_idx;
+        const auto execute_kernel_idx = 1 - skip_kernel_idx;
 
         kd.kernels[skip_kernel_idx].skip_execution = true;
 
-        GPU_DEBUG_TRACE_DETAIL << "FC bf tiled: " << (selected_kernel_idx == 1 ? "SLM" : "Default") << " shape-agnostic kernel version "
+        GPU_DEBUG_TRACE_DETAIL << "FC bf tiled: " << (execute_kernel_idx == 1 ? "SLM" : "Default") << " shape-agnostic kernel version "
                                << "will be used for batch size = " << output_batch << "\n";
 
-        auto dispatchData = SetDefault(prim_params, -1, selected_kernel_idx);
-        kd.kernels[selected_kernel_idx].params.workGroups.global = dispatchData.gws;
-        kd.kernels[selected_kernel_idx].params.workGroups.local = dispatchData.lws;
-        kd.kernels[selected_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params);
+        auto dispatchData = SetDefault(prim_params, -1, execute_kernel_idx);
+        kd.kernels[execute_kernel_idx].params.workGroups.global = dispatchData.gws;
+        kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws;
+        kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params);
     };
 }
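
The revised `<= 240` comment is the alignment arithmetic spelled out: output_batch is padded to a multiple of 16 (or 64), so no value can fall strictly between 240 and 256, and the predicate `output_batch + default_alignment > 256` partitions aligned batches exactly into <= 240 (default kernel) and >= 256 (SLM kernel). A quick self-contained check of that claim:

    #include <cstdio>

    // For batches aligned to 16 (or 64), "output_batch + 16 > 256" is exactly
    // "output_batch >= 256": no aligned batch lies strictly between 240 and 256.
    int main() {
        for (unsigned batch = 16; batch <= 320; batch += 16) {
            bool slm_selected = batch + 16 > 256;   // predicate from the diff
            bool expected     = batch >= 256;       // intended partition
            if (slm_selected != expected)
                printf("mismatch at %u\n", batch);  // never prints
        }
        printf("predicate matches batch >= 256 for all aligned sizes\n");
    }
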
