Skip to content

Commit

Permalink
[GPU] Modify fc_gpu_bf_tiled kernel to enable weight zp (#26367)
Browse files Browse the repository at this point in the history
### Details:
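 - Enable weight zero-point (wzp) handling in the dynamic-quantize path of the fc_gpu_bf_tiled kernel
 - Add unit tests for dynamic quantization with and without a weight zero-point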

### Tickets:
 - CVS-150930

---------

Signed-off-by: Min, Byung-il <[email protected]>
Signed-off-by: Min, Byungil <[email protected]>
  • Loading branch information
byungilm authored Sep 24, 2024
1 parent de30969 commit e1c167a
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -886,38 +886,44 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16)));
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
dq_wei_unpacked.s0123 = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked.s4567 = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
// Weights loaded into 'wei_packed' in os_iyx_osv16 format are already contiguous along TILE_K, so no transpose is needed while unpacking.
dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
#else
SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
#endif

// Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
// Calculate weight : w = (w - dzp) * ds
#if DECOMPRESSION_ZP_TERM
#if DECOMPRESSION_ZP_SCALAR
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE);
dq_wei_unpacked -= dzp;
#elif DECOMPRESSION_ZP_GROUPS_NUM > 1
DQ_SLM_FILTER_UNPACKED_VEC dzp;
DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked);
const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE;
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
const uint offset_ofm = out_f + fi*SIMD + sglid;
unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) {
const uint offset_ofm = out_f + fi*SIMD + sglid;
const uint offset_ifm = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE + load_iter * FILTER_LOAD_BLOCK_SIZE + kii;
const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE + kii;
const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
(offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
dzp[W_IDX] = decompression_zp[zp_offset];
w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - TO_DQ_TYPE(decompression_zp[zp_offset]);
}
}
#else
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(d_zps[0]);
DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked);
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) {
w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - d_zps[fi % DECOMPRESSION_ZP_LENGTH];
}
}
#endif
#else
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO);
#endif

// Calculate weight : w = (w - dzp) * ds
dq_wei_unpacked -= dzp;

#if FILTER_LOAD_BLOCK_SIZE == 2
DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23};
char_slm_weight[wei_local_idx] = as_int(wei_1);
Expand Down Expand Up @@ -1117,7 +1123,7 @@ KERNEL(fc)(
#endif
) {
#if USE_SLM
#if DYNAMIC_QUANTIZE
#if DYNAMIC_QUANTIZE && (TILE_OFM == 2)
__local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
#else
__local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD];
Expand Down Expand Up @@ -1259,7 +1265,7 @@ KERNEL(fc)(
#endif
);
} else {
#if USE_SLM && DYNAMIC_QUANTIZE
#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
Expand Down Expand Up @@ -1306,7 +1312,7 @@ KERNEL(fc)(
#endif
}
#else
#if USE_SLM && DYNAMIC_QUANTIZE
#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ static bool should_dynamic_quantize(const fully_connected_params& params) {
if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) &&
(params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) &&
params.inputs[0].GetDType() == Datatype::F16 &&
(params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4) &&
(params.decompression_zero_point.Feature().v == 1)) {
(params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4)) {
GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size " << scale_group_size << ", Input (" <<
kernel_selector::toString(params.inputs[0].GetDType()) << ", " << kernel_selector::toString(params.outputs[0].GetLayout()) <<
") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << ", Y: " << params.inputs[0].Y().v << std ::endl;
Expand Down Expand Up @@ -524,13 +523,15 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
if (scale_group_size % simd == 0 && !dispatchData.use_slm)
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
}
if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2)
if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
else if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16)
} else if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) {
jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
else
} else {
jit.AddConstant(MakeJitConstant("W_IDX", "kii * TILE_OFM + fi"));
}

jit.AddConstant(MakeJitConstant("W_DYN_QUAN_IDX", "fi * TILE_K + kii"));

if (dispatchData.use_slm) {
OPENVINO_ASSERT(dispatchData.tile_n == 2, "[GPU] Unsupported TILE_OFM size for SLM kernel configuration");
Expand Down Expand Up @@ -576,14 +577,14 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
}

// Validated perf gain: dynamic quantization force-enables DECOMPRESSION_SCALE_POST_OP for char-type multiplication
if (should_dynamic_quantize(params) && dispatchData.tile_m > 1 && dispatchData.tile_n == 2) {
if (should_dynamic_quantize(params)) {
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
} else {
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", -1));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size));
}

jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2540,7 +2540,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
}

void test_compressed_int4_scale_dyn_quan_weight_i4(bool is_dynamic, int batch = 1, int ifm = 512, int ofm = 2048,
int quantize_group_size = 32, int scales_group_size = 128) {
int quantize_group_size = 32, int scales_group_size = 128,
bool is_wzp_test = false, bool is_wzp_scalar = false) {
tests::random_generator rg(GET_SUITE_NAME);
auto& engine = get_test_engine();

Expand All @@ -2550,12 +2551,15 @@ class fully_connected_gpu_tests: public ::testing::Test {
long int batch_num = batch;
long int ifm_num = ifm;
long int ofm_num = ofm;
long int wzp_num = is_wzp_scalar ? 1 : ofm_num;

auto input_ps = ov::PartialShape{ batch_num, 1, ifm_num };
auto input_mem = engine.allocate_memory({ input_ps, data_types::f16, format::bfyx });

auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::i4, format::bfyx });
auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::fbyx });
auto dcomp_zp_mem = engine.allocate_memory({ {wzp_num, 1}, data_types::u8, format::bfyx });


auto input_data = rg.generate_random_1d<ov::float16>(batch_num * ifm_num, -2.f, 2.f);
set_values(input_mem, input_data);
Expand All @@ -2566,28 +2570,38 @@ class fully_connected_gpu_tests: public ::testing::Test {
auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -2.f, 2.f);
set_values(scale_mem, scale_data);

if (is_wzp_test) {
auto zp_data = rg.generate_random_1d<uint8_t>(wzp_num, 0, 2);
set_values(dcomp_zp_mem, zp_data);
}

auto in_layout = is_dynamic ? layout{ ov::PartialShape{ -1, -1, -1 }, data_types::f16, format::bfyx }
: layout{ input_ps, data_types::f16, format::bfyx };

auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, 3, 2);
fc_prim.decompression_zero_point_scalar = 0;
auto dcomp_zp_name = is_wzp_test ? "wzp" : "";
auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 3, 2);

if (is_wzp_test) {
fc_prim.compressed_weights = true;
fc_prim.decompression_zero_point = is_wzp_test ? "wzp" : "";
}

// Implemented dynamic quantize kernel
auto get_ref_results = [&]() {
topology topology(
input_layout("input", in_layout),
data("weights", weights_mem),
data("scale", scale_mem),
fc_prim
);
topology topo;
topo.add(input_layout("input", in_layout));
topo.add(data("weights", weights_mem));
topo.add(data("scale", scale_mem));
topo.add(data("wzp", dcomp_zp_mem));
topo.add(fc_prim);

auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
config.set_property(ov::hint::dynamic_quantization_group_size(0));

network network(engine, topology, config);
network network(engine, topo, config);
network.set_input_data("input", input_mem);

auto outputs = network.execute();
Expand All @@ -2604,6 +2618,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
input_layout("input", in_layout),
data("weights", weights_mem),
data("scale", scale_mem),
data("wzp", dcomp_zp_mem),
fc_prim
);

Expand Down Expand Up @@ -3699,6 +3714,26 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_ca
this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560, 128, 64);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_no_wzp) {
    // Dynamic-shape run of the int4 dynamic-quantize helper with the
    // weight zero-point test flag switched off.
    const bool is_dynamic = true;
    const int batch = 320;
    const int ifm = 1024;
    const int ofm = 1024;
    const int quantize_group_size = 32;
    const int scales_group_size = 32;
    const bool is_wzp_test = false;

    this->test_compressed_int4_scale_dyn_quan_weight_i4(is_dynamic, batch, ifm, ofm,
                                                        quantize_group_size, scales_group_size,
                                                        is_wzp_test);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp) {
    // Dynamic-shape run with the weight zero-point test flag on; the scalar
    // flag keeps its default, so the helper sizes the zero-point buffer by ofm.
    const bool is_dynamic = true;
    const int batch = 320;
    const int ifm = 1024;
    const int ofm = 1024;
    const int group_size = 32;  // used for both the quantize and the scale group size
    const bool is_wzp_test = true;

    this->test_compressed_int4_scale_dyn_quan_weight_i4(is_dynamic, batch, ifm, ofm,
                                                        group_size, group_size,
                                                        is_wzp_test);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_scalar) {
    // Dynamic-shape run with a weight zero-point that holds a single value.
    // The scalar flag must be passed explicitly: it defaults to false, and
    // without it this test would just repeat the per-output-feature
    // zero-point case covered by compressed_int4_scale_dynamic_quantize_wzp.
    const bool is_dynamic = true;
    const int batch = 320;
    const int ifm = 1024;
    const int ofm = 1024;
    const int quantize_group_size = 32;
    const int scales_group_size = 32;
    const bool is_wzp_test = true;
    const bool is_wzp_scalar = true;  // helper allocates a one-element zero-point buffer

    this->test_compressed_int4_scale_dyn_quan_weight_i4(is_dynamic, batch, ifm, ofm,
                                                        quantize_group_size, scales_group_size,
                                                        is_wzp_test, is_wzp_scalar);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_128) {
    // Same weight zero-point scenario as the 32-wide variant, but with both
    // the quantize group size and the scale group size set to 128.
    const bool is_dynamic = true;
    const int batch = 320;
    const int ifm = 1024;
    const int ofm = 1024;
    const int group_size = 128;
    const bool is_wzp_test = true;

    this->test_compressed_int4_scale_dyn_quan_weight_i4(is_dynamic, batch, ifm, ofm,
                                                        group_size, group_size,
                                                        is_wzp_test);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_static) {
    // Weight zero-point scenario with is_dynamic = false, i.e. the helper
    // builds the input layout from the concrete input shape.
    const bool is_dynamic = false;
    const int batch = 320;
    const int ifm = 1024;
    const int ofm = 1024;
    const int quantize_group_size = 32;
    const int scales_group_size = 32;
    const bool is_wzp_test = true;

    this->test_compressed_int4_scale_dyn_quan_weight_i4(is_dynamic, batch, ifm, ofm,
                                                        quantize_group_size, scales_group_size,
                                                        is_wzp_test);
}

TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {
    // Invokes the shared compressed-scale-with-bias helper with its boolean
    // argument set to false.
    // NOTE(review): the helper's signature is not visible here — confirm what the flag selects.
    const bool helper_flag = false;
    this->test_compressed_scale_bias(helper_flag);
}
Expand Down

0 comments on commit e1c167a

Please sign in to comment.