[GPU] Exclude fused dependencies from GEMM's output shape canonicalization func (openvinotoolkit#22543)

Fix zp/scales data types conversion in FC bf tiled kernel
sshlyapn authored Feb 1, 2024
1 parent 02d7cb9 commit 3c6b8ca
Showing 5 changed files with 128 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/gemm.cpp
@@ -202,7 +202,7 @@ layout gemm_inst::transform_output_layout(const std::shared_ptr<const gemm> prim
auto N = input1_pshape[n_idx];

auto output_pshape = input_layouts[0].get_partial_shape();
-    for (size_t i = 0; i != input_layouts.size(); ++i) {
+    for (size_t i = 0; i != primitive->input_size(); ++i) {
auto input_pshape = input_layouts[i].get_partial_shape();
for (size_t j = 0; j != input_pshape.size(); ++j) {
ov::Dimension::merge(output_pshape[j], output_pshape[j], input_pshape[j]);
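Why the one-line gemm.cpp change matters: when an eltwise is fused into the GEMM node, the fused op's extra dependency is appended to input_layouts, so the old loop also merged that dependency's shape into the canonicalized output shape. The sketch below is a standalone C++ illustration (assuming OpenVINO core headers and library are available to build against; the shapes mirror the feature-broadcast case of the new CASE_GEMM_2IN_FP16_3D_1 test), showing how the extra merge wrongly pins a dynamic output dimension to 1, while bounding the loop by primitive->input_size() leaves it untouched.

// canonicalization_sketch.cpp -- illustrative only, not part of the commit
#include <openvino/core/partial_shape.hpp>
#include <iostream>
#include <vector>

int main() {
    // Node dependencies: the two real GEMM inputs (fully dynamic here) plus
    // the second input of a fused eltwise, broadcast over the feature dim.
    std::vector<ov::PartialShape> dep_shapes = {
        ov::PartialShape::dynamic(3),                              // input0
        ov::PartialShape::dynamic(3),                              // input1
        {ov::Dimension::dynamic(), 1, ov::Dimension::dynamic()}};  // fused dep
    const size_t gemm_input_size = 2;  // what primitive->input_size() returns

    // Old behaviour: every dependency layout participates in the merge.
    ov::PartialShape output = ov::PartialShape::dynamic(3);
    for (size_t i = 0; i != dep_shapes.size(); ++i)
        for (size_t j = 0; j != dep_shapes[i].size(); ++j)
            ov::Dimension::merge(output[j], output[j], dep_shapes[i][j]);
    std::cout << output << '\n';  // [?,1,?] -- feature dim pinned by the fused dep

    // Fixed behaviour: only the GEMM's own inputs participate.
    output = ov::PartialShape::dynamic(3);
    for (size_t i = 0; i != gemm_input_size; ++i)
        for (size_t j = 0; j != dep_shapes[i].size(); ++j)
            ov::Dimension::merge(output[j], output[j], dep_shapes[i][j]);
    std::cout << output << '\n';  // [?,?,?] -- output shape stays unconstrained
}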
@@ -55,6 +55,7 @@
#define TO_OUTPUT_VEC_TYPE(x) CAT(convert_, OUTPUT_VEC_TYPE)(x)
#define TO_ACTIVATION_VEC_TYPE(x) CAT(convert_, ACTIVATION_VEC_TYPE)(x)
#define TO_FILTER_VEC_TYPE(x) CAT(convert_, FILTER_VEC_TYPE)(x)
#define TO_ACCUMULATOR_VEC_TYPE(x) CAT(convert_, ACCUMULATOR_VEC_TYPE)(x)

#define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, TILE_IFM, ptr, offset)
#define FILTER_BLOCK_READ(ptr, offset) BLOCK_READN(FILTER_TYPE, TILE_K_OFM_PACKED, ptr, offset)
@@ -184,7 +185,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)(

#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1
#if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0
- ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f);
+ ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f));
#elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0
ACCUMULATOR_VEC_TYPE d_scale = 0;
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
@@ -201,7 +202,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)(

#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR
#if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0
- ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f);
+ ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f));
#elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0
ACCUMULATOR_VEC_TYPE d_zp = 0;
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
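The kernel-side fix (here and in the identical hunks below) replaces a block read typed with ACCUMULATOR_TYPE by a read typed with the buffer's real storage type plus an explicit TO_ACCUMULATOR_VEC_TYPE value conversion: when DECOMPRESSION_SCALE_TYPE or DECOMPRESSION_ZP_TYPE differs from ACCUMULATOR_TYPE, the old read mis-strided the buffer and reinterpreted raw bits instead of converting values. A minimal host-side C++ analogue of the difference (hypothetical data; this is not the kernel code):

// reinterpret_vs_convert.cpp -- illustrative analogue of the kernel bug
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
    // Scales stored as fp16 (the storage type), while accumulation runs in fp32.
    const uint16_t scale_bits[2] = {0x4000, 0x4400};  // fp16 values 2.0 and 4.0

    // Old behaviour's analogue: read the buffer as if it held fp32 elements --
    // two fp16 values get fused into one meaningless float (little-endian host).
    float reinterpreted;
    std::memcpy(&reinterpreted, scale_bits, sizeof(float));
    std::cout << "reinterpreted: " << reinterpreted << '\n';  // e.g. 528, not 2.0

    // Fixed behaviour's analogue: load with the storage type, then convert the
    // value, as TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, ...)) does.
    auto f16_to_f32 = [](uint16_t h) {  // normal numbers only, for brevity
        const uint32_t bits = ((h & 0x8000u) << 16) |
                              ((((h >> 10) & 0x1Fu) - 15u + 127u) << 23) |
                              ((h & 0x3FFu) << 13);
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    };
    std::cout << "converted: " << f16_to_f32(scale_bits[0]) << ", "
              << f16_to_f32(scale_bits[1]) << '\n';  // 2, 4
}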
@@ -52,7 +52,7 @@ inline void (FUNC_NAME)(

#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1
#if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0
- ACCUMULATOR_VEC_TYPE d_scale = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_scale, out_f);
+ ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f));
#elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0
ACCUMULATOR_VEC_TYPE d_scale = 0;
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
@@ -69,7 +69,7 @@ inline void (FUNC_NAME)(

#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR
#if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0
- ACCUMULATOR_VEC_TYPE d_zp = BLOCK_READN(ACCUMULATOR_TYPE, TILE_OFM, decompression_zp, out_f);
+ ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f));
#elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0
ACCUMULATOR_VEC_TYPE d_zp = 0;
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
48 changes: 48 additions & 0 deletions src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp
@@ -118,6 +118,7 @@ class GemmFusingTest : public ::BaseFusingTest<gemm_test_params> {
#define CASE_GEMM_2IN_FP16_3 { { 1, 1, 64, 64 }, { 1, 1, 64, 64 } }, { 1, 1, 64, 64 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx
#define CASE_GEMM_2IN_FP16_4 { { 1, 2, 128, 64 }, { 1, 2, 64, 256 } }, { 1, 2, 128, 256 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx
#define CASE_GEMM_2IN_FP16_5 { { 2, 3, 2, 2 }, { 2, 3, 2, 2 } }, { 2, 3, 2, 2 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx
#define CASE_GEMM_2IN_FP16_3D_1 { { 16, 8, 64 }, { 16, 64, 8 }, { 16, 1, 8 } }, { 16, 8, 8 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx
#define CASE_GEMM_2IN_FP16_5D_1 { { 2, 3, 5, 6, 4 }, { 2, 3, 5, 4, 6} }, { 2, 3, 5, 6, 6 }, data_types::f16, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx
#define CASE_GEMM_2IN_FP16_6D_1 { { 2, 3, 2, 3, 5, 7 }, { 2, 3, 2, 3, 7, 5 } }, { 2, 3, 2, 3, 5, 5 }, data_types::f16, data_types::f16, data_types::f16, format::bfwzyx, data_types::f16, format::bfwzyx

@@ -406,6 +407,53 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_add, ::testing::ValuesIn(std::vec
gemm_test_params{ CASE_GEMM_2IN_FP16_6D_1, 3, 4, "", broadcast_kinds::feature, eltwise_mode::sub },
}));

class gemm_2in_dynamic_add : public gemm_2in_add {};
TEST_P(gemm_2in_dynamic_add, add) {
auto p = GetParam();

if (engine.get_device_info().supports_immad)
p.expected_fused_primitives++;

cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true));
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true));

auto eltwise_layout = get_output_layout(p);
auto eltwise_shape = ov::PartialShape::dynamic(eltwise_layout.get_partial_shape().size());
if (p.broadcast_kind == broadcast_kinds::batch)
eltwise_shape[0] = 1;
else if (p.broadcast_kind == broadcast_kinds::feature)
eltwise_shape[1] = 1;
eltwise_layout.set_partial_shape(eltwise_shape);

auto in_layout0 = get_input_layout(p, 0);
auto in_layout1 = get_input_layout(p, 1);

auto in0_pshape = ov::PartialShape::dynamic(p.in_shapes[0].size());
in0_pshape[2] = p.in_shapes[0][2];
auto in1_pshape = ov::PartialShape::dynamic(p.in_shapes[1].size());
in1_pshape[1] = p.in_shapes[1][1];

in_layout0.set_partial_shape(in0_pshape);
in_layout1.set_partial_shape(in1_pshape);

create_topologies(
input_layout("input0", in_layout0),
input_layout("input1", in_layout1),
input_layout("input2", eltwise_layout),
gemm("gemm_prim", { input_info("input0"), input_info("input1") }, data_types::f32, false, false, 1.f, 0.f, in0_pshape.size(), in1_pshape.size()),
eltwise("add_prim", { input_info("gemm_prim"), input_info("input2") }, p.eltwise_m, p.default_type),
reorder("reorder_bfyx", input_info("add_prim"), p.default_format, data_types::f32)
);

tolerance = default_tolerance(p.default_type);
execute(p, true);
}

INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_dynamic_add, ::testing::ValuesIn(std::vector<gemm_test_params>{
gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::batch, eltwise_mode::sum },
gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::feature, eltwise_mode::sum },
}));

class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {};
TEST_P(gemm_2in_act_scale_quantize_i8, basic) {
// TODO: Fix me, refer PR(#15873)
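The gemm_2in_dynamic_add cases above are the regression test for the gemm.cpp change: with dynamic input ranks and a fused eltwise whose input is broadcast over the batch or feature dimension (CASE_GEMM_2IN_FP16_3D_1's third shape, { 16, 1, 8 }), the canonicalized GEMM output shape previously inherited the broadcast dimension, as in the shape-merge sketch after the gemm.cpp hunk.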
@@ -1131,6 +1131,75 @@ class fully_connected_gpu_tests: public ::testing::Test {
ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 5.0) << "i = " << i;
}

void test_compressed_int8_scale_zp_bias(bool is_caching_test) {
auto& engine = get_test_engine();

auto input_mem = engine.allocate_memory({ {1, 1, 2}, data_types::f16, format::bfyx });
auto weights_mem = engine.allocate_memory({ {32, 2}, data_types::u8, format::bfyx });
auto bias_mem = engine.allocate_memory({ {1, 1, 32}, data_types::f16, format::bfyx });
auto scale_mem = engine.allocate_memory({ {32, 1}, data_types::f32, format::bfyx });
auto zp_mem = engine.allocate_memory({ {32, 1}, data_types::f32, format::bfyx });

set_values<ov::float16>(input_mem, { -0.5f, 2.0f, 0.5f, 1.0f });
set_values<uint8_t>(weights_mem, { 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 0,
15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0,
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 0,
15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0 });

set_values<ov::float16>(bias_mem, { 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f,
2.0f, -1.0f, 4.0f, -3.0f, 6.0f, -5.0f, 8.0f, 3.0f,
0.0f, -3.0f, 2.0f, -5.0f, 4.0f, -5.0f, 6.0f, 1.0f,
1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 2.0f });
set_values<float>(scale_mem, { 2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f,
3.0f, 5.0f, -1.0f, -3.0f, 1.5f, 0.5f, 3.0f, 3.0f,
1.0f, 5.0f, -1.0f, -3.0f, -0.5f, -1.5f, 1.0f, 1.0f,
2.0f, 4.0f, -2.0f, -4.0f, 0.5f, -0.5f, 2.0f, 2.0f });
set_values<float>(zp_mem, { 1.0f, 2.0f, 2.0f, 1.0f, 4.0f, 1.0f, 6.0f, 2.0f,
2.0f, 3.0f, 1.0f, 0.0f, 3.0f, 1.0f, 2.0f, 2.0f,
1.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 6.0f, 1.0f,
1.0f, 1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 5.0f, 2.0f });

topology topology(
input_layout("input", input_mem->get_layout()),
data("weights", weights_mem),
data("bias", bias_mem),
data("scale", scale_mem),
data("zp", zp_mem),
fully_connected("fc_prim", input_info("input"), "weights", "bias", "scale", "zp", data_types::f16, padding(), 3, 2)
);

auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));

network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input_mem);

auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "fc_prim");

auto output_mem = outputs.begin()->second.get_memory();

cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());

ov::PartialShape expected_shape{1, 1, 32};
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

std::vector<ov::float16> expected_result = { 5.f, 12.f, -10.f, -48.f, 9.75f, -14.5f, 32.f, -19.f,
54.5f, 64.f, -9.f, -37.5f, 12.f, -3.f, 6.5f, -7.5f,
2.f, 14.5f, -4.5f, -38.f, -3.f, -30.5f, 18.5f, -8.f,
39.f, 62.f, -20.f, -32.f, 6.25f, -8.f, -3.f, -5.f, };

for (size_t i = 0; i < output_ptr.size(); i++) {
ASSERT_EQ(expected_result[i], output_ptr[i]) << "i = " << i;
}
}
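As a sanity check on the new expected values: assuming the usual per-output-channel weight decompression out[f] = scale[f] * sum_k (w[f][k] - zp[f]) * in[k] + bias[f] (my reading of the test, not code from the commit), the first outputs reproduce by hand, as in this standalone recomputation:

// expected_values_check.cpp -- recomputes the first three expected outputs
#include <iostream>

int main() {
    const float in[2]    = {-0.5f, 2.0f};                 // first two input values
    const float w[3][2]  = {{1, 2}, {3, 4}, {5, 6}};      // first three weight rows
    const float zp[3]    = {1.0f, 2.0f, 2.0f};
    const float scale[3] = {2.0f, 4.0f, -2.0f};
    const float bias[3]  = {1.0f, -2.0f, 3.0f};
    for (int f = 0; f < 3; ++f) {
        float acc = 0.0f;
        for (int k = 0; k < 2; ++k)
            acc += (w[f][k] - zp[f]) * in[k];
        // Prints 5, 12, -10 -- matching expected_result[0..2] above.
        std::cout << scale[f] * acc + bias[f] << '\n';
    }
}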

void test_compressed_scale_bias(bool is_caching_test) {
auto& engine = get_test_engine();

@@ -1229,7 +1298,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
ov::PartialShape expected_shape{2, 8};
ASSERT_EQ(expected_shape, output_mem->get_layout().get_partial_shape());

- std::vector<ov::float16> expected_result = {
+ std::vector<ov::float16> expected_result = {
ov::float16(18), ov::float16(84), ov::float16(-66), ov::float16(-116), ov::float16(19.5), ov::float16(-13.5), ov::float16(30), ov::float16(6),
ov::float16(-18), ov::float16(-84), ov::float16(66), ov::float16(116), ov::float16(-19.5), ov::float16(13.5), ov::float16(-30), ov::float16(-6) };

@@ -2679,6 +2748,10 @@ TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {
this->test_compressed_scale_bias(false);
}

TEST_F(fully_connected_gpu_tests, compressed_int8_scale_zp_bias) {
this->test_compressed_int8_scale_zp_bias(false);
}

TEST_F(fully_connected_gpu_tests, compressed_scale_bias_cached) {
this->test_compressed_scale_bias(true);
}