[GPU] Fix gemm_tiled_opt kernel accuracy for the dynamic case with TILE_N=32 and transposed output shape
sshlyapn committed Apr 1, 2024
1 parent d384662 commit 46e504d
Showing 4 changed files with 67 additions and 16 deletions.
@@ -162,10 +162,15 @@ KERNEL(gemm_tiled_opt)(
#ifdef BIAS_TERM
const uint batch_offset_input2 = FUNC_CALL(get_input2_batch_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z);
#endif // BIAS_TERM
- uint write_id = 0;
+ uint y_write_id = 0;
+ uint x_write_id = 0;
const uint batch_offset_output = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X);
- write_id = 1;
- const uint batch_offset_output_diff = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X) - batch_offset_output;
+ y_write_id = 1;
+ x_write_id = 0;
+ const uint output_y_pitch = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X) - batch_offset_output;
+ y_write_id = 0;
+ x_write_id = 1;
+ const uint output_x_pitch = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR TR_B, TR_F, TR_W, TR_Z, TR_Y, TR_X) - batch_offset_output;

// Start pointers offsets
#if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
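Note on the hunk above: the jitted get_output_index expression now contains "(y+y_write_id)" and "(x+x_write_id)" (see the GetTransposedDims change further down), so evaluating it with (y_write_id, x_write_id) set to (0,0), (1,0) and (0,1) and subtracting the base offset yields the linear strides of one Y step (output_y_pitch) and one X step (output_x_pitch) in the possibly transposed output layout. A minimal host-side C++ sketch of that probing idea; output_index is a hypothetical stand-in for the jitted function, and the shapes mirror the new test case added below:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for the jitted get_output_index(): linearizes logical
    // GEMM coordinates (b, f, y, x) into a physical layout described by 'order',
    // where order[i] names the logical axis stored at physical position i.
    static uint32_t output_index(const std::array<uint32_t, 4>& logical_dims,
                                 const std::array<int, 4>& order,
                                 const std::array<uint32_t, 4>& coord) {
        uint32_t idx = 0;
        for (int i = 0; i < 4; ++i)
            idx = idx * logical_dims[order[i]] + coord[order[i]];
        return idx;
    }

    int main() {
        const std::array<uint32_t, 4> dims  = {1, 1, 128, 9};  // b, f, M, N as in the new test case
        const std::array<int, 4>      order = {0, 1, 3, 2};    // transposed output: N is no longer innermost
        const std::array<uint32_t, 4> at    = {0, 0, 5, 3};

        const uint32_t base    = output_index(dims, order, at);
        const uint32_t y_pitch = output_index(dims, order, {0, 0, 6, 3}) - base;  // one step along GEMM y
        const uint32_t x_pitch = output_index(dims, order, {0, 0, 5, 4}) - base;  // one step along GEMM x
        std::printf("y_pitch=%u x_pitch=%u\n", y_pitch, x_pitch);  // prints y_pitch=1 x_pitch=128
        return 0;
    }

With the previous single batch_offset_output_diff, the kernel effectively assumed the X stride was 1, which only holds while the output X axis stays innermost.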
@@ -424,7 +429,7 @@ KERNEL(gemm_tiled_opt)(
#endif // TILE_K > SIMD_WIDTH
}
}
- #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
+             #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
// Read A for next dot_id
#if TILE_K_NOT_DIVISIBLE
a_read = (dot_id + 1 < tile_m_iterations) ? TILE_K_NOT_DIVISIBLE_CALC ? a_ptr[sglid] : BLOCK_READ_A(a_ptr, 0) : 0;
@@ -732,7 +737,13 @@ KERNEL(gemm_tiled_opt)(
#endif // HAS_FUSED_OPS
}
#else
- OUTPUT_TYPE* d_ptr_tmp = d_ptr + sglid;
+ #if TRANSPOSE_OUTPUT == TRANSPOSE_X_LAST
+     const uint x_pitch = 1;
+ #else
+     const uint x_pitch = output_x_pitch;
+ #endif
+     OUTPUT_TYPE* d_ptr_tmp = d_ptr + sglid * x_pitch;
+
#ifdef BIAS_TERM
ACCUMULATOR_TYPE_VEC dequantized = (ACCUMULATOR_TYPE_VEC)(ALPHA) * c_tile[write_id] + TO_ACCUMULATOR_TYPE(BETA) * c_ptr[sglid];
#else // BIAS_TERM
@@ -743,13 +754,13 @@ KERNEL(gemm_tiled_opt)(
OUTPUT_TYPE_VEC result = FUSED_OPS_RESULT_VEC;
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
if (b_raw_global_id + SIMD_WIDTH * n_elem < N) {
- *(d_ptr_tmp + SIMD_WIDTH * n_elem) = result[n_elem];
+ *(d_ptr_tmp + SIMD_WIDTH * n_elem * x_pitch) = result[n_elem];
}
}
#else
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
if (b_raw_global_id + SIMD_WIDTH * n_elem < N) {
- *(d_ptr_tmp + SIMD_WIDTH * n_elem) = dequantized[n_elem];
+ *(d_ptr_tmp + SIMD_WIDTH * n_elem * x_pitch) = dequantized[n_elem];
}
}
#endif // HAS_FUSED_OPS
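In the leftover-N path above, each lane's store address is now scaled by x_pitch: with TRANSPOSE_X_LAST the pitch stays 1 and the writes remain contiguous, otherwise consecutive N elements land output_x_pitch elements apart. A small host-side C++ sketch of the addressing only; SIMD_WIDTH, B_VEC_SIZE, N and x_pitch are illustrative values, and the lane index stands in for the kernel's b_raw_global_id bounds check:

    #include <cstddef>
    #include <vector>

    int main() {
        constexpr std::size_t SIMD_WIDTH = 16;   // sub-group size assumed for the sketch
        constexpr std::size_t B_VEC_SIZE = 2;    // e.g. TILE_N = 32 -> two SIMD-wide chunks per row
        constexpr std::size_t N = 9;             // runtime N, smaller than TILE_N in the failing case
        const std::size_t x_pitch = 128;         // output_x_pitch when the output X axis is not innermost

        std::vector<float> out(SIMD_WIDTH * x_pitch, 0.0f);
        float* d_ptr = out.data();               // start of the current output row

        for (std::size_t lane = 0; lane < SIMD_WIDTH; ++lane) {               // one iteration per sub-group lane
            float* d_ptr_tmp = d_ptr + lane * x_pitch;                        // was: d_ptr + sglid
            for (std::size_t n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
                if (lane + SIMD_WIDTH * n_elem < N)                           // same shape of guard as the kernel's N check
                    *(d_ptr_tmp + SIMD_WIDTH * n_elem * x_pitch) = 1.0f;      // was: + SIMD_WIDTH * n_elem
            }
        }
        return 0;
    }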
@@ -796,7 +807,7 @@ KERNEL(gemm_tiled_opt)(
#endif // HAS_FUSED_OPS
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
#endif // IS_DYNAMIC
- d_ptr += batch_offset_output_diff;
+ d_ptr += output_y_pitch;
#ifdef BIAS_TERM
c_ptr += N;
#endif // BIAS_TERM
@@ -114,13 +114,17 @@ std::vector<std::string> GemmKernelBase::GetTransposedDims(const std::vector<int
break;
case 6:
if (is_tiled_opt) {
dim_ids.push_back("(y+write_id)");
dim_ids.push_back("(y+y_write_id)");
} else {
dim_ids.push_back("y");
}
break;
case 7:
dim_ids.push_back("x");
if (is_tiled_opt) {
dim_ids.push_back("(x+x_write_id)");
} else {
dim_ids.push_back("x");
}
break;
default:
break;
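The Y and X cases above are the last two entries of the transposed dim list, so for the tiled-opt kernel both now carry a write-id offset and the single jitted FUNC_CALL(get_output_index)(... TR_Y, TR_X) expression can be re-evaluated at (y, x), (y+1, x) and (y, x+1) purely by toggling y_write_id/x_write_id. A condensed C++ sketch of the resulting dim strings; the leading entries and driver code are illustrative, not the full switch:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        const bool is_tiled_opt = true;
        std::vector<std::string> dim_ids = {"b", "f", "w", "z"};     // leading dims, illustrative only
        dim_ids.push_back(is_tiled_opt ? "(y+y_write_id)" : "y");    // case 6: output Y placeholder
        dim_ids.push_back(is_tiled_opt ? "(x+x_write_id)" : "x");    // case 7: output X placeholder
        for (const auto& d : dim_ids)
            std::cout << d << ' ';   // prints: b f w z (y+y_write_id) (x+x_write_id)
        std::cout << '\n';
        return 0;
    }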
@@ -196,6 +196,12 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) const
MakeJitConstant("TR_X", GetTransposedDims(params.output_order, true).at(7)),
});

+ bool transpose_output = (params.output_order.size() > 0 && (params.output_order.back() != (static_cast<int>(params.output_order.size()) - 1)));
+ if (transpose_output)
+     jit.AddConstant(MakeJitConstant("TRANSPOSE_OUTPUT", 2 /* set as TRANSPOSE_OTHER */));
+ else
+     jit.AddConstant(MakeJitConstant("TRANSPOSE_OUTPUT", 0 /* set as TRANSPOSE_X_LAST */));
+
bool has_dynamic_k_padding = params.transpose_input0 ? params.inputs[0].Y().pad.is_dynamic
: params.inputs[0].X().pad.is_dynamic;
bool has_dynamic_n_padding = params.transpose_input1 ? params.inputs[1].Y().pad.is_dynamic
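The new predicate treats the output as transposed whenever the last entry of output_order is not the last logical axis; only then does the kernel take the output_x_pitch branch. A minimal sketch of the same check with a couple of worked inputs (plain int is used instead of the params' int64_t for brevity):

    #include <cstdio>
    #include <vector>

    // Mirrors the predicate above: the output counts as transposed whenever the
    // innermost (last) entry of output_order is not the last logical axis.
    static int transpose_output_mode(const std::vector<int>& output_order) {
        const bool transposed = !output_order.empty() &&
                                output_order.back() != static_cast<int>(output_order.size()) - 1;
        return transposed ? 2 /* TRANSPOSE_OTHER */ : 0 /* TRANSPOSE_X_LAST */;
    }

    int main() {
        std::printf("%d\n", transpose_output_mode({0, 1, 2, 3}));  // 0: X stays innermost, x_pitch stays 1
        std::printf("%d\n", transpose_output_mode({0, 1, 3, 2}));  // 2: transposed, kernel switches to output_x_pitch
        std::printf("%d\n", transpose_output_mode({}));            // 0: empty order means the default layout
        return 0;
    }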
42 changes: 36 additions & 6 deletions src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
@@ -927,7 +927,7 @@ class gemm_gpu_tests: public ::testing::Test {
ov::Shape ref_input1_broadcasted_shape;
ov::Shape ref_input1_shape;
ov::Shape ref_output_shape;

ref_input0_shape = { BATCH_SIZE, 16, M_SIZE, K_SIZE };
ref_input1_broadcasted_shape = { N_SIZE, BATCH_SIZE, 16, K_SIZE };
ref_input1_shape = { BATCH_SIZE, 16, K_SIZE, N_SIZE };
@@ -1063,7 +1063,7 @@ class gemm_gpu_tests: public ::testing::Test {
ov::Shape ref_input1_reshaped_shape;
ov::Shape ref_input1_shape;
ov::Shape ref_output_shape;

ref_input0_shape = { BATCH_SIZE, 32, M_SIZE, K_SIZE };
ref_input1_broadcasted_shape = { N_SIZE, BATCH_SIZE, 2, 16, K_SIZE };
ref_input1_reshaped_shape = { N_SIZE, BATCH_SIZE, 32, K_SIZE };
@@ -1313,16 +1313,22 @@ class gemm_gpu_tests: public ::testing::Test {
output_shape_default = { M_SIZE, N_SIZE };
} else if (num_dims == 3) {
input0_shape_default = { BATCH_SIZE, M_SIZE, K_SIZE };
input1_shape_default = { BATCH_SIZE, K_SIZE, N_SIZE };
output_shape_default = { BATCH_SIZE, M_SIZE, N_SIZE };
} else if (num_dims == 4) {
input0_shape_default = { BATCH_SIZE, 1, M_SIZE, K_SIZE};
input1_shape_default = { BATCH_SIZE, 1, K_SIZE, N_SIZE};
output_shape_default = { BATCH_SIZE, 1, M_SIZE, N_SIZE };
}
}

- void test_transpose_matmul_f32(size_t num_dims, bool is_input_dynamic, bool is_caching_test, std::vector<size_t> BMKN, std::vector<int64_t> input0_order, std::vector<int64_t> input1_order) {
+ void test_transpose_matmul_f32(size_t num_dims,
+                                bool is_input_dynamic,
+                                bool is_caching_test,
+                                std::vector<size_t> BMKN,
+                                std::vector<int64_t> input0_order,
+                                std::vector<int64_t> input1_order,
+                                std::vector<int64_t> output_order = {}) {
tests::random_generator rg;
rg.set_seed(GET_SUITE_NAME);

@@ -1337,6 +1343,7 @@ class gemm_gpu_tests: public ::testing::Test {
set_default_shapes(num_dims, BMKN, input0_shape_default, input1_shape_default, output_shape_default);
ov::Shape input0_shape(input0_shape_default.size());
ov::Shape input1_shape(input1_shape_default.size());
+ ov::Shape output_shape(output_shape_default.size());

for (size_t dim = 0; dim < input0_shape_default.size(); ++dim) {
input0_shape[input0_order[dim]] = input0_shape_default[dim];
@@ -1346,6 +1353,12 @@ class gemm_gpu_tests: public ::testing::Test {
input1_shape[input1_order[dim]] = input1_shape_default[dim];
}

+ if (!output_order.empty()) {
+     for (size_t dim = 0; dim < output_shape_default.size(); ++dim) {
+         output_shape[output_order[dim]] = output_shape_default[dim];
+     }
+ }
+
if (is_input_dynamic) {
input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f32, format::bfyx};
input1_layout = layout{ov::PartialShape::dynamic(input1_shape.size()), data_types::f32, format::bfyx};
@@ -1366,7 +1379,7 @@ class gemm_gpu_tests: public ::testing::Test {
topology topology;
topology.add(input_layout("input0", input0_layout),
input_layout("input1", input1_layout),
gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, {}, {}, {}, input0_order, input1_order)
gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f32, {}, {}, {}, {}, input0_order, input1_order, output_order)
);

ExecutionConfig config = get_test_default_config(engine);
@@ -1415,6 +1428,19 @@ class gemm_gpu_tests: public ::testing::Test {
false,
false);

+ if (!output_order.empty()) {
+     std::vector<float> out_data_transposed(ov::shape_size(output_shape_default));
+
+     ov::reference::transpose((const char *)(ref_out_data.data()),
+                              (char *)(out_data_transposed.data()),
+                              output_shape_default,
+                              sizeof(float),
+                              output_order,
+                              output_shape);
+
+     ref_out_data = out_data_transposed;
+ }
+
ASSERT_EQ(output_ptr.size(), ref_out_data.size());

const auto abs_error = 0.0001;
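When an output_order is given, the reference result (computed in the plain B, F, M, N layout) is permuted with ov::reference::transpose before the element-wise comparison, so the expected buffer matches the physically transposed GPU output. A standalone C++ sketch of an equivalent 4D permutation for order {0, 1, 3, 2}; it is illustrative only and not the ov::reference implementation, which works on raw bytes and arbitrary rank:

    #include <array>
    #include <cstddef>
    #include <vector>

    // Permute a dense 4D tensor: dst axis i takes src axis order[i], so
    // dst_shape[i] == src_shape[order[i]].
    static std::vector<float> transpose4d(const std::vector<float>& src,
                                          const std::array<std::size_t, 4>& src_shape,
                                          const std::array<std::size_t, 4>& order) {
        std::array<std::size_t, 4> dst_shape;
        for (std::size_t i = 0; i < 4; ++i)
            dst_shape[i] = src_shape[order[i]];

        std::vector<float> dst(src.size());
        std::array<std::size_t, 4> c{};  // source coordinate
        for (c[0] = 0; c[0] < src_shape[0]; ++c[0])
        for (c[1] = 0; c[1] < src_shape[1]; ++c[1])
        for (c[2] = 0; c[2] < src_shape[2]; ++c[2])
        for (c[3] = 0; c[3] < src_shape[3]; ++c[3]) {
            const std::size_t src_idx = ((c[0] * src_shape[1] + c[1]) * src_shape[2] + c[2]) * src_shape[3] + c[3];
            const std::size_t dst_idx = ((c[order[0]] * dst_shape[1] + c[order[1]]) * dst_shape[2] + c[order[2]]) * dst_shape[3] + c[order[3]];
            dst[dst_idx] = src[src_idx];
        }
        return dst;
    }

    int main() {
        // Shapes from the new test: plain reference result {1, 1, 128, 9}, output order {0, 1, 3, 2}.
        const std::array<std::size_t, 4> src_shape = {1, 1, 128, 9};
        const std::array<std::size_t, 4> order     = {0, 1, 3, 2};
        std::vector<float> ref(1 * 1 * 128 * 9, 0.25f);
        std::vector<float> transposed = transpose4d(ref, src_shape, order);  // shape {1, 1, 9, 128}
        return transposed.size() == ref.size() ? 0 : 1;
    }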
@@ -1614,6 +1640,10 @@ TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f32) {
this->test_transpose_matmul_f32(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
}

+ TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f32_n_tile_32_output_ylast) {
+     this->test_transpose_matmul_f32(4, true, false, /*BMKN*/{1, 128, 1, 9}, /*input0_order*/{0, 1, 2, 3}, /*input1_order*/{0, 1, 2, 3}, /*output_order*/{0, 1, 3, 2});
+ }
+
TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f16) {
this->test_transpose_matmul_f16(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
}
