diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index b26b11ce97df6a..be41f5faef3272 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -293,8 +293,17 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params,
     size_t output_f = bf_size.second;
 
     auto batch_size = params.is_shape_agnostic ? Align(output_b, tparams.tile_b) : output_b;
-    if (batch_size % (tparams.tile_b * tparams.dispatch_bsv) != 0)
-        return false;
+    // If the batch size is a prime number, tiled execution can still be applied to avoid poor performance.
+    if (batch_size % (tparams.tile_b * tparams.dispatch_bsv) != 0) {
+        if ((tparams.dispatch_bsv != 1) || batch_size == 1)
+            return false;
+        size_t tile = simd;
+        while (batch_size % tile != 0)
+            tile--;
+        if (tile > 1)
+            return false;
+    }
+
     if (CeilDiv(output_f, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
         return false;
 
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index dde1b6215148b3..eed9760348f669 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -1227,6 +1227,59 @@ TEST(fully_connected_gpu, bf_tiled_with_pad) {
     }
 }
 
+TEST(fully_connected_gpu, bf_tiled_with_unaligned_batch) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+    // Test parameters with an unaligned batch size.
+    const int batch_num = 17;
+    const int feature_num = 1;
+    const int input_x = 1;
+    const int input_y = 64;
+    const int output_y = input_y;
+
+    // Allocate memory
+    auto input_mem =
+        engine.allocate_memory({{batch_num, feature_num, input_y, input_x}, data_types::f16, format::bfyx});
+    auto weights_mem = engine.allocate_memory({{output_y, input_y}, data_types::f16, format::bfyx});
+
+    // Generate random input data and set values
+    auto input_data = rg.generate_random_4d<ov::float16>(batch_num, feature_num, input_y, input_x, -1, 1);
+    auto weights_data = rg.generate_random_4d<ov::float16>(output_y, input_y, 1, 1, -1, 1);
+
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+
+    std::vector<ov::float16> empty_bias(output_y, 0);
+    set_values(input_mem, input_data_bfyx);
+    set_values(weights_mem, weights_data_bfyx);
+    auto reference_output = dynamic_fully_connected_reference_calc<ov::float16>(batch_num * feature_num,
+                                                                                input_y,
+                                                                                output_y,
+                                                                                input_data_bfyx,
+                                                                                weights_data_bfyx,
+                                                                                empty_bias);
+    topology topology(input_layout("input", input_mem->get_layout()),
+                      data("weights", weights_mem),
+                      fully_connected("fc_prim", input_info("input"), "weights", "", 3, 3));
+
+    // Set data optimization to allow weights reordering to optimal format
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl};
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}}));
+
+    network network(engine, topology, config);
+    network.set_input_data("input", input_mem);
+
+    auto outputs = network.execute();
+    auto output_mem = outputs.at("fc_prim").get_memory();
+    cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+    ASSERT_EQ(output_mem->count(), batch_num * feature_num * output_y);
+
+    for (size_t i = 0; i < batch_num * feature_num * output_y; ++i) {
+        ASSERT_FLOAT_EQ(reference_output[i], output_ptr[i]) << " i = " << i;
+    }
+}
 
 TEST(fully_connected_gpu, DISABLED_fs_byx_fsv32_b34) {
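
Note for reviewers: the relaxed check in VerifyTuneParams only accepts an unaligned configuration when dispatch_bsv is 1 and the batch size has no divisor in (1, simd], which is exactly what the new 17-batch test exercises. Below is a minimal standalone sketch of that decision, assuming the kernel's SIMD width of 16; the helper name is illustrative and not part of the patch.

```cpp
#include <cstddef>
#include <cstdio>

// Sketch of the relaxed batch-size check (illustrative only, not the kernel code).
// Assumes simd == 16, matching fully_connected_kernel_bf_tiled.
static bool allows_unaligned_batch(std::size_t batch_size, std::size_t tile_b,
                                    std::size_t dispatch_bsv, std::size_t simd = 16) {
    if (batch_size % (tile_b * dispatch_bsv) == 0)
        return true;                 // batch already aligned to the tiling parameters
    if (dispatch_bsv != 1 || batch_size == 1)
        return false;                // only bsv == 1 configurations may be relaxed
    std::size_t tile = simd;
    while (batch_size % tile != 0)
        tile--;                      // largest divisor of batch_size that is <= simd
    return tile == 1;                // no small divisor (e.g. a prime like 17): keep the tiled impl
}

int main() {
    // 17 is prime, so the tiled kernel is still allowed despite the misalignment.
    std::printf("batch 17: %d\n", (int)allows_unaligned_batch(17, 8, 1));
    // 18 has a small divisor (2), so this unaligned configuration is rejected
    // in favor of one whose dispatch_bsv divides the batch.
    std::printf("batch 18: %d\n", (int)allows_unaligned_batch(18, 8, 1));
    return 0;
}
```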