[GPU] Optimize iGPU FC with prime number batch size (openvinotoolkit#24893)

### Details: Solve the iGPU FC low-performance issue when the FC batch size is not aligned with 2/4

- Desc: Sometimes the FC input shape is not aligned with 2/4; for example, ViT models use shapes such as 257x4096 or 577x4096.
With such an unaligned batch size, the iGPU performs FC very slowly: about 23 ms for 257x4096->257x1024 and
50 ms for 577x4096->577x1024.

- Root cause: When the FC batch size is not aligned with 2/4, the kernel does not choose the best TuneParams and falls back
to the default parameters, which leads to the worst performance.
See the figure below: EU active is only about 3.5% while XVE thread occupancy is almost 100%, and the global memory read
bandwidth is 77 GB/s, which is already at the HW bandwidth limit (~75 GB/s); this means that L3 cache utilization is too low.


![image](https://github.com/openvinotoolkit/openvino/assets/31196718/a9debd4e-bc77-45ac-9942-01813b0d61ab)


- Solution: If the FC batch size is not aligned with 2/4, we can still use tile_b=16 with dispatch_bsv==1 as the TuneParams,
which benefits from the higher ratio of GFLOPS to data read bandwidth (see the sketch after the results table below).

   - Test result on MTL:
   

![image](https://github.com/openvinotoolkit/openvino/assets/31196718/8c6b566c-8389-419f-836e-eaab29f8ef02)
FC 257x4096->257x1024: latency improved from 23ms to 0.9ms

Model | master | PR to opt
-- | -- | --
CLIP visual | 0.99 FPS | 13.00 FPS
ViT_B | 5.37 FPS | 20.40 FPS
ViT_L | 0.56 FPS | 4.91 FPS



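To make the root cause and the fix concrete, below is a minimal standalone C++ sketch (not the actual kernel-selector code) of the two checks involved: the original divisibility rule that rejects unaligned batch sizes such as 257 and 577, and the relaxed rule this PR adds for prime-like batches, mirroring the VerifyTuneParams change in the diff below. The simd = 16 value and the tile_b = 16 / dispatch_bsv = 4 aligned-path values are illustrative assumptions.

```cpp
#include <cstddef>
#include <cstdio>
#include <initializer_list>

constexpr std::size_t simd = 16;  // illustrative sub-group size

// Original rule: a tuned configuration is only accepted when the batch size
// divides evenly by tile_b * dispatch_bsv; otherwise the kernel falls back
// to the slow default parameters.
bool aligned_ok(std::size_t batch, std::size_t tile_b, std::size_t dispatch_bsv) {
    return batch % (tile_b * dispatch_bsv) == 0;
}

// Relaxed rule sketched from this PR (only for dispatch_bsv == 1): walk tile
// down from simd; if nothing larger than 1 divides the batch (e.g. a prime
// batch such as 257 or 577), tile_b = 16 is still allowed instead of falling back.
bool relaxed_ok(std::size_t batch) {
    if (batch == 1)
        return false;
    std::size_t tile = simd;
    while (batch % tile != 0)
        tile--;
    return tile == 1;
}

int main() {
    for (std::size_t batch : {256, 257, 577}) {
        std::printf("batch=%zu  aligned(tile_b=16, bsv=4)=%d  relaxed(tile_b=16, bsv=1)=%d\n",
                    batch, aligned_ok(batch, 16, 4), relaxed_ok(batch));
    }
    // 256: the aligned check passes; the relaxed check returns false because
    //      a tile larger than 1 (16) divides the batch.
    // 257 and 577: the aligned check fails for any tile_b * bsv >= 2, but the
    //      relaxed check returns true, keeping tile_b = 16 with dispatch_bsv = 1.
    return 0;
}
```

The point is that a prime batch size can never satisfy a divisibility-based tile configuration, so rejecting the tuned parameters outright only leaves the slow default path; accepting tile_b=16 with dispatch_bsv==1 keeps the tiled execution and its better GFLOPS-to-bandwidth ratio.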


### Tickets:
 - CVS-142833

---------

Co-authored-by: Chen Peter <[email protected]>
riverlijunjie and peterchen-intel authored Oct 23, 2024
1 parent 6458855 commit f4e8b82
Showing 2 changed files with 64 additions and 2 deletions.
@@ -293,8 +293,17 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params,
     size_t output_f = bf_size.second;

     auto batch_size = params.is_shape_agnostic ? Align(output_b, tparams.tile_b) : output_b;
-    if (batch_size % (tparams.tile_b * tparams.dispatch_bsv) != 0)
-        return false;
+    // If batch size is prime number, still can apply tile execution to avoid poor performance.
+    if (batch_size % (tparams.tile_b * tparams.dispatch_bsv) != 0) {
+        if ((tparams.dispatch_bsv != 1) || batch_size == 1)
+            return false;
+        size_t tile = simd;
+        while (batch_size % tile != 0)
+            tile--;
+        if (tile > 1)
+            return false;
+    }

     if (CeilDiv(output_f, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
         return false;

@@ -1227,6 +1227,59 @@ TEST(fully_connected_gpu, bf_tiled_with_pad) {
     }
 }

+TEST(fully_connected_gpu, bf_tiled_with_unaligned_batch) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+    // Test parameters with unaligned batch size.
+    const int batch_num = 17;
+    const int feature_num = 1;
+    const int input_x = 1;
+    const int input_y = 64;
+    const int output_y = input_y;
+
+    // Allocate memory
+    auto input_mem =
+        engine.allocate_memory({{batch_num, feature_num, input_y, input_x}, data_types::f16, format::bfyx});
+    auto weights_mem = engine.allocate_memory({{output_y, input_y}, data_types::f16, format::bfyx});
+
+    // Generate random input data and set values
+    auto input_data = rg.generate_random_4d<ov::float16>(batch_num, feature_num, input_y, input_x, -1, 1);
+    auto weights_data = rg.generate_random_4d<ov::float16>(output_y, input_y, 1, 1, -1, 1);
+
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+
+    std::vector<ov::float16> empty_bias(output_y, 0);
+    set_values(input_mem, input_data_bfyx);
+    set_values(weights_mem, weights_data_bfyx);
+    auto reference_output = dynamic_fully_connected_reference_calc<ov::float16>(batch_num * feature_num,
+                                                                                input_y,
+                                                                                output_y,
+                                                                                input_data_bfyx,
+                                                                                weights_data_bfyx,
+                                                                                empty_bias);
+    topology topology(input_layout("input", input_mem->get_layout()),
+                      data("weights", weights_mem),
+                      fully_connected("fc_prim", input_info("input"), "weights", "", 3, 3));
+
+    // Set data optimization to allow weights reordering to optimal format
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl};
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}}));
+
+    network network(engine, topology, config);
+    network.set_input_data("input", input_mem);
+
+    auto outputs = network.execute();
+    auto output_mem = outputs.at("fc_prim").get_memory();
+    cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+    ASSERT_EQ(output_mem->count(), batch_num * feature_num * output_y);
+
+    for (size_t i = 0; i < batch_num * feature_num * output_y; ++i) {
+        ASSERT_FLOAT_EQ(reference_output[i], output_ptr[i]) << " i = " << i;
+    }
+}

 TEST(fully_connected_gpu, DISABLED_fs_byx_fsv32_b34)
 {
