cms-sw · cmsbuild · Feb 28, 2024 · Feb 26, 2024
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h
@@ -130,6 +130,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
 
   template <typename TrackerTraits>
   struct FindClus {
+    // assume that we can cover the whole module with up to 16 blockDimension-wide iterations
+    static constexpr uint32_t maxIterGPU = 16;
+
+    // this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size
+    static constexpr uint32_t maxElementsPerBlock =
+        cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);
+
     template <typename TAcc>
     ALPAKA_FN_ACC void operator()(const TAcc& acc,
                                   SiPixelDigisSoAView digi_view,
@@ -292,17 +299,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
 #endif
 
         [[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
-        // Assume that we can cover the whole module with up to 16 blockDimension-wide iterations
-        // This maxIter value was tuned for GPU, with 256 or 512 threads per block.
-        // Hence, also works for CPU case, with 256 or 512 elements per thread.
-        // Real constrainst is maxIter = hist.size() / blockDimension,
-        // with blockDimension = threadPerBlock * elementsPerThread.
-        // Hence, maxIter can be tuned accordingly to the workdiv.
-        constexpr unsigned int maxIterGPU = 16;
+        // assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations
         ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);
 
-        // NB: can be tuned.
-        constexpr uint32_t maxElements = cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? 256 : 1;
+        // compile-time upper bound on the number of elements per thread (the actual value is checked below)
+        constexpr uint32_t maxElements =
+            cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? maxElementsPerBlock : 1;
         ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));
 
         constexpr unsigned int maxIter = maxIterGPU * maxElements;

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc
@@ -641,25 +641,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
             cms::alpakatools::make_device_view(alpaka::getDev(queue), clusters_d->view().moduleStart(), 1u);
         alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement);
 
-        // TODO
-        // - we are fixing this here since it is used at compile time also in the kernel
-        // - put maxIter in the Geometry traits
-        constexpr auto threadsOrElementsFindClus = 256;
-
+        const auto elementsPerBlockFindClus = FindClus<TrackerTraits>::maxElementsPerBlock;
         const auto workDivMaxNumModules =
-            cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, threadsOrElementsFindClus);
-        // NB: With present FindClus() / chargeCut() algorithm,
-        // threadPerBlock (GPU) or elementsPerThread (CPU) = 256 show optimal performance.
-        // Though, it does not have to be the same number for CPU/GPU cases.
-
+            cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, elementsPerBlockFindClus);
 #ifdef GPU_DEBUG
-        std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << threadsOrElementsFindClus
+        std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus
                   << " threadsPerBlockOrElementsPerThread\n";
 #endif
-
         alpaka::exec<Acc1D>(
             queue, workDivMaxNumModules, FindClus<TrackerTraits>{}, digis_d->view(), clusters_d->view(), wordCounter);
-
 #ifdef GPU_DEBUG
         alpaka::wait(queue);
 #endif
@@ -740,14 +730,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
           cms::alpakatools::make_device_view(alpaka::getDev(queue), clusters_d->view().moduleStart(), 1u);
       alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement);
 
-      /// should be larger than maxPixInModule/16 aka (maxPixInModule/maxiter in the kernel)
-
-      const auto threadsPerBlockFindClus = 256;
-      const auto workDivMaxNumModules = cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, threadsPerBlockFindClus);
-
+      const auto elementsPerBlockFindClus = FindClus<TrackerTraits>::maxElementsPerBlock;
+      const auto workDivMaxNumModules =
+          cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, elementsPerBlockFindClus);
 #ifdef GPU_DEBUG
       alpaka::wait(queue);
-      std::cout << "FindClus kernel launch with " << numberOfModules << " blocks of " << threadsPerBlockFindClus
+      std::cout << "FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus
                 << " threadsPerBlockOrElementsPerThread\n";
 #endif
       alpaka::exec<Acc1D>(