Skip to content

Commit

Permalink
Merge pull request #44082 from fwyzard/fix_FindClus_block_size_140x
Browse files Browse the repository at this point in the history
Correct the FindClus block size [14.0.x]
  • Loading branch information
cmsbuild authored Feb 28, 2024
2 parents c7f69db + 85c769b commit 7874d41
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {

template <typename TrackerTraits>
struct FindClus {
// assume that we can cover the whole module with up to 16 blockDimension-wide iterations
static constexpr uint32_t maxIterGPU = 16;

// this must be larger than maxPixInModule / maxIterGPU, and should be a multiple of the warp size
static constexpr uint32_t maxElementsPerBlock =
cms::alpakatools::round_up_by(TrackerTraits::maxPixInModule / maxIterGPU, 128);

template <typename TAcc>
ALPAKA_FN_ACC void operator()(const TAcc& acc,
SiPixelDigisSoAView digi_view,
Expand Down Expand Up @@ -292,17 +299,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelClustering {
#endif

[[maybe_unused]] const uint32_t blockDimension = alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[0u];
// Assume that we can cover the whole module with up to 16 blockDimension-wide iterations
// This maxIter value was tuned for GPU, with 256 or 512 threads per block.
// Hence, it also works for the CPU case, with 256 or 512 elements per thread.
// The real constraint is maxIter = hist.size() / blockDimension,
// with blockDimension = threadPerBlock * elementsPerThread.
// Hence, maxIter can be tuned according to the workdiv.
constexpr unsigned int maxIterGPU = 16;
// assume that we can cover the whole module with up to maxIterGPU blockDimension-wide iterations
ALPAKA_ASSERT_ACC((hist.size() / blockDimension) < maxIterGPU);

// NB: can be tuned.
constexpr uint32_t maxElements = cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? 256 : 1;
// number of elements per thread
constexpr uint32_t maxElements =
cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? maxElementsPerBlock : 1;
ALPAKA_ASSERT_ACC((alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u] <= maxElements));

constexpr unsigned int maxIter = maxIterGPU * maxElements;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -641,25 +641,15 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
cms::alpakatools::make_device_view(alpaka::getDev(queue), clusters_d->view().moduleStart(), 1u);
alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement);

// TODO
// - we are fixing this here since it is used at compile time also in the kernel
// - put maxIter in the Geometry traits
constexpr auto threadsOrElementsFindClus = 256;

const auto elementsPerBlockFindClus = FindClus<TrackerTraits>::maxElementsPerBlock;
const auto workDivMaxNumModules =
cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, threadsOrElementsFindClus);
// NB: With the present FindClus() / chargeCut() algorithms,
// threadPerBlock (GPU) or elementsPerThread (CPU) = 256 shows optimal performance,
// though it does not have to be the same number for the CPU and GPU cases.

cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, elementsPerBlockFindClus);
#ifdef GPU_DEBUG
std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << threadsOrElementsFindClus
std::cout << " FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus
<< " threadsPerBlockOrElementsPerThread\n";
#endif

alpaka::exec<Acc1D>(
queue, workDivMaxNumModules, FindClus<TrackerTraits>{}, digis_d->view(), clusters_d->view(), wordCounter);

#ifdef GPU_DEBUG
alpaka::wait(queue);
#endif
Expand Down Expand Up @@ -740,14 +730,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
cms::alpakatools::make_device_view(alpaka::getDev(queue), clusters_d->view().moduleStart(), 1u);
alpaka::memcpy(queue, nModules_Clusters_h, moduleStartFirstElement);

/// should be larger than maxPixInModule/16, i.e. maxPixInModule/maxIter in the kernel

const auto threadsPerBlockFindClus = 256;
const auto workDivMaxNumModules = cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, threadsPerBlockFindClus);

const auto elementsPerBlockFindClus = FindClus<TrackerTraits>::maxElementsPerBlock;
const auto workDivMaxNumModules =
cms::alpakatools::make_workdiv<Acc1D>(numberOfModules, elementsPerBlockFindClus);
#ifdef GPU_DEBUG
alpaka::wait(queue);
std::cout << "FindClus kernel launch with " << numberOfModules << " blocks of " << threadsPerBlockFindClus
std::cout << "FindClus kernel launch with " << numberOfModules << " blocks of " << elementsPerBlockFindClus
<< " threadsPerBlockOrElementsPerThread\n";
#endif
alpaka::exec<Acc1D>(
Expand Down

0 comments on commit 7874d41

Please sign in to comment.