ITS-Tracking: introduce multi-ROF seeding vertexer (AliceO2Group#13323)

* Add multi rof vertexer idea * More vertices is better than less * Improve tracklet validation and manage late vertices * Fix GPU compilation * Fix leak and bugs * Add vertices in both rofs * Fix rebasing * Fix non-deltaRof behaviour * Fix multiple iterations * Refactor tracklets and fix second iteration with multirof
noferini · Aug 21, 2024 · ba9e426 · ba9e426
1 parent 56ac000
commit ba9e426
Show file tree

Hide file tree

Showing 22 changed files with 524 additions and 343 deletions.
diff --git a/DataFormats/Detectors/Common/src/CTFHeader.cxx b/DataFormats/Detectors/Common/src/CTFHeader.cxx
@@ -18,7 +18,7 @@ using DetID = o2::detectors::DetID;
 /// describe itsel as a string
 std::string CTFHeader::describe() const
 {
-  return fmt::format("Run:{:07d} TF:{} Orbit:{:08d} CteationTime:{} Detectors: {}", run, tfCounter, firstTForbit, creationTime, DetID::getNames(detectors));
+  return fmt::format("Run:{:07d} TF:{} Orbit:{:08d} CreationTime:{} Detectors: {}", run, tfCounter, firstTForbit, creationTime, DetID::getNames(detectors));
 }
 
 void CTFHeader::print() const

diff --git a/DataFormats/Reconstruction/include/ReconstructionDataFormats/Vertex.h b/DataFormats/Reconstruction/include/ReconstructionDataFormats/Vertex.h
@@ -146,8 +146,8 @@ class Vertex : public VertexBase
   GPUd() void setChi2(float v) { mChi2 = v; }
   GPUd() float getChi2() const { return mChi2; }
 
-  GPUd() const Stamp& getTimeStamp() const { return mTimeStamp; }
-  GPUd() Stamp& getTimeStamp() { return mTimeStamp; }
+  GPUhd() const Stamp& getTimeStamp() const { return mTimeStamp; }
+  GPUhd() Stamp& getTimeStamp() { return mTimeStamp; }
   GPUd() void setTimeStamp(const Stamp& v) { mTimeStamp = v; }
 
  protected:

diff --git a/DataFormats/common/include/CommonDataFormat/TimeStamp.h b/DataFormats/common/include/CommonDataFormat/TimeStamp.h
@@ -28,7 +28,7 @@ class TimeStamp
   GPUhdDefault() TimeStamp() CON_DEFAULT;
   GPUhdDefault() ~TimeStamp() CON_DEFAULT;
   GPUdi() TimeStamp(T time) { mTimeStamp = time; }
-  GPUdi() T getTimeStamp() const { return mTimeStamp; }
+  GPUhdi() T getTimeStamp() const { return mTimeStamp; }
   GPUdi() void setTimeStamp(T t) { mTimeStamp = t; }
   GPUdi() bool operator==(const TimeStamp<T>& t) const { return mTimeStamp == t.mTimeStamp; }
 

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -210,15 +210,15 @@ class TimeFrameGPU : public TimeFrame
   /// interface
   int getNClustersInRofSpan(const int, const int, const int) const;
   IndexTableUtils* getDeviceIndexTableUtils() { return mIndexTableUtilsDevice; }
-  int* getDeviceROframesClusters(const int layer) { return mROframesClustersDevice[layer]; }
+  int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; }
   std::vector<std::vector<Vertex>>& getVerticesInChunks() { return mVerticesInChunks; }
   std::vector<std::vector<int>>& getNVerticesInChunks() { return mNVerticesInChunks; }
   std::vector<o2::its::TrackITSExt>& getTrackITSExt() { return mTrackITSExt; }
   std::vector<std::vector<o2::MCCompLabel>>& getLabelsInChunks() { return mLabelsInChunks; }
   int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones.
   StaticTrackingParameters<nLayers>* getDeviceTrackingParameters() { return mTrackingParamsDevice; }
   Vertex* getDeviceVertices() { return mVerticesDevice; }
-  int* getDeviceROframesPV() { return mROframesPVDevice; }
+  int* getDeviceROFramesPV() { return mROFramesPVDevice; }
   unsigned char* getDeviceUsedClusters(const int);
   const o2::base::Propagator* getChainPropagator();
 
@@ -251,10 +251,10 @@ class TimeFrameGPU : public TimeFrame
   // Device pointers
   StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
   IndexTableUtils* mIndexTableUtilsDevice;
-  std::array<int*, nLayers> mROframesClustersDevice;
+  std::array<int*, nLayers> mROFramesClustersDevice;
   std::array<unsigned char*, nLayers> mUsedClustersDevice;
   Vertex* mVerticesDevice;
-  int* mROframesPVDevice;
+  int* mROFramesPVDevice;
 
   // Hybrid pref
   std::array<Cluster*, nLayers> mClustersDevice;
@@ -314,7 +314,7 @@ size_t TimeFrameGPU<nLayers>::loadChunkData(const size_t chunk, const size_t off
 template <int nLayers>
 inline int TimeFrameGPU<nLayers>::getNClustersInRofSpan(const int rofIdstart, const int rofSpanSize, const int layerId) const
 {
-  return static_cast<int>(mROframesClusters[layerId][(rofIdstart + rofSpanSize) < mROframesClusters.size() ? rofIdstart + rofSpanSize : mROframesClusters.size() - 1] - mROframesClusters[layerId][rofIdstart]);
+  return static_cast<int>(mROFramesClusters[layerId][(rofIdstart + rofSpanSize) < mROFramesClusters.size() ? rofIdstart + rofSpanSize : mROFramesClusters.size() - 1] - mROFramesClusters[layerId][rofIdstart]);
 }
 } // namespace gpu
 } // namespace its

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -443,21 +443,21 @@ void TimeFrameGPU<nLayers>::initDevice(const int chunks,
       mMemChunks[iChunk].allocate(GpuTimeFrameChunk<nLayers>::computeRofPerChunk(mGpuParams, mAvailMemGB), mGpuStreams[iChunk]);
     }
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      checkGPUError(cudaMalloc(reinterpret_cast<void**>(&mROframesClustersDevice[iLayer]), mROframesClusters[iLayer].size() * sizeof(int)));
+      checkGPUError(cudaMalloc(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int)));
       checkGPUError(cudaMalloc(reinterpret_cast<void**>(&(mUsedClustersDevice[iLayer])), sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof));
     }
     checkGPUError(cudaMalloc(reinterpret_cast<void**>(&mVerticesDevice), sizeof(Vertex) * mGpuParams.maxVerticesCapacity));
-    checkGPUError(cudaMalloc(reinterpret_cast<void**>(&mROframesPVDevice), sizeof(int) * (mNrof + 1)));
+    checkGPUError(cudaMalloc(reinterpret_cast<void**>(&mROFramesPVDevice), sizeof(int) * (mNrof + 1)));
 
     mFirstInit = false;
   }
   if (maxLayers < nLayers) { // Vertexer
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      checkGPUError(cudaMemcpy(mROframesClustersDevice[iLayer], mROframesClusters[iLayer].data(), mROframesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
+      checkGPUError(cudaMemcpy(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
     }
   } else { // Tracker
     checkGPUError(cudaMemcpy(mVerticesDevice, mPrimaryVertices.data(), sizeof(Vertex) * mPrimaryVertices.size(), cudaMemcpyHostToDevice));
-    checkGPUError(cudaMemcpy(mROframesPVDevice, mROframesPV.data(), sizeof(int) * mROframesPV.size(), cudaMemcpyHostToDevice));
+    checkGPUError(cudaMemcpy(mROFramesPVDevice, mROFramesPV.data(), sizeof(int) * mROFramesPV.size(), cudaMemcpyHostToDevice));
     if (!iteration) {
       for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
         checkGPUError(cudaMemset(mUsedClustersDevice[iLayer], 0, sizeof(unsigned char) * mGpuParams.clustersPerROfCapacity * mNrof));

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -353,8 +353,8 @@ GPUg() void printTrackletsNotStrided(const Tracklet* t,
 // Compute the tracklets for a given layer
 template <int nLayers = 7>
 GPUg() void computeLayerTrackletsKernelSingleRof(
-  const int rof0,
-  const int maxRofs,
+  const short rof0,
+  const short maxRofs,
   const int layerIndex,
   const Cluster* clustersCurrentLayer,        // input data rof0
   const Cluster* clustersNextLayer,           // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
@@ -385,8 +385,8 @@ GPUg() void computeLayerTrackletsKernelSingleRof(
     if (usedClustersLayer[currentSortedIndex]) {
       continue;
     }
-    int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0;
-    int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF;
+    short minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0;
+    short maxRof = (rof0 == static_cast<short>(maxRofs - trkPars->DeltaROF)) ? rof0 : rof0 + trkPars->DeltaROF;
     const float inverseR0{1.f / currentCluster.radius};
     for (int iPrimaryVertex{0}; iPrimaryVertex < nVertices; iPrimaryVertex++) {
       const auto& primaryVertex{vertices[iPrimaryVertex]};
@@ -410,7 +410,7 @@ GPUg() void computeLayerTrackletsKernelSingleRof(
       }
       constexpr int tableSize{256 * 128 + 1}; // hardcoded for the time being
 
-      for (int rof1{minRof}; rof1 <= maxRof; ++rof1) {
+      for (short rof1{minRof}; rof1 <= maxRof; ++rof1) {
         if (!(roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { // number of clusters on next layer > 0
           continue;
         }
@@ -561,7 +561,7 @@ GPUg() void computeLayerTrackletsKernelMultipleRof(
                 const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
                 const size_t stride{currentClusterIndex * maxTrackletsPerCluster};
                 if (storedTracklets < maxTrackletsPerCluster) {
-                  new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast<ushort>(rof0), static_cast<ushort>(rof1)};
+                  new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast<short>(rof0), static_cast<short>(rof1)};
                 }
                 // else {
                 // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets);

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cu
@@ -178,7 +178,7 @@ GPUg() void trackleterKernelSingleRof(
   Tracklet* Tracklets,
   int* foundTracklets,
   const IndexTableUtils* utils,
-  const int rofId,
+  const short rofId,
   const size_t maxTrackletsPerCluster = 1e2)
 {
   const int phiBins{utils->getNphiBins()};
@@ -234,15 +234,15 @@ GPUg() void trackleterKernelMultipleRof(
   Tracklet* Tracklets,
   int* foundTracklets,
   const IndexTableUtils* utils,
-  const unsigned int startRofId,
-  const unsigned int rofSize,
+  const short startRofId,
+  const short rofSize,
   const float phiCut,
   const size_t maxTrackletsPerCluster = 1e2)
 {
   const int phiBins{utils->getNphiBins()};
   const int zBins{utils->getNzBins()};
-  for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) {
-    auto rof = iRof + startRofId;
+  for (auto iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) {
+    short rof = static_cast<short>(iRof) + startRofId;
     auto* clustersNextLayerRof = clustersNextLayer + (sizeNextLClusters[rof] - sizeNextLClusters[startRofId]);
     auto* clustersCurrentLayerRof = clustersCurrentLayer + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]);
     auto nClustersNextLayerRof = sizeNextLClusters[rof + 1] - sizeNextLClusters[rof];
@@ -273,9 +273,9 @@ GPUg() void trackleterKernelMultipleRof(
             if (o2::gpu::GPUCommonMath::Abs(smallestAngleDifference(currentCluster.phi, nextCluster.phi)) < phiCut) {
               if (storedTracklets < maxTrackletsPerCluster) {
                 if constexpr (Mode == TrackletMode::Layer0Layer1) {
-                  new (TrackletsRof + stride + storedTracklets) Tracklet{iNextLayerClusterIndex, iCurrentLayerClusterIndex, nextCluster, currentCluster, static_cast<int>(rof), static_cast<int>(rof)};
+                  new (TrackletsRof + stride + storedTracklets) Tracklet{iNextLayerClusterIndex, iCurrentLayerClusterIndex, nextCluster, currentCluster, rof, rof};
                 } else {
-                  new (TrackletsRof + stride + storedTracklets) Tracklet{iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, static_cast<int>(rof), static_cast<int>(rof)};
+                  new (TrackletsRof + stride + storedTracklets) Tracklet{iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, rof, rof};
                 }
                 ++storedTracklets;
               }
@@ -625,8 +625,8 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
         gpu::trackleterKernelMultipleRof<TrackletMode::Layer0Layer1><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),         // const Cluster* clustersNextLayer,    // 0 2
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
-          mTimeFrameGPU->getDeviceROframesClusters(0),                   // const int* sizeNextLClusters,
-          mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
+          mTimeFrameGPU->getDeviceROFramesClusters(0),                   // const int* sizeNextLClusters,
+          mTimeFrameGPU->getDeviceROFramesClusters(1),                   // const int* sizeCurrentLClusters,
           mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0),      // const int* nextIndexTables,
           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),        // Tracklet* Tracklets,
           mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets,
@@ -639,8 +639,8 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
         gpu::trackleterKernelMultipleRof<TrackletMode::Layer1Layer2><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2),         // const Cluster* clustersNextLayer,    // 0 2
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),         // const Cluster* clustersCurrentLayer, // 1 1
-          mTimeFrameGPU->getDeviceROframesClusters(2),                   // const int* sizeNextLClusters,
-          mTimeFrameGPU->getDeviceROframesClusters(1),                   // const int* sizeCurrentLClusters,
+          mTimeFrameGPU->getDeviceROFramesClusters(2),                   // const int* sizeNextLClusters,
+          mTimeFrameGPU->getDeviceROFramesClusters(1),                   // const int* sizeCurrentLClusters,
           mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2),      // const int* nextIndexTables,
           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),        // Tracklet* Tracklets,
           mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets,
@@ -653,8 +653,8 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
         gpu::trackletSelectionKernelMultipleRof<true><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
-          mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-          mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
+          mTimeFrameGPU->getDeviceROFramesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
+          mTimeFrameGPU->getDeviceROFramesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
           mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
@@ -686,8 +686,8 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
         gpu::trackletSelectionKernelMultipleRof<false><<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0),            // const Cluster* clusters0,               // Clusters on layer 0
           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1),            // const Cluster* clusters1,               // Clusters on layer 1
-          mTimeFrameGPU->getDeviceROframesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
-          mTimeFrameGPU->getDeviceROframesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
+          mTimeFrameGPU->getDeviceROFramesClusters(0),                      // const int* sizeClustersL0,              // Number of clusters on layer 0 per ROF
+          mTimeFrameGPU->getDeviceROFramesClusters(1),                      // const int* sizeClustersL1,              // Number of clusters on layer 1 per ROF
           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0),           // Tracklet* tracklets01,                  // Tracklets on layer 0-1
           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1),           // Tracklet* tracklets12,                  // Tracklets on layer 1-2
           mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0),    // const int* nFoundTracklets01,           // Number of tracklets found on layers 0-1
@@ -721,8 +721,8 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
         std::vector<bool> usedLines;
         for (int rofId{0}; rofId < rofs; ++rofId) {
           auto rof = offset + rofId;
-          auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF
-          auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof];     // number of clusters for this ROF
+          auto clustersL1offsetRof = mTimeFrameGPU->getROFrameClusters(1)[rof] - mTimeFrameGPU->getROFrameClusters(1)[offset]; // starting cluster offset for this ROF
+          auto nClustersL1Rof = mTimeFrameGPU->getROFrameClusters(1)[rof + 1] - mTimeFrameGPU->getROFrameClusters(1)[rof];     // number of clusters for this ROF
           auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof];                                                  // starting line offset for this ROF
           auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof;
           gsl::span<const o2::its::Line> linesInRof(lines.data() + linesOffsetRof, static_cast<gsl::span<o2::its::Line>::size_type>(nLinesRof));
@@ -754,7 +754,7 @@ void VertexerTraitsGPU::computeTracklets(const int iteration)
     int start{0};
     for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) {
       gsl::span<const Vertex> rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast<gsl::span<Vertex>::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])};
-      mTimeFrameGPU->addPrimaryVertices(rofVerts);
+      mTimeFrameGPU->addPrimaryVertices(rofVerts, rofId, 0);
       if (mTimeFrameGPU->hasMCinformation()) {
         // mTimeFrameGPU->getVerticesLabels().emplace_back();
         // TODO: add MC labels