diff --git a/GPU/GPUTracking/Base/GPUParam.h b/GPU/GPUTracking/Base/GPUParam.h index d4edadc709b8d..2d0b978241a98 100644 --- a/GPU/GPUTracking/Base/GPUParam.h +++ b/GPU/GPUTracking/Base/GPUParam.h @@ -38,7 +38,6 @@ namespace gpu struct GPUSettingsRec; struct GPUSettingsGTP; struct GPURecoStepConfiguration; -struct GPUTPCClusterOccupancyMapBin; struct GPUParamSlice { float Alpha; // slice angle @@ -60,7 +59,7 @@ struct GPUParam_t { GPUTPCGeometry tpcGeometry; // TPC Geometry GPUTPCGMPolynomialField polynomialField; // Polynomial approx. of magnetic field for TPC GM - const GPUTPCClusterOccupancyMapBin* occupancyMap; // Ptr to TPC occupancy map + const unsigned int* occupancyMap; // Ptr to TPC occupancy map GPUParamSlice SliceParam[GPUCA_NSLICES]; @@ -104,7 +103,7 @@ struct GPUParam : public internal::GPUParam_t GPUd() float GetClusterError2(int yz, int type, float z, float angle2, float scaledMult, float scaledAvgCharge) const; GPUd() void GetClusterErrors2(char sector, int row, float z, float sinPhi, float DzDs, float time, float avgCharge, float& ErrY2, float& ErrZ2) const; GPUd() void UpdateClusterError2ByState(short clusterState, float& ErrY2, float& ErrZ2) const; - GPUd() float GetScaledMult(int iSlice, int iRow, float time) const; + GPUd() float GetScaledMult(float time) const; GPUd() void Slice2Global(int iSlice, float x, float y, float z, float* X, float* Y, float* Z) const; GPUd() void Global2Slice(int iSlice, float x, float y, float z, float* X, float* Y, float* Z) const; diff --git a/GPU/GPUTracking/Base/GPUParam.inc b/GPU/GPUTracking/Base/GPUParam.inc index 4df53e3ff6a75..52a6c76419f73 100644 --- a/GPU/GPUTracking/Base/GPUParam.inc +++ b/GPU/GPUTracking/Base/GPUParam.inc @@ -163,7 +163,7 @@ GPUdi() void MEM_LG(GPUParam)::GetClusterErrors2(char sector, int iRow, float z, float angleY2 = s2 * sec2; // dy/dx float angleZ2 = DzDs * DzDs * sec2; // dz/dx - float mult = time >= 0.f ? 
GetScaledMult(sector, iRow, time) / tpcGeometry.Row2X(iRow) : 0.f; + float mult = time >= 0.f ? GetScaledMult(time) / tpcGeometry.Row2X(iRow) : 0.f; ErrY2 = GetClusterError2(0, rowType, z, angleY2, mult, avgCharge); ErrZ2 = GetClusterError2(1, rowType, z, angleZ2, mult, avgCharge); @@ -191,14 +191,14 @@ GPUdi() void MEM_LG(GPUParam)::UpdateClusterError2ByState(short clusterState, fl } MEM_CLASS_PRE() -GPUdi() float MEM_LG(GPUParam)::GetScaledMult(int iSlice, int iRow, float time) const +GPUdi() float MEM_LG(GPUParam)::GetScaledMult(float time) const { #if !defined(__OPENCL__) || defined(__OPENCLCPP__) if (!occupancyMap) { return 0.f; } const unsigned int bin = CAMath::Max(0.f, time / rec.tpc.occupancyMapTimeBins); - return occupancyMap[bin].bin[iSlice][iRow] * rec.tpc.clusterErrorOccupancyScaler; + return occupancyMap[bin] * rec.tpc.clusterErrorOccupancyScaler; #else return 0.f; #endif diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 896b3b2ba77d6..fcdebe0951002 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -729,6 +729,15 @@ void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size) return retVal; } +void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device) +{ + if (device) { + return AllocateVolatileDeviceMemory(size); + } + mVolatileChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]); + return GPUProcessor::alignPointer(mVolatileChunks.back().get()); +} + void GPUReconstruction::ResetRegisteredMemoryPointers(GPUProcessor* proc) { for (unsigned int i = 0; i < mMemoryResources.size(); i++) { @@ -796,6 +805,12 @@ void GPUReconstruction::ReturnVolatileDeviceMemory() } } +void GPUReconstruction::ReturnVolatileMemory() +{ + ReturnVolatileDeviceMemory(); + mVolatileChunks.clear(); +} + void GPUReconstruction::PushNonPersistentMemory(unsigned long tag) { mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, 
mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag); diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 518f0af43aae4..dc629d86553e2 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -62,6 +62,7 @@ class GPUReconstruction std::shared_ptr mMyLib = nullptr; std::vector mMemoryResources; std::vector> mUnmanagedChunks; + std::vector> mVolatileChunks; std::vector> mChains; public: @@ -205,10 +206,12 @@ class GPUReconstruction void AllocateRegisteredForeignMemory(short res, GPUReconstruction* rec, GPUOutputControl* control = nullptr); void* AllocateUnmanagedMemory(size_t size, int type); void* AllocateVolatileDeviceMemory(size_t size); + void* AllocateVolatileMemory(size_t size, bool device); void FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom = false, bool freePermanent = false); void FreeRegisteredMemory(short res); void ClearAllocatedMemory(bool clearOutputs = true); void ReturnVolatileDeviceMemory(); + void ReturnVolatileMemory(); void PushNonPersistentMemory(unsigned long tag); void PopNonPersistentMemory(RecoStep step, unsigned long tag); void BlockStackedMemory(GPUReconstruction* rec); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 0a5064635b3a6..fca2c802a602e 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -375,7 +375,7 @@ unsigned int GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, return mNestedLoopOmpFactor; } -void GPUReconstructionCPU::UpdateParamOccupancyMap(const GPUTPCClusterOccupancyMapBin* mapHost, const GPUTPCClusterOccupancyMapBin* mapGPU, int stream) +void GPUReconstructionCPU::UpdateParamOccupancyMap(const unsigned int* mapHost, const unsigned int* mapGPU, int stream) { param().occupancyMap = mapHost; if (IsGPU()) { diff --git 
a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 6f72074b53f8f..387122d8358dd 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -136,7 +136,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernelsmResourceOccupancyMap, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcOccupancyMap)]); ReleaseEvent(mEvents->init); auto* ptr = doGPU ? mInputsShadow->mTPCClusterOccupancyMap : mInputsHost->mTPCClusterOccupancyMap; + auto* ptrTmp = (GPUTPCClusterOccupancyMapBin*)mRec->AllocateVolatileMemory(GPUTPCClusterOccupancyMapBin::getTotalSize(param()), doGPU); int streamOccMap = mRec->NStreams() - 1; - runKernel(GetGridAutoStep(streamOccMap, RecoStep::TPCSliceTracking), krnlRunRangeNone, {}, ptr, GPUTPCClusterOccupancyMapBin::getTotalSize(param())); - runKernel(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, streamOccMap), krnlRunRangeNone, krnlEventNone, ptr); - runKernel(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, streamOccMap), krnlRunRangeNone, {&mEvents->init}, ptr); + runKernel(GetGridAutoStep(streamOccMap, RecoStep::TPCSliceTracking), krnlRunRangeNone, {}, ptrTmp, GPUTPCClusterOccupancyMapBin::getTotalSize(param())); + runKernel(GetGridBlk(GPUCA_NSLICES * GPUCA_ROW_COUNT, streamOccMap), krnlRunRangeNone, krnlEventNone, ptrTmp); + runKernel(GetGridBlk(GPUTPCClusterOccupancyMapBin::getNBins(param()), streamOccMap), krnlRunRangeNone, krnlEventNone, ptrTmp, ptr); + mRec->ReturnVolatileMemory(); if (doGPU) { - TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap); + TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap, streamOccMap, &mEvents->init); } else { - TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap); + TransferMemoryResourceLinkToGPU(RecoStep::TPCSliceTracking, mInputsHost->mResourceOccupancyMap, 
streamOccMap, &mEvents->init); } mRec->UpdateParamOccupancyMap(mInputsHost->mTPCClusterOccupancyMap, mInputsShadow->mTPCClusterOccupancyMap, streamOccMap); } diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h index 9081773ddc205..d63b3e74cf220 100644 --- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h +++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h @@ -87,7 +87,7 @@ class GPUTrackingInputProvider : public GPUProcessor o2::tpc::ClusterNative* mPclusterNativeBuffer = nullptr; o2::tpc::ClusterNative* mPclusterNativeOutput = nullptr; - GPUTPCClusterOccupancyMapBin* mTPCClusterOccupancyMap = nullptr; + unsigned int* mTPCClusterOccupancyMap = nullptr; unsigned int* mErrorCodes = nullptr; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h index 82296b82a2196..5949e2ff6ad2b 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h @@ -215,7 +215,7 @@ class GPUTPCGMMerger : public GPUProcessor std::vector StreamerUncorrectedZY(int iSlice, int iRow, const GPUTPCGMTrackParam& track, const GPUTPCGMPropagator& prop) const; void DebugStreamerUpdate(int iTrk, int ihit, float xx, float yy, float zz, const GPUTPCGMMergedTrackHit& cluster, const o2::tpc::ClusterNative& clusterNative, const GPUTPCGMTrackParam& track, const GPUTPCGMPropagator& prop, const gputpcgmmergertypes::InterpolationErrorHit& interpolation, char rejectChi2, bool refit, int retVal) const; - static void DebugStreamerReject(float mAlpha, int iRow, float posY, float posZ, short clusterState, char rejectChi2, const gputpcgmmergertypes::InterpolationErrorHit& inter, bool refit, int retVal, float err2Y, float err2Z, const GPUTPCGMTrackParam& track, char sector, const GPUParam& param, float time, float avgCharge); + static void DebugStreamerReject(float mAlpha, int iRow, float posY, float posZ, short clusterState, char rejectChi2, const 
gputpcgmmergertypes::InterpolationErrorHit& inter, bool refit, int retVal, float err2Y, float err2Z, const GPUTPCGMTrackParam& track, const GPUParam& param, float time, float avgCharge); #endif GPUdi() int SliceTrackInfoFirst(int iSlice) const { return mSliceTrackInfoIndex[iSlice]; } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index 80dc82b1ea8a6..2a3b49417c748 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -331,11 +331,12 @@ void GPUTPCGMMerger::DebugRefitMergedTrack(const GPUTPCGMMergedTrack& track) con std::vector GPUTPCGMMerger::StreamerOccupancyBin(int iSlice, int iRow, float time) const { - std::vector retVal(5); + std::vector retVal(1 + 2 * Param().rec.tpc.occupancyMapTimeBinsAverage); #ifdef DEBUG_STREAMER const int bin = CAMath::Max(0.f, time / Param().rec.tpc.occupancyMapTimeBins); - for (int i = 0; i < 5; i++) { - retVal[i] = (bin - 2 + i >= 0 && bin - 2 + i < GPUTPCClusterOccupancyMapBin::getNBins(Param())) ? Param().occupancyMap[bin - 2 + i].bin[iSlice][iRow] : 0; + for (int i = 0; i < 1 + 2 * Param().rec.tpc.occupancyMapTimeBinsAverage; i++) { + const int mybin = bin + i - Param().rec.tpc.occupancyMapTimeBinsAverage; /* absolute time bin of window slot i; bins outside [0, nBins) are reported as 0 */ + retVal[i] = (mybin >= 0 && mybin < GPUTPCClusterOccupancyMapBin::getNBins(Param())) ? 
Param().occupancyMap[mybin] : 0; } #endif return retVal; @@ -375,10 +376,10 @@ void GPUTPCGMMerger::DebugStreamerUpdate(int iTrk, int ihit, float xx, float yy, #endif } -void GPUTPCGMMerger::DebugStreamerReject(float mAlpha, int iRow, float posY, float posZ, short clusterState, char rejectChi2, const gputpcgmmergertypes::InterpolationErrorHit& inter, bool refit, int retVal, float err2Y, float err2Z, const GPUTPCGMTrackParam& track, char sector, const GPUParam& param, float time, float avgCharge) +void GPUTPCGMMerger::DebugStreamerReject(float mAlpha, int iRow, float posY, float posZ, short clusterState, char rejectChi2, const gputpcgmmergertypes::InterpolationErrorHit& inter, bool refit, int retVal, float err2Y, float err2Z, const GPUTPCGMTrackParam& track, const GPUParam& param, float time, float avgCharge) { #ifdef DEBUG_STREAMER - float scaledMult = (time >= 0.f ? param.GetScaledMult(sector, iRow, time) / param.tpcGeometry.Row2X(iRow) : 0.f); + float scaledMult = (time >= 0.f ? param.GetScaledMult(time) / param.tpcGeometry.Row2X(iRow) : 0.f); o2::utils::DebugStreamer::instance()->getStreamer("debug_InterpolateReject", "UPDATE") << o2::utils::DebugStreamer::instance()->getUniqueTreeName("tree_InterpolateReject").data() << "mAlpha=" << mAlpha << "iRow=" << iRow diff --git a/GPU/GPUTracking/Merger/GPUTPCGMPropagator.cxx b/GPU/GPUTracking/Merger/GPUTPCGMPropagator.cxx index d7445e5fb27f3..0c3f723bc43c8 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMPropagator.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMPropagator.cxx @@ -664,7 +664,7 @@ GPUd() int GPUTPCGMPropagator::Update(float posY, float posZ, int iRow, const GP } else { int retVal = InterpolateReject(param, posY, posZ, clusterState, rejectChi2, inter, err2Y, err2Z); GPUCA_DEBUG_STREAMER_CHECK(if (o2::utils::DebugStreamer::checkStream(o2::utils::StreamFlags::streamRejectCluster, iTrk)) { - GPUTPCGMMerger::DebugStreamerReject(mAlpha, iRow, posY, posZ, clusterState, rejectChi2, *inter, refit, retVal, err2Y, err2Z, *mT, 
sector, param, time, avgCharge); + GPUTPCGMMerger::DebugStreamerReject(mAlpha, iRow, posY, posZ, clusterState, rejectChi2, *inter, refit, retVal, err2Y, err2Z, *mT, param, time, avgCharge); }); if (retVal) { return retVal; diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx index 9bf1322e72ed5..84c6c65a8a7b9 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.cxx @@ -36,33 +36,21 @@ GPUdii() void GPUTPCCreateOccupancyMap::Thread(i } template <> -GPUdii() void GPUTPCCreateOccupancyMap::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors, GPUTPCClusterOccupancyMapBin* GPUrestrict() map) +GPUdii() void GPUTPCCreateOccupancyMap::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors, GPUTPCClusterOccupancyMapBin* GPUrestrict() map, unsigned int* GPUrestrict() output) { GPUParam& GPUrestrict() param = processors.param; - const int iSliceRow = iBlock * nThreads + iThread; - if (iSliceRow > GPUCA_ROW_COUNT * GPUCA_NSLICES) { - return; - } - static constexpr unsigned int FOLD_BINS_BEEFORE_AFTER = 2; - static constexpr unsigned int FOLD_BINS = FOLD_BINS_BEEFORE_AFTER * 2 + 1; - const unsigned int iSlice = iSliceRow / GPUCA_ROW_COUNT; - const unsigned int iRow = iSliceRow % GPUCA_ROW_COUNT; - const unsigned int nBins = GPUTPCClusterOccupancyMapBin::getNBins(param); - if (nBins < FOLD_BINS) { + const unsigned int bin = iBlock * nThreads + iThread; + if (bin >= GPUTPCClusterOccupancyMapBin::getNBins(param)) { return; } - unsigned short lastVal[FOLD_BINS_BEEFORE_AFTER]; - unsigned int sum = (FOLD_BINS_BEEFORE_AFTER + 1) * map[0].bin[iSlice][iRow]; - for (unsigned int i = 0; i < FOLD_BINS_BEEFORE_AFTER; i++) { - sum += map[i + 1].bin[iSlice][iRow]; - 
lastVal[i] = map[0].bin[iSlice][iRow]; - } - unsigned int lastValIndex = 0; - for (unsigned int i = 0; i < nBins; i++) { - unsigned short useLastVal = lastVal[lastValIndex]; - lastVal[lastValIndex] = map[i].bin[iSlice][iRow]; - map[i].bin[iSlice][iRow] = sum / FOLD_BINS; - sum += map[CAMath::Min(i + FOLD_BINS_BEEFORE_AFTER + 1, nBins - 1)].bin[iSlice][iRow] - useLastVal; - lastValIndex = lastValIndex < FOLD_BINS_BEEFORE_AFTER - 1 ? lastValIndex + 1 : 0; + int binmin = CAMath::Max(0, bin - param.rec.tpc.occupancyMapTimeBinsAverage); + int binmax = CAMath::Min(GPUTPCClusterOccupancyMapBin::getNBins(param), bin + param.rec.tpc.occupancyMapTimeBinsAverage + 1); + unsigned int sum = 0; + for (int i = binmin; i < binmax; i++) { + for (int iSliceRow = 0; iSliceRow < GPUCA_NSLICES * GPUCA_ROW_COUNT; iSliceRow++) { + sum += (&map[i].bin[0][0])[iSliceRow]; + } } + sum /= binmax - binmin; + output[bin] = sum; } diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h index 33a0d18b92a30..0cfd5dd45d9b3 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h +++ b/GPU/GPUTracking/SliceTracker/GPUTPCCreateOccupancyMap.h @@ -30,8 +30,8 @@ class GPUTPCCreateOccupancyMap : public GPUKernelTemplate fill = 0, fold = 1 }; GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCSliceTracking; } - template - GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, GPUTPCClusterOccupancyMapBin* map); + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors, Args... 
args); }; } // namespace GPUCA_NAMESPACE::gpu diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index ec97247ee6c79..8bdb543230ab6 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -48,7 +48,7 @@ o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, globalTracks1" "= TPCMERG o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, globalTracks2" "= TPCMERGER" NO single char parameter) o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, borderTracks" "= TPCMERGER" NO single char parameter) o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fill" "= TPCOCCUPANCY" LB simple GPUTPCClusterOccupancyMapBin* map) -o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fold" "= TPCOCCUPANCY" LB simple GPUTPCClusterOccupancyMapBin* map) +o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fold" "= TPCOCCUPANCY" LB simple GPUTPCClusterOccupancyMapBin* map "unsigned int*" output) o2_gpu_add_kernel("GPUTPCGMMergerTrackFit" "GPUTPCGMMergerGPU TPCMERGER TPCTRACKER MATLUT TPCDEDX" LB simple int mode) o2_gpu_add_kernel("GPUTPCGMMergerFollowLoopers" "GPUTPCGMMergerGPU TPCMERGER TPCTRACKER MATLUT" LB simple) o2_gpu_add_kernel("GPUTPCGMMergerUnpackResetIds" "GPUTPCGMMergerGPU TPCMERGER" LB simple int iSlice)