Skip to content

Commit

Permalink
BUG fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn committed May 8, 2022
1 parent 3ae45f7 commit 849da8c
Show file tree
Hide file tree
Showing 9 changed files with 32 additions and 18 deletions.
20 changes: 14 additions & 6 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ namespace memoryPool {
SimplePoolAllocator *getPool(Where where);

// allocate either on current device or on host
inline std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool);
/* inline */ std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool);

// schedule free
inline void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool);
/* inline */ void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool);

template <typename T>
auto copy(buffer<T> &dst, buffer<T> const &src, uint64_t size, cudaStream_t stream) {
Expand All @@ -31,9 +31,17 @@ namespace memoryPool {
}

struct CudaDeleterBase : public DeleterBase {
CudaDeleterBase(cudaStream_t const &stream, Where where) : DeleterBase(getPool(where)), m_stream(stream) {}

CudaDeleterBase(cudaStream_t const &stream, SimplePoolAllocator *pool) : DeleterBase(pool), m_stream(stream) {}
CudaDeleterBase(cudaStream_t const &stream, Where where) : DeleterBase(getPool(where)), m_stream(stream) {
// if (stream) return;
// std::cout << "0 stream???" << std::endl;
// throw std::bad_alloc();
}

CudaDeleterBase(cudaStream_t const &stream, SimplePoolAllocator *pool) : DeleterBase(pool), m_stream(stream) {
// if (stream) return;
// std::cout << "0 stream???" << std::endl;
// throw std::bad_alloc();
}

~CudaDeleterBase() override = default;

Expand Down Expand Up @@ -83,4 +91,4 @@ namespace memoryPool {
} // namespace cuda
} // namespace memoryPool

#include "cudaMemoryPoolImpl.h"
// #include "cudaMemoryPoolImpl.h"
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace {
auto message = cudaGetErrorString(status);
std::cout << " error " << error << ": " << message << std::endl;
}
// std::cout << "free callaback" << std::endl;
// std::cout << "free callaback for stream " << streamId << std::endl;
auto payload = (memoryPool::Payload *)(p);
memoryPool::scheduleFree(payload);
}
Expand All @@ -29,7 +29,7 @@ namespace {

struct CudaAlloc {
static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) {
// std::cout << "schedule free" << std::endl;
// std::cout << "schedule free for stream " << stream <<std::endl;
if (stream)
cudaCheck(cudaStreamAddCallback(stream, freeCallback, payload,0));
// cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
Expand Down Expand Up @@ -74,14 +74,14 @@ namespace memoryPool {
SimplePoolAllocator *getPool(Where where);

// allocate either on current device or on host (actually anywhere, not cuda specific)
inline std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool) {
/*inline*/ std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool) {
int i = pool.alloc(size);
void *p = pool.pointer(i);
return std::pair<void *, int>(p, i);
}

// schedule free
inline void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool) {
/*inline*/ void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool) {
auto payload = new Payload{&pool, std::move(buckets)};
CudaHostAlloc::scheduleFree(payload, stream);
}
Expand Down
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAUtilities/interface/memoryPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace memoryPool {
if (!me)
throw std::bad_alloc();
if(!p) std::cout << "delete null pointer!!! " << m_bucket << std::endl;
if (m_bucket<0) std::cout << "delete with negative bucket!!!" << std::endl;
// assert(p == pool()->pointer(m_bucket));
(*me)(m_bucket);
}
Expand Down
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h"

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ void go() {
kernel_test<<<1, 128, 0, stream>>>(n, dp, me);

// better sync each "event"
cudaStreamSynchronize(stream);
// cudaStreamSynchronize(stream);
#else
// do something???
for (int k = 0; k < n; ++k) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv,
fast_fit_resultsGPU.get());
cudaCheck(cudaGetLastError());
}

#ifdef GPU_DEBUG
cudaDeviceSynchronize();
cudaCheck(cudaGetLastError());
#endif
} // loop on concurrent fits
}
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,6 @@ template <>
void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
int32_t nhits = hh.nHits();

isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};

#ifdef NTUPLE_DEBUG
std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
#endif
Expand All @@ -147,7 +145,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
#endif

// in principle we can use "nhits" to heuristically dimension the workspace...
memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(nullptr, memoryPool::onDevice));
memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
device_isOuterHitOfCell_ = memoryPool::cuda::make_buffer<GPUCACell::OuterHitOfCellContainer>(std::max(1, nhits),deleter);
assert(device_isOuterHitOfCell_.get());

Expand Down Expand Up @@ -325,6 +323,9 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
cudaCheck(cudaGetLastError());
}
#ifdef GPU_DEBUG
//std::cout << "sync stream " << cudaStream << std::endl;
//cudaStreamSynchronize(cudaStream);
//std::cout << "sync stream done " << cudaStream << std::endl;
cudaDeviceSynchronize();
cudaCheck(cudaGetLastError());
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t
device_theCellNeighbors_ = memoryPool::cuda::make_buffer<caConstants::CellNeighborsVector>(1,deleter);
device_theCellTracks_ = memoryPool::cuda::make_buffer<caConstants::CellTracksVector>(1,deleter);

// #ifdef GPU_DEBUG
#ifdef GPU_DEBUG
std::cout << "Allocation for tuple building. N hits " << nHits
<< ((where==memoryPool::onDevice) ? " on GPU" : " on CPU")<<std::endl;
// #endif
#endif

nHits++; // storage requires one more counter;
assert(nHits > 0);
Expand Down
2 changes: 1 addition & 1 deletion RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e
bsHost->betaStar = bs.betaStar();

BeamSpotCUDA bsDevice(ctx.stream());
cudaMemcpyAsync(bsDevice.data(), bsHost.get(), sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, ctx.stream());
cudaCheck(cudaMemcpyAsync(bsDevice.data(), bsHost.get(), sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, ctx.stream()));

ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice));
}
Expand Down

0 comments on commit 849da8c

Please sign in to comment.