drive from service
VinInn committed May 15, 2022
1 parent 4743b8e commit e2d5802
Showing 6 changed files with 62 additions and 34 deletions.
9 changes: 9 additions & 0 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -16,6 +16,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -135,6 +136,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getU
bool configEnabled = config.getUntrackedParameter<bool>("enabled");
if (not configEnabled) {
edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
+ // enable cpu memory pool
+ memoryPool::cuda::init(true);
return;
}

@@ -361,6 +364,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getU
cms::cuda::getEventCache().clear();
cms::cuda::getStreamCache().clear();

+ // enable memory pool
+ memoryPool::cuda::init(false);

if (verbose_) {
log << '\n' << "CUDAService fully initialized";
}
@@ -381,6 +387,9 @@ CUDAService::~CUDAService() {
cms::cuda::getEventCache().clear();
cms::cuda::getStreamCache().clear();

+ // destroy cpu memory pool
+ memoryPool::cuda::shutdown();

for (int i = 0; i < numberOfDevices_; ++i) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
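Taken together, the CUDAService changes drive the pool lifetime from the service: the constructor creates a CPU-only pool when CUDA is disabled by configuration and the full set of pools otherwise, and the destructor shuts everything down after the event and stream caches are cleared. A minimal sketch of that lifecycle, using only the init()/shutdown() calls added here; the surrounding OwningService class is a hypothetical stand-in for illustration, not code from this commit.

#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

class OwningService {
public:
  explicit OwningService(bool cudaEnabled) {
    if (!cudaEnabled) {
      // No usable GPU: still provide the CPU pool for host-side clients.
      memoryPool::cuda::init(/*onlyCPU=*/true);
      return;
    }
    // GPU available: create the CPU, per-device and pinned-host pools.
    memoryPool::cuda::init(/*onlyCPU=*/false);
  }

  ~OwningService() {
    // Release all pooled memory before the CUDA runtime is torn down.
    memoryPool::cuda::shutdown();
  }
};
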
@@ -129,10 +129,10 @@ struct SimplePoolAllocatorImpl final : public SimplePoolAllocator {
SimplePoolAllocatorImpl(int maxSlots) : SimplePoolAllocator(maxSlots) {}

~SimplePoolAllocatorImpl() override {
- // garbageCollect();
- #ifdef MEMORY_POOL_DEBUG
+ garbageCollect();
+ //#ifdef MEMORY_POOL_DEBUG
dumpStat();
- #endif
+ //#endif
}

Pointer doAlloc(size_t size) override { return Traits::alloc(size); }
3 changes: 3 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
@@ -10,6 +10,9 @@
namespace memoryPool {
namespace cuda {

+ void init(bool onlyCPU = false);
+ void shutdown();

void dumpStat();

SimplePoolAllocator *getPool(Where where);
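The header now exposes the explicit lifecycle calls next to getPool(). A hedged caller-side sketch follows; it assumes the Where enumerators onCPU and onDevice used in cudaMemoryPool.cc are reachable as memoryPool::onCPU and memoryPool::onDevice, and the inspectPools() wrapper is purely illustrative.

#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include <cassert>

void inspectPools(bool haveGpu) {
  memoryPool::cuda::init(/*onlyCPU=*/!haveGpu);

  // After init() the pools exist; getPool() returns the allocator for a location.
  SimplePoolAllocator* cpu = memoryPool::cuda::getPool(memoryPool::onCPU);
  assert(cpu != nullptr);

  if (haveGpu) {
    SimplePoolAllocator* dev = memoryPool::cuda::getPool(memoryPool::onDevice);
    assert(dev != nullptr);
    memoryPool::cuda::dumpStat();  // prints device and host pool statistics
  }

  memoryPool::cuda::shutdown();
}
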
72 changes: 41 additions & 31 deletions HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
@@ -74,51 +74,61 @@ struct CudaHostAlloc : public CudaAlloc {

namespace {

- constexpr int poolSize = 128 * 1024;

- SimplePoolAllocatorImpl<PosixAlloc> cpuPool(poolSize);

- SimplePoolAllocatorImpl<CudaHostAlloc> hostPool(poolSize);

- struct DevicePools {
- using Pool = SimplePoolAllocatorImpl<CudaDeviceAlloc>;
- DevicePools(int size) {
- int devices = 0;
- auto status = cudaGetDeviceCount(&devices);
- if (status == cudaSuccess && devices > 0) {
- m_devicePools.reserve(devices);
- for (int i = 0; i < devices; ++i)
- m_devicePools.emplace_back(new Pool(size));
- }
- }
- //return pool for current device
- Pool &operator()() {
- int dev = -1;
- cudaGetDevice(&dev);
- return *m_devicePools[dev];
- }
+ // FIXME : move it in its own place
+ std::unique_ptr<SimplePoolAllocatorImpl<PosixAlloc>> cpuPool;

+ std::unique_ptr<SimplePoolAllocatorImpl<CudaHostAlloc>> hostPool;

- std::vector<std::unique_ptr<Pool>> m_devicePools;
- };
+ using DevicePool = SimplePoolAllocatorImpl<CudaDeviceAlloc>;
+ std::vector<std::unique_ptr<DevicePool>> devicePools;

- DevicePools devicePool(poolSize);
+ void initDevicePools(int size) {
+ int devices = 0;
+ auto status = cudaGetDeviceCount(&devices);
+ if (status == cudaSuccess && devices > 0) {
+ devicePools.reserve(devices);
+ for (int i = 0; i < devices; ++i)
+ devicePools.emplace_back(new DevicePool(size));
+ }
+ }

+ DevicePool *getDevicePool() {
+ int dev = -1;
+ cudaGetDevice(&dev);
+ return devicePools[dev].get();
+ }

} // namespace

namespace memoryPool {
namespace cuda {

+ void init(bool onlyCPU) {
+ constexpr int poolSize = 128 * 1024;
+ cpuPool = std::make_unique<SimplePoolAllocatorImpl<PosixAlloc>>(poolSize);
+ if (onlyCPU)
+ return;
+ initDevicePools(poolSize);
+ hostPool = std::make_unique<SimplePoolAllocatorImpl<CudaHostAlloc>>(poolSize);
+ }

+ void shutdown() {
+ cpuPool.reset();
+ devicePools.clear();
+ hostPool.reset();
+ }

void dumpStat() {
std::cout << "device pool" << std::endl;
- devicePool().dumpStat();
+ getDevicePool()->dumpStat();
std::cout << "host pool" << std::endl;
- hostPool.dumpStat();
+ hostPool->dumpStat();
}

SimplePoolAllocator *getPool(Where where) {
- return onCPU == where
- ? (SimplePoolAllocator *)(&cpuPool)
- : (onDevice == where ? (SimplePoolAllocator *)(&devicePool()) : (SimplePoolAllocator *)(&hostPool));
+ return onCPU == where ? (SimplePoolAllocator *)(cpuPool.get())
+ : (onDevice == where ? (SimplePoolAllocator *)(getDevicePool())
+ : (SimplePoolAllocator *)(hostPool.get()));
}

// allocate either on current device or on host (actually anywhere, not cuda specific)
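The heart of the refactor is replacing the eagerly constructed namespace-scope pools (and the DevicePools helper) with std::unique_ptr instances that init() fills and shutdown() destroys, so the owning service decides when pools are built and torn down instead of relying on static initialization and exit-time destruction order. A stand-alone illustration of that pattern under stated assumptions: FakePool, kSlots and the free init()/shutdown() functions below are placeholders, not the real allocator.

#include <memory>
#include <vector>

namespace {

  struct FakePool {
    explicit FakePool(int slots) : slots_(slots) {}
    int slots_;
  };

  // Before: FakePool cpuPool(kSlots); constructed at static-initialization time.
  // After: empty handles, filled in explicitly by init().
  std::unique_ptr<FakePool> cpuPool;
  std::vector<std::unique_ptr<FakePool>> devicePools;  // one per device

}  // namespace

void init(int nDevices) {
  constexpr int kSlots = 128 * 1024;
  cpuPool = std::make_unique<FakePool>(kSlots);
  devicePools.reserve(nDevices);
  for (int i = 0; i < nDevices; ++i)
    devicePools.emplace_back(std::make_unique<FakePool>(kSlots));
}

void shutdown() {
  // Explicit, ordered teardown instead of unspecified exit-time destruction.
  cpuPool.reset();
  devicePools.clear();
}
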
2 changes: 2 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu
@@ -29,6 +29,7 @@ int main() {
cudaStreamCreate(&(streams[i]));
}

+ memoryPool::cuda::init(false);
memoryPool::cuda::dumpStat();

auto& stream = streams[0];
@@ -94,6 +95,7 @@

cudaStreamSynchronize(stream);
memoryPool::cuda::dumpStat();
+ memoryPool::cuda::shutdown();

return 0;
}
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu
@@ -81,6 +81,8 @@ void go() {

#endif

+ memoryPool::cuda::init(false);

bool stop = false;
bool bin24 = false;
Thread monitor([&] {
@@ -216,6 +218,8 @@
cudaDeviceSynchronize();
std::cout << "\nfinished\n" << std::endl;
memoryPool::cuda::dumpStat();
+ std::cout << "\nshutdown\n" << std::endl;
+ memoryPool::cuda::shutdown();
}

#ifdef __CUDACC__
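The multi-threaded test now follows the same bracket: init(false) before any worker or monitor thread starts, shutdown() only after everything has joined and the device has synchronized. A hedged sketch of that ordering with a background statistics monitor; the monitor loop, its period and runWithMonitor() are illustrative and do not reproduce the test's Thread helper or workload.

#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

#include <cuda_runtime.h>

#include <atomic>
#include <chrono>
#include <thread>

void runWithMonitor() {
  memoryPool::cuda::init(false);

  std::atomic<bool> stop{false};
  std::thread monitor([&] {
    // Periodically dump pool statistics while the workers run.
    while (!stop.load()) {
      memoryPool::cuda::dumpStat();
      std::this_thread::sleep_for(std::chrono::seconds(5));
    }
  });

  // ... worker threads allocating and freeing through the pools go here ...

  stop = true;
  monitor.join();
  cudaDeviceSynchronize();       // all device-side users are done
  memoryPool::cuda::dumpStat();
  memoryPool::cuda::shutdown();  // tear the pools down last
}
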

0 comments on commit e2d5802
