import from vin repo

cms-sw · May 4, 2022 · f37385a · f37385a
1 parent 4ac34bc
commit f37385a
Show file tree

Hide file tree

Showing 8 changed files with 823 additions and 0 deletions.
diff --git a/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h b/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h
@@ -0,0 +1,216 @@
+#pragma once
+
+#include<atomic>
+#include<array>
+#include<memory>
+#include<algorithm>
+#include<cassert>
+#include<mutex>
+#include <vector>
+#include <cstdint>
+#include<iostream>
+#include<chrono>
+
+namespace poolDetails {
+
+  /// Index of the smallest power-of-two bucket able to hold `s` bytes,
+  /// i.e. the smallest b such that s <= 2^b.
+  /// Sizes 0 and 1 both map to bucket 0 (one byte): the count-leading-zeros
+  /// builtin is undefined for a zero argument, so they are handled explicitly.
+  constexpr int bucket(uint64_t s) {
+    // the "ll" flavour is 64 bits wide on every data model
+    return s <= 1 ? 0 : 64 - __builtin_clzll(s - 1);
+  }
+
+  /// Size in bytes of bucket `b` (2^b); `b` must be in [0, 63].
+  constexpr uint64_t bucketSize(int b) { return uint64_t{1} << b; }
+
+}  // namespace poolDetails
+
+
+/// Pool of power-of-two sized memory blocks, addressed by integer slot index.
+///
+/// At most `maxSlots` slots exist. Each slot records its bucket (log2 of the
+/// block size) and the pointer obtained from doAlloc(). A slot is claimed by
+/// switching its m_used flag from false to true with compare_exchange_strong;
+/// the other per-slot arrays are plain vectors that are written after the
+/// flag has been acquired.
+///
+/// free() only marks a slot as reusable: memory is handed back to the backend
+/// (doFree) exclusively by garbageCollect(). The destructor does not call
+/// doFree on slots that still hold memory.
+///
+/// m_last is a per-slot debugging tag recording the last transition:
+///   -2 never touched, 2 created in a new slot, 1 created in a collected slot,
+///    0 reused as is, -1 released by free(), -3 emptied (garbage collected or
+///    backend allocation failed).
+class SimplePoolAllocator {
+
+public:
+
+  using Pointer = void *;
+
+
+  virtual ~SimplePoolAllocator() = default;
+
+  /// Backend hook: return `size` bytes, or nullptr on failure.
+  virtual Pointer doAlloc(size_t size) =0;
+  /// Backend hook: release a pointer previously returned by doAlloc().
+  virtual void doFree(Pointer ptr) = 0;
+
+  /// @param maxSlots maximum number of slots the pool will ever manage.
+  explicit SimplePoolAllocator(int maxSlots) : m_maxSlots(maxSlots) {
+    // slots start out "used" so that scans skip a slot whose index has been
+    // handed out by m_size but whose block has not been created yet
+    for ( auto & p : m_used) p = true;
+  }
+
+  /// Number of slots handed out so far (never exceeds maxSlots).
+  int size() const { return m_size;}
+
+  /// Memory block held by slot `i` (nullptr if the slot is empty).
+  Pointer pointer(int i) const { return m_slots[i]; }
+
+  /// Mark slot `i` as reusable; the memory stays cached in the pool.
+  void free(int i) {
+    m_last[i] = -1;
+    m_used[i]=false;
+  }
+
+  /// Obtain a slot able to hold `s` bytes.
+  /// If the first attempt fails, all cached blocks are released with
+  /// garbageCollect() and the allocation is tried once more.
+  /// @return the slot index, or -1 if no slot could be provided.
+  int alloc(uint64_t s) {
+    auto i = allocImpl(s);
+
+    //test garbage
+    // if(totBytes>4507964512) garbageCollect();
+
+    if (i>=0) {
+       assert(m_used[i]);
+       if (nullptr==m_slots[i]) std::cout << "race ??? " << i << ' ' << m_bucket[i] << ' ' << m_last[i] << std::endl;
+       assert(m_slots[i]);
+       return i;
+    }
+    garbageCollect();
+    i =  allocImpl(s);
+    if (i>=0) { assert(m_used[i]); assert(m_slots[i]);assert(m_last[i]>=0);}
+    return i;
+  }
+
+  /// Single allocation attempt, in order: reuse a cached block of the right
+  /// bucket, re-create a block in an emptied slot, open a brand new slot.
+  /// @return the slot index, or -1 on failure.
+  int allocImpl(uint64_t s) {
+    auto b = poolDetails::bucket(s);
+    assert(s<=poolDetails::bucketSize(b));
+    int ls = size();
+    // look for an existing slot
+    for (int i=0; i<ls; ++i) {
+      if (b!=m_bucket[i]) continue;
+      if (m_used[i]) continue;
+      bool exp = false;
+      if (m_used[i].compare_exchange_strong(exp,true)) {
+        // verify if in the mean time the garbage collector did operate
+        if(nullptr == m_slots[i]) {
+          assert(m_bucket[i]<0);
+          m_used[i] = false;
+          continue;
+        }
+        m_last[i] = 0;
+        return i;
+      }
+    }
+
+    // try to create in existing slot (if garbage has been collected)
+    ls = useOld(b);
+    if (ls>=0) return ls;
+
+    // try to allocate a new slot: reserve the index with a CAS loop so that
+    // m_size never grows beyond m_maxSlots (every scan is bounded by size()
+    // and would otherwise index past the end of the per-slot vectors)
+    ls = m_size.load();
+    do {
+      if (ls>=m_maxSlots) return -1;
+    } while (!m_size.compare_exchange_weak(ls,ls+1));
+    m_last[ls] = 2;
+    return createAt(ls,b);
+  }
+
+  /// Allocate a block of bucket `b` in slot `ls`, which the caller must have
+  /// claimed already (m_used true, m_last > 0, no block attached).
+  /// @return `ls`, or -1 if the backend allocation failed.
+  int createAt(int ls, int b) {
+    assert(m_used[ls]);
+    assert(m_last[ls]>0);
+    m_bucket[ls]=b;
+    auto as = poolDetails::bucketSize(b);
+    assert(nullptr==m_slots[ls]);
+    m_slots[ls] = doAlloc(as);
+    if (nullptr == m_slots[ls]) {
+      // put the slot back in the "emptied" state, otherwise it would stay
+      // claimed forever and could never be retried
+      m_bucket[ls] = -1;
+      m_last[ls] = -3;
+      m_used[ls] = false;
+      return -1;
+    }
+    totBytes+=as;
+    nAlloc++;
+    return ls;
+  }
+
+  /// Return every cached (not in use) block to the backend and leave the
+  /// corresponding slots empty (bucket -1, null pointer).
+  void garbageCollect() {
+    int ls = size();
+    for (int i=0; i<ls; ++i) {
+      if (m_used[i]) continue;
+      if (m_bucket[i]<0) continue;
+      bool exp = false;
+      if (!m_used[i].compare_exchange_strong(exp,true)) continue;
+      assert(m_used[i]);
+      if( nullptr != m_slots[i]) {
+        assert(m_bucket[i]>=0);
+        doFree(m_slots[i]);
+        nFree++;
+        totBytes-= poolDetails::bucketSize(m_bucket[i]);
+      }
+      m_slots[i] = nullptr;
+      m_bucket[i] = -1;
+      m_last[i] = -3;
+      m_used[i] = false; // here memory fence as well
+    }
+  }
+
+
+  /// Look for an emptied slot and create a block of bucket `b` in it.
+  /// @return the slot index, or -1 if none was found or the backend failed.
+  int useOld(int b) {
+    int ls = size();
+    for (int i=0; i<ls; ++i) {
+      if ( m_bucket[i]>=0) continue;
+      if (m_used[i]) continue;
+      bool exp = false;
+      if (!m_used[i].compare_exchange_strong(exp,true)) continue;
+      if( nullptr != m_slots[i]) { // ops allocated and freed
+        // somebody filled and released the slot between the checks above
+        // and the claim: give it back untouched
+        assert(m_bucket[i]>=0);
+        assert(m_last[i] == -1);
+        m_used[i] = false;
+        continue;
+      }
+      assert(m_used[i]);
+      m_last[i] = 1;
+      return createAt(i,b);
+    }
+    return -1;
+  }
+
+  /// Print slot count, cached bytes, backend alloc/free counters and the
+  /// number and total size of the slots currently in use to std::cout.
+  void dumpStat() const {
+   uint64_t fn=0;
+   uint64_t fs=0;
+   int ls = size();
+   for (int i=0; i<ls; ++i) {
+      if (m_used[i]) {
+        auto b = m_bucket[i];
+        if (b<0) continue;
+        fn++;
+        fs += poolDetails::bucketSize(b);
+      }
+   }
+   std::cout << "# slots " << size() << '\n'
+              << "# bytes " << totBytes << '\n'
+              << "# alloc " << nAlloc << '\n'
+              << "# free " << nFree << '\n'
+              << "# used " << fn << ' ' << fs << '\n'
+              << std::endl;
+  }
+
+
+private:
+
+  const int m_maxSlots;
+
+  // debugging tag of the last transition of each slot (see class comment)
+  std::vector<int> m_last = std::vector<int>(m_maxSlots,-2);
+
+
+  // bucket index of each slot, -1 when the slot holds no block
+  std::vector<int> m_bucket = std::vector<int>(m_maxSlots,-1);
+  // block attached to each slot, nullptr when empty
+  std::vector<Pointer> m_slots = std::vector<Pointer>(m_maxSlots,nullptr);
+  // ownership flag of each slot
+  std::vector<std::atomic<bool>> m_used = std::vector<std::atomic<bool>>(m_maxSlots);
+  // number of slot indices handed out so far
+  std::atomic<int> m_size=0;
+
+  // bytes currently held from the backend, and backend call counters
+  std::atomic<uint64_t> totBytes = 0;
+  std::atomic<uint64_t> nAlloc = 0;
+  std::atomic<uint64_t> nFree = 0;
+
+};
+
+
+/// Concrete pool whose backend is chosen at compile time: `Traits` must
+/// provide static `alloc(size_t)` and `free(Pointer)` functions, to which
+/// doAlloc() and doFree() forward.
+template<typename Traits>
+struct SimplePoolAllocatorImpl final : public SimplePoolAllocator {
+
+  SimplePoolAllocatorImpl(int maxSlots)
+    : SimplePoolAllocator(maxSlots)
+  {}
+
+  ~SimplePoolAllocatorImpl() override = default;
+
+  // obtain `size` bytes from the Traits backend
+  Pointer doAlloc(size_t size) override {
+    return Traits::alloc(size);
+  }
+
+  // hand `ptr` back to the Traits backend
+  void doFree(Pointer ptr) override {
+    Traits::free(ptr);
+  }
+
+};
+
+
+#include <cstdlib>
+/// Allocation traits backed by the C heap (malloc / free).
+struct PosixAlloc {
+
+  using Pointer = void *;
+
+  // returns nullptr when malloc fails
+  static Pointer alloc(size_t size) {
+    Pointer block = std::malloc(size);
+    return block;
+  }
+
+  // qualified call: the C library free, not this member
+  static void free(Pointer ptr) {
+    std::free(ptr);
+  }
+
+};
+
+
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
@@ -0,0 +1,86 @@
+#pragma once
+#include "memoryPool.h"
+#include <vector>
+
+// only for cudaStream_t 
+#include<cuda.h>
+
+
+namespace memoryPool {
+  namespace cuda {
+
+    // Defined elsewhere; presumably prints the statistics of the pools
+    // (NOTE(review): confirm against the implementation file).
+    void dumpStat();
+
+    // Pool associated with the requested location. The result is stored by
+    // the deleters below and dereferenced by make_unique().
+    SimplePoolAllocator * getPool(Where where);
+
+    // allocate either on current device or on host
+    // Returns {pointer, slot index}; make_unique() treats a negative index
+    // as an allocation failure and hands the index to the deleter otherwise.
+    std::pair<void *,int> alloc(uint64_t size, SimplePoolAllocator & pool);
+
+    // schedule free
+    // `buckets` holds the slot indices (as returned by alloc) to give back to
+    // `pool`; the deleters below pass the stream they were constructed with.
+    void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator & pool);
+
+
+    /// DeleterBase that additionally remembers the CUDA stream handed to
+    /// free() by the concrete deleters.
+    struct CudaDeleterBase : public DeleterBase {
+
+      // pool given explicitly
+      CudaDeleterBase(cudaStream_t const & stream, SimplePoolAllocator * pool)
+        : DeleterBase(pool), m_stream(stream) {}
+
+      // pool looked up from the requested location
+      CudaDeleterBase(cudaStream_t const & stream, Where where)
+        : CudaDeleterBase(stream, getPool(where)) {}
+
+      ~CudaDeleterBase() override = default;
+
+      cudaStream_t m_stream;
+
+    };
+
+    /// Deleter that schedules the release of a single slot as soon as it is
+    /// invoked.
+    struct DeleteOne final : public CudaDeleterBase {
+
+      using CudaDeleterBase::CudaDeleterBase;
+
+      ~DeleteOne() override = default;
+
+      void operator()(int bucket) override {
+        // free() takes a list of slots: wrap the single index
+        std::vector<int> single{bucket};
+        free(m_stream, std::move(single), *pool());
+      }
+
+    };
+
+    /// Deleter that only records the slots it is invoked with and schedules
+    /// their release, all together, when it is destroyed.
+    /// NOTE(review): a copy would carry the same slot list and release it a
+    /// second time on destruction; visible callers hold it through
+    /// std::shared_ptr, confirm it is never copied.
+    struct BundleDelete final : public CudaDeleterBase {
+
+      using CudaDeleterBase::CudaDeleterBase;
+
+      // release, in one call, every slot collected so far
+      ~BundleDelete() override {
+        auto & allocator = *pool();
+        free(m_stream, std::move(m_buckets), allocator);
+      }
+
+      // remember the slot; nothing is released yet
+      void operator()(int bucket) override {
+        m_buckets.emplace_back(bucket);
+      }
+
+      std::vector<int> m_buckets;
+
+    };
+
+     /// Allocate room for `size` objects of type T from the pool attached to
+     /// `del` and wrap it in a memoryPool::unique_ptr owning the slot.
+     /// The memory is not initialised: no T constructor is run.
+     /// @param size number of T elements (not bytes)
+     /// @param del  deleter to install; its pool serves the allocation
+     /// @throws std::bad_alloc if the deleter has no pool, the byte count
+     ///         overflows, or the pool cannot provide a slot
+     template<typename T>
+      unique_ptr<T> make_unique(uint64_t size, Deleter del) {
+        auto * pool = del.pool();
+        if (nullptr == pool) throw std::bad_alloc();
+        // guard the multiplication below against wrap-around
+        if (size > (~uint64_t{0})/sizeof(T)) throw std::bad_alloc();
+        auto ret = alloc(sizeof(T)*size,*pool);
+        if (ret.second<0) throw std::bad_alloc();
+        del.setBucket(ret.second);
+        // hand the deleter over instead of copying its shared_ptr once more
+        return unique_ptr<T>(static_cast<T*>(ret.first),std::move(del));
+      }
+
+      /// Allocate room for `size` objects of type T from the pool selected by
+      /// `where`; the slot is released on `stream` by a DeleteOne deleter.
+      /// @param size number of T elements (not bytes): the conversion to
+      ///        bytes is done by the overload taking a Deleter, so it must
+      ///        not be applied here as well
+      template<typename T>
+      unique_ptr<T> make_unique(uint64_t size, cudaStream_t const & stream, Where where) {
+         return make_unique<T>(size,Deleter(std::make_shared<DeleteOne>(stream,getPool(where))));
+      }
+
+/*
+      template< class T, class... Args >
+      memoryPool::unique_ptr<T> make_unique( Args&&... args );
+      template< class T, class... Args >
+      memoryPool::unique_ptr<T> make_unique(Deleter del, Args&&... args );
+*/
+
+  } // cuda
+} // memoryPool
diff --git a/HeterogeneousCore/CUDAUtilities/interface/memoryPool.h b/HeterogeneousCore/CUDAUtilities/interface/memoryPool.h
@@ -0,0 +1,50 @@
+#pragma once
+#include<memory>
+#include<new>
+
+
+class SimplePoolAllocator;
+
+namespace memoryPool {
+
+  // Selects the pool an allocation is served from (argument of the pool
+  // lookup functions). The enumerator names suggest the memory location;
+  // the mapping itself is implemented elsewhere.
+  enum Where {
+    onCPU,     // 0
+    onDevice,  // 1
+    onHost,    // 2
+    unified    // 3
+  };
+
+  /// Abstract release strategy: implementations decide what happens to a
+  /// pool slot when operator() is invoked with its index.
+  class DeleterBase {
+  public:
+
+    // `pool` is stored as is; ownership is not taken
+    explicit DeleterBase(SimplePoolAllocator * pool)
+      : m_pool(pool) {}
+
+    virtual ~DeleterBase() = default;
+
+    // release (or schedule the release of) slot `bucket`
+    virtual void operator()(int bucket) =0;
+
+    // pool given at construction
+    SimplePoolAllocator * pool() const {
+      return m_pool;
+    }
+
+  protected:
+    SimplePoolAllocator * m_pool;
+  };
+
+  /// Deleter type of memoryPool::unique_ptr: pairs a shared release strategy
+  /// (DeleterBase) with the index of the pool slot owned by the pointer.
+  class Deleter {
+  public:
+    // no strategy attached: operator() throws until set() is called
+    explicit Deleter(int bucket=-1) : m_bucket(bucket) {}
+    Deleter(std::shared_ptr<DeleterBase> del, int bucket=-1) : me(del), m_bucket(bucket) {}
+
+    void set(std::shared_ptr<DeleterBase> del) { me=del;}
+    void setBucket(int bucket) { m_bucket = bucket;}
+
+    /// Forward the slot index to the strategy. The pointer itself is not
+    /// used: the memory is identified by the slot index only.
+    /// @throws std::bad_alloc if no strategy is attached. Note that
+    ///         std::unique_ptr calls its deleter from a noexcept destructor,
+    ///         where this exception terminates the program.
+    void operator()(void * /*p*/) {
+      if (!me) throw std::bad_alloc();
+      (*me)(m_bucket);
+    }
+
+    /// Pool of the attached strategy, or nullptr if none is attached
+    /// (an empty shared_ptr must not be dereferenced).
+    SimplePoolAllocator * pool() const { return me ? me->pool() : nullptr;}
+
+  private:
+    std::shared_ptr<DeleterBase> me;
+    int m_bucket;
+  };
+
+
+  // std::unique_ptr whose deleter is memoryPool::Deleter: destroying a
+  // non-null pointer invokes Deleter::operator() instead of `delete`.
+  template <typename T>
+  using unique_ptr = std::unique_ptr<T,Deleter>;
+
+
+}