Skip to content

Commit

Permalink
add basic support for CUDA Graph
Browse the repository at this point in the history
  • Loading branch information
sneaxiy committed Sep 28, 2021
1 parent af4f018 commit d9af897
Show file tree
Hide file tree
Showing 16 changed files with 583 additions and 15 deletions.
6 changes: 5 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)

# allocator_facade consults the CUDA Graph capture state, which only
# exists in GPU builds; link the cuda_graph wrapper library there.
if (WITH_GPU)
  target_link_libraries(allocator_facade cuda_graph)
endif()

cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
Expand Down
145 changes: 135 additions & 10 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/cuda_graph.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
Expand All @@ -47,17 +48,64 @@ PADDLE_DEFINE_EXPORTED_bool(
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");

DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
// Decorator allocator used while a CUDA Graph is being captured.
// Every allocation it hands out pins the decorator itself (via
// shared_from_this), so the per-graph memory pool cannot disappear
// while captured allocations are still alive.
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  // Wraps an allocation obtained from the underlying allocator and
  // holds shared ownership of the owning CUDAGraphAllocator for the
  // wrapper's whole lifetime.
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* owner, AllocationPtr wrapped)
        : Allocation(wrapped->ptr(), wrapped->size(), wrapped->place()),
          allocator_(owner->shared_from_this()),
          underlying_allocation_(std::move(wrapped)) {}

   private:
    std::shared_ptr<Allocator> allocator_;  // keeps the pool alive
    AllocationPtr underlying_allocation_;   // the real allocation
  };

  // Private: construction must go through Create() so the object is
  // always owned by a shared_ptr (required by enable_shared_from_this).
  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
  // Factory returning the only supported ownership form.
  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
  Allocation* AllocateImpl(size_t size) {
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
    AllocationPtr real_allocation = underlying_allocator_->Allocate(size);
    return new PrivateAllocation(this, std::move(real_allocation));
  }

  void FreeImpl(Allocation* allocation) {
    VLOG(10) << "delete for CUDA Graph";
    // Deleting the PrivateAllocation releases the underlying allocation
    // and drops its reference to this allocator.
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

class AllocatorFacadePrivate {
public:
using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

AllocatorFacadePrivate() {
auto strategy = GetAllocatorStrategy();
switch (strategy) {
explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
strategy_ = GetAllocatorStrategy();
switch (strategy_) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
Expand Down Expand Up @@ -91,7 +139,8 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
allow_free_idle_chunk);
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
Expand All @@ -117,7 +166,7 @@ class AllocatorFacadePrivate {

default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported allocator strategy: %d", static_cast<int>(strategy)));
"Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
}
}
InitZeroSizeAllocators();
Expand All @@ -130,11 +179,29 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe();
}

// Selects the allocator table that should serve the current request.
// While a CUDA Graph capture is in progress, allocations must come
// from the memory pool prepared for that capture; in all other cases
// (including non-CUDA builds) the default allocators are used.
inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
    auto id = platform::CUDAGraph::CapturingID();
    auto iter = cuda_graph_allocator_map_.find(id);
    PADDLE_ENFORCE_EQ(
        iter != cuda_graph_allocator_map_.end(), true,
        platform::errors::PermissionDenied(
            "No memory pool is prepared for CUDA Graph capturing."));
    return iter->second->allocators_;
  }
#endif
  // Fallthrough: not capturing (or CUDA not compiled in).
  return allocators_;
}

inline const std::shared_ptr<Allocator>& GetAllocator(
const platform::Place& place, size_t size) {
const auto& allocators =
(size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
: allocators_)
: GetAllocatorMap())
: zero_size_allocators_);
auto iter = allocators.find(place);
PADDLE_ENFORCE_NE(iter, allocators.end(),
Expand All @@ -145,6 +212,7 @@ class AllocatorFacadePrivate {

private:
void InitSystemAllocators() {
if (!system_allocators_.empty()) return;
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
Expand Down Expand Up @@ -183,10 +251,11 @@ class AllocatorFacadePrivate {
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}

void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize());
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
}
#endif

Expand Down Expand Up @@ -226,6 +295,7 @@ class AllocatorFacadePrivate {
};

void InitZeroSizeAllocators() {
if (!zero_size_allocators_.empty()) return;
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Expand Down Expand Up @@ -279,12 +349,57 @@ class AllocatorFacadePrivate {
}
}

#ifdef PADDLE_WITH_CUDA

public:
// Creates a dedicated memory pool for the CUDA Graph identified by
// `id`.  Only valid under the "auto_growth" allocator strategy.  The
// pool is built with allow_free_idle_chunk = false (idle chunks must
// not be released while the graph may replay), and every allocator in
// it is wrapped in a CUDAGraphAllocator so live allocations keep the
// pool's allocators alive.
// Throws InvalidArgument when the strategy is wrong or when a pool
// for `id` has already been prepared.
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
  PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
                    platform::errors::InvalidArgument(
                        "CUDA Graph is only supported when the "
                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
                        "FLAGS_allocator_strategy=\"%s\"",
                        FLAGS_allocator_strategy));
  auto& allocator = cuda_graph_allocator_map_[id];
  PADDLE_ENFORCE_EQ(
      allocator.get(), nullptr,
      platform::errors::InvalidArgument(
          "The memory pool of the CUDA Graph with ID %d has been prepared.",
          id));
  // make_unique instead of reset(new ...): no raw new, exception-safe.
  allocator = std::make_unique<AllocatorFacadePrivate>(
      /*allow_free_idle_chunk=*/false);
  // Decorate every allocator of the fresh pool so allocations made
  // during capture pin their allocator via shared_from_this.
  for (auto& item : allocator->allocators_) {
    auto& old_allocator = item.second;
    old_allocator = CUDAGraphAllocator::Create(old_allocator);
  }
  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}

// Destroys the memory pool previously prepared for the CUDA Graph
// identified by `id`.  Throws InvalidArgument when no pool exists for
// that id (never prepared, or already removed).
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
  auto iter = cuda_graph_allocator_map_.find(id);
  PADDLE_ENFORCE_EQ(iter != cuda_graph_allocator_map_.end(), true,
                    platform::errors::InvalidArgument(
                        "Cannot find CUDA Graph with ID = %d", id));
  // Erasing drops the pool's unique_ptr; individual allocators may
  // outlive it through the shared_ptr held by still-live allocations
  // (see CUDAGraphAllocator::PrivateAllocation).
  cuda_graph_allocator_map_.erase(iter);
  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif

private:
AllocatorMap allocators_;
AllocatorMap zero_size_allocators_;
AllocatorMap system_allocators_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
AllocatorStrategy strategy_;

static AllocatorMap zero_size_allocators_;
static AllocatorMap system_allocators_;
};

AllocatorFacadePrivate::AllocatorMap
AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// delete m_ may cause core dump when the destructor of python in conflict with
Expand Down Expand Up @@ -316,6 +431,16 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

#ifdef PADDLE_WITH_CUDA
// Pimpl passthrough: delegate CUDA Graph pool preparation to the
// private implementation.
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
  m_->PrepareMemoryPoolForCUDAGraph(id);
}

// Pimpl passthrough: delegate CUDA Graph pool removal to the private
// implementation.
void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
  m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif

} // namespace allocation
} // namespace memory
} // namespace paddle
8 changes: 8 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
#include "paddle/fluid/platform/place.h"

namespace paddle {
Expand Down Expand Up @@ -54,6 +57,11 @@ class AllocatorFacade {
uint64_t Release(const platform::Place& place);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
#endif

// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ namespace allocation {

// Constructs the auto-growth best-fit allocator.
// `alignment` is applied to the underlying allocator via
// AlignedAllocator; `chunk_size` is rounded up to the alignment (and
// to at least `alignment` itself); `allow_free_idle_chunk` gates
// whether FreeIdleChunks may return idle chunks (disabled for CUDA
// Graph memory pools).
// NOTE: the scraped diff showed both the pre- and post-change
// parameter lines interleaved; this is the post-change definition.
AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
    const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
    size_t chunk_size, bool allow_free_idle_chunk)
    : underlying_allocator_(
          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
      alignment_(alignment),
      chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)),
      allow_free_idle_chunk_(allow_free_idle_chunk) {}

Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
size = AlignedSize(size, alignment_);
Expand Down Expand Up @@ -139,6 +140,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
}

uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
if (!allow_free_idle_chunk_) {
return 0;
}
uint64_t bytes = 0;
for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) {
auto &blocks = chunk_it->blocks_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
public:
AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
size_t chunk_size = 0);
size_t chunk_size = 0, bool allow_free_idle_chunk = true);

bool IsAllocThreadSafe() const override { return true; }

Expand Down Expand Up @@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
std::list<Chunk> chunks_;
size_t alignment_;
size_t chunk_size_;
bool allow_free_idle_chunk_;

SpinLock spinlock_;
};
Expand Down
5 changes: 5 additions & 0 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,14 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

IF(WITH_GPU)
# cuda_graph wraps the CUDA Graph capture/replay support; it depends on
# allocator_facade so captures can switch to a dedicated memory pool.
nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce)
# GPU build: the memory-pool-aware variant additionally links cuda_graph.
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
ELSE()
# CPU-only build: same target, compiled without the cuda_graph dependency.
cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade)
ENDIF()

IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
Expand Down
Loading

0 comments on commit d9af897

Please sign in to comment.