[Allocator] add new allocator strategy (#62638)
* add new allocator strategy
wanghuancoder authored Mar 25, 2024
1 parent 7750ec4 commit 6261015
Showing 10 changed files with 349 additions and 29 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/memory/allocation/CMakeLists.txt
@@ -11,6 +11,7 @@ set(ALLOCATOR_SRCS
allocator_strategy.cc
allocator_facade.cc
auto_growth_best_fit_allocator.cc
auto_growth_best_fit_allocator_v2.cc
virtual_memory_auto_growth_best_fit_allocator.cc
retry_allocator.cc
memory_block.cc
119 changes: 93 additions & 26 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -20,6 +20,7 @@
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_v2.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
@@ -103,6 +104,12 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
"managed memory, only available for auto_growth "
"strategy");

PADDLE_DEFINE_EXPORTED_bool(
use_auto_growth_v2,
false,
"Whether to use AutoGrowthBestFitAllocatorV2 for auto_growth "
"strategy");

COMMON_DECLARE_string(allocator_strategy);
COMMON_DECLARE_uint64(auto_growth_chunk_size_in_mb);
COMMON_DECLARE_bool(use_auto_growth_pinned_allocator);
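The new flag simply chooses which concrete auto-growth allocator the facade constructs. Every hunk below repeats the same selection pattern; as a simplified sketch of that pattern (not the literal committed code, and CreateAutoGrowthAllocator is a hypothetical helper name), it amounts to:

// Simplified sketch of the dispatch added throughout allocator_facade.cc.
// AutoGrowthBestFitAllocatorV2 takes the device place as an extra argument
// so it can query free GPU memory during its warm-up phase.
std::shared_ptr<Allocator> CreateAutoGrowthAllocator(
    const std::shared_ptr<Allocator> &underlying,
    size_t alignment,
    platform::CUDAPlace p,
    size_t chunk_size,
    bool allow_free_idle_chunk) {
  if (FLAGS_use_auto_growth_v2) {
    return std::make_shared<AutoGrowthBestFitAllocatorV2>(
        underlying, alignment, p, chunk_size, allow_free_idle_chunk);
  }
  return std::make_shared<AutoGrowthBestFitAllocator>(
      underlying, alignment, chunk_size, allow_free_idle_chunk);
}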
@@ -887,11 +894,22 @@ class AllocatorFacadePrivate {
<< FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
if (FLAGS_use_auto_growth_v2) {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
chunk_size,
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
}
#endif

#if defined(PADDLE_WITH_CUDA)
@@ -918,12 +936,22 @@ class AllocatorFacadePrivate {
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
if (FLAGS_use_auto_growth_v2) {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
}
}
#else
auto cuda_allocator = CreateCUDAAllocator(p);
@@ -958,9 +986,21 @@ class AllocatorFacadePrivate {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}

cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
if (FLAGS_use_auto_growth_v2) {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(
underlying_allocator,
alignment,
p,
chunk_size,
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(underlying_allocator,
alignment,
chunk_size,
allow_free_idle_chunk_);
}
#endif
#endif
}
@@ -973,11 +1013,20 @@ class AllocatorFacadePrivate {
<< FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
if (FLAGS_use_auto_growth_v2) {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
} else {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}
#endif

#if defined(PADDLE_WITH_CUDA)
@@ -1004,11 +1053,20 @@ class AllocatorFacadePrivate {
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
if (FLAGS_use_auto_growth_v2) {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
} else {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}
}

#else
@@ -1044,8 +1102,17 @@ class AllocatorFacadePrivate {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
if (FLAGS_use_auto_growth_v2) {
allocators_[p] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(underlying_allocator,
alignment,
p,
chunk_size,
allow_free_idle_chunk);
} else {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
}
#endif
#endif
}
@@ -48,7 +48,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
return FreeIdleChunks();
}

private:
protected:
uint64_t FreeIdleChunks();
void Trace() const;

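These members of AutoGrowthBestFitAllocator become protected instead of private because the new allocator introduced in the next file derives from it and reuses its internals (FreeIdleChunks, chunks_, free_blocks_, spinlock_, underlying_allocator_, and friends). The corresponding header, auto_growth_best_fit_allocator_v2.h, is included by the .cc below but is not expanded in this view; judging only from the constructor and members used there, its declaration would look roughly like this sketch (signatures, defaults, and initializers are assumptions, not the committed header):

// Hypothetical reconstruction of the V2 declaration, inferred from the
// definitions in auto_growth_best_fit_allocator_v2.cc shown below.
class AutoGrowthBestFitAllocatorV2 : public AutoGrowthBestFitAllocator {
 public:
  AutoGrowthBestFitAllocatorV2(
      const std::shared_ptr<Allocator> &underlying_allocator,
      size_t alignment,
      platform::CUDAPlace place,
      size_t chunk_size,
      bool allow_free_idle_chunk,
      int extra_padding_size = 0);

 protected:
  phi::Allocation *AllocateImpl(size_t unaligned_size) override;

 private:
  platform::CUDAPlace place_;
  bool is_first_switch_to_regular_{true};
};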
170 changes: 170 additions & 0 deletions paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_v2.cc
@@ -0,0 +1,170 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_v2.h"

#include <algorithm>
#include <mutex> // NOLINT

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/device_manager.h"

PD_DECLARE_bool(free_idle_chunk);
PD_DECLARE_bool(free_when_no_cache_hit);

namespace paddle {
namespace memory {
namespace allocation {

AutoGrowthBestFitAllocatorV2::AutoGrowthBestFitAllocatorV2(
const std::shared_ptr<Allocator> &underlying_allocator,
size_t alignment,
platform::CUDAPlace place,
size_t chunk_size,
bool allow_free_idle_chunk,
int extra_padding_size)
: AutoGrowthBestFitAllocator(underlying_allocator,
alignment,
chunk_size,
true,
extra_padding_size),
place_(place) {}

phi::Allocation *AutoGrowthBestFitAllocatorV2::AllocateImpl(
size_t unaligned_size) {
platform::RecordEvent record("AutoGrowthBestFitAllocatorV2::Allocate",
platform::TracerEventType::UserDefined,
9 /*level*/);

size_t size = AlignedSize(unaligned_size + extra_padding_size_, alignment_);

VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size
<< ", extra size " << extra_padding_size_;

std::lock_guard<SpinLock> guard(spinlock_);

BlockIt block_it;
if (AutoGrowthBestFitAllocatorV2State::GetInstance().IsWarmup()) {
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
if (iter != free_blocks_.end() && iter->second->size_ >= unaligned_size &&
iter->second->size_ <= size) {
block_it = iter->second;
free_blocks_.erase(iter);
block_it->is_free_ = false;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << " by strict_matching_state.";
} else {
size_t actual_avail, actual_total;
{
platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
auto result = hipMemGetInfo(&actual_avail, &actual_total);
#else
auto result = cudaMemGetInfo(&actual_avail, &actual_total);
#endif
if (result != gpuSuccess) {
actual_avail = 0;
}
}

if (actual_avail < size) {
FreeIdleChunks();
}

chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(size)));

auto *chunk = &(*chunks_.rbegin());
size = chunk->allocation_->size();
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(p, size, false, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << size << "("
<< static_cast<void *>(p) << ") by strict_matching_state.";
}
} else {
if (is_first_switch_to_regular_) {
FreeIdleChunks();
is_first_switch_to_regular_ = false;
}
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));

if (iter != free_blocks_.end()) {
block_it = iter->second;
free_blocks_.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
} else {
auto remaining_free_block = chunk->blocks_.insert(
block_it, Block(block_it->ptr_, remaining_size, true, chunk));
free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it->ptr_ =
reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
block_it->size_ = size;
block_it->is_free_ = false;
}
} else {
if (FLAGS_free_when_no_cache_hit) {
FreeIdleChunks();
}
size_t realloc_size = std::max(size, chunk_size_);

try {
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(realloc_size)));
} catch (BadAlloc &ex) {
if (FLAGS_free_when_no_cache_hit) throw ex;
FreeIdleChunks();
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(realloc_size)));
}

auto *chunk = &(*chunks_.rbegin());
realloc_size = chunk->allocation_->size();
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;

size_t remaining_size = realloc_size - size;
if (remaining_size > 0) {
blocks.emplace_back(p, remaining_size, true, chunk);
free_blocks_.emplace(std::make_pair(remaining_size, p),
--(blocks.end()));
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << realloc_size << "("
<< static_cast<void *>(p) << "), and remaining "
<< remaining_size;
}
}
++total_alloc_times_;
total_alloc_size_ += size;
VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
return new BlockAllocation(block_it);
}

} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
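AllocateImpl branches on AutoGrowthBestFitAllocatorV2State::GetInstance().IsWarmup(), a singleton that is also declared in the unexpanded V2 header. A minimal shape consistent with that call site (an assumption inferred from usage, not the committed code) is:

// Minimal sketch of the warm-up state singleton; only GetInstance() and
// IsWarmup() are visible in this diff, the setter and the default value
// of is_warmup_ are assumptions.
class AutoGrowthBestFitAllocatorV2State {
 public:
  static AutoGrowthBestFitAllocatorV2State &GetInstance() {
    static AutoGrowthBestFitAllocatorV2State instance;
    return instance;
  }
  void SetWarmup(bool warmup) { is_warmup_ = warmup; }
  bool IsWarmup() const { return is_warmup_; }

 private:
  bool is_warmup_{true};
};

While IsWarmup() is true, the allocator reuses a cached free block only when its size is at least the raw request and no larger than the aligned request; otherwise it queries the device's free memory, releases idle chunks if there is not enough, and allocates a fresh chunk of exactly the aligned size. After warm-up, the first allocation frees idle chunks once and the allocator falls back to the regular best-fit path, splitting blocks and growing by max(size, chunk_size_) as the base class does. The whole path is opt-in: it takes effect only when FLAGS_use_auto_growth_v2 is enabled under the auto_growth allocator strategy.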