[Allocator] add new allocator strategy (#62638)
* add new allocator strategy
wanghuancoder authored Mar 25, 2024
1 parent 7750ec4 commit 6261015
Showing 10 changed files with 349 additions and 29 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/memory/allocation/CMakeLists.txt
@@ -11,6 +11,7 @@ set(ALLOCATOR_SRCS
allocator_strategy.cc
allocator_facade.cc
auto_growth_best_fit_allocator.cc
auto_growth_best_fit_allocator_v2.cc
virtual_memory_auto_growth_best_fit_allocator.cc
retry_allocator.cc
memory_block.cc
119 changes: 93 additions & 26 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -20,6 +20,7 @@
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_v2.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
@@ -103,6 +104,12 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
"managed memory, only available for auto_growth "
"strategy");

PADDLE_DEFINE_EXPORTED_bool(
use_auto_growth_v2,
false,
"Whether to use AutoGrowthBestFitAllocatorV2 for auto_growth "
"strategy");

COMMON_DECLARE_string(allocator_strategy);
COMMON_DECLARE_uint64(auto_growth_chunk_size_in_mb);
COMMON_DECLARE_bool(use_auto_growth_pinned_allocator);
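The new flag simply chooses which concrete auto-growth allocator the facade constructs. Every hunk below repeats the same selection pattern; as a simplified sketch of that pattern (not the literal committed code, and CreateAutoGrowthAllocator is a hypothetical helper name), it amounts to:

// Simplified sketch of the dispatch added throughout allocator_facade.cc.
// AutoGrowthBestFitAllocatorV2 takes the device place as an extra argument
// so it can query free GPU memory during its warm-up phase.
std::shared_ptr<Allocator> CreateAutoGrowthAllocator(
    const std::shared_ptr<Allocator> &underlying,
    size_t alignment,
    platform::CUDAPlace p,
    size_t chunk_size,
    bool allow_free_idle_chunk) {
  if (FLAGS_use_auto_growth_v2) {
    return std::make_shared<AutoGrowthBestFitAllocatorV2>(
        underlying, alignment, p, chunk_size, allow_free_idle_chunk);
  }
  return std::make_shared<AutoGrowthBestFitAllocator>(
      underlying, alignment, chunk_size, allow_free_idle_chunk);
}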
@@ -887,11 +894,22 @@ class AllocatorFacadePrivate {
<< FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
if (FLAGS_use_auto_growth_v2) {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
chunk_size,
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
}
#endif

#if defined(PADDLE_WITH_CUDA)
@@ -918,12 +936,22 @@ class AllocatorFacadePrivate {
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
if (FLAGS_use_auto_growth_v2) {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
}
}
#else
auto cuda_allocator = CreateCUDAAllocator(p);
@@ -958,9 +986,21 @@ class AllocatorFacadePrivate {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}

cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
if (FLAGS_use_auto_growth_v2) {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(
underlying_allocator,
alignment,
p,
chunk_size,
allow_free_idle_chunk_);
} else {
cuda_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(underlying_allocator,
alignment,
chunk_size,
allow_free_idle_chunk_);
}
#endif
#endif
}
@@ -973,11 +1013,20 @@ class AllocatorFacadePrivate {
<< FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
if (FLAGS_use_auto_growth_v2) {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
} else {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}
#endif

#if defined(PADDLE_WITH_CUDA)
@@ -1004,11 +1053,20 @@ class AllocatorFacadePrivate {
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
if (FLAGS_use_auto_growth_v2) {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocatorV2>(
cuda_allocator,
platform::GpuMinChunkSize(),
p,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
} else {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}
}

#else
@@ -1044,8 +1102,17 @@ class AllocatorFacadePrivate {
VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
underlying_allocator = cuda_allocator;
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
if (FLAGS_use_auto_growth_v2) {
allocators_[p] =
std::make_shared<AutoGrowthBestFitAllocatorV2>(underlying_allocator,
alignment,
p,
chunk_size,
allow_free_idle_chunk);
} else {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
}
#endif
#endif
}
@@ -48,7 +48,7 @@ class AutoGrowthBestFitAllocator : public Allocator {
return FreeIdleChunks();
}

private:
protected:
uint64_t FreeIdleChunks();
void Trace() const;

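These members of AutoGrowthBestFitAllocator become protected instead of private because the new allocator introduced in the next file derives from it and reuses its internals (FreeIdleChunks, chunks_, free_blocks_, spinlock_, underlying_allocator_, and friends). The corresponding header, auto_growth_best_fit_allocator_v2.h, is included by the .cc below but is not expanded in this view; judging only from the constructor and members used there, its declaration would look roughly like this sketch (signatures, defaults, and initializers are assumptions, not the committed header):

// Hypothetical reconstruction of the V2 declaration, inferred from the
// definitions in auto_growth_best_fit_allocator_v2.cc shown below.
class AutoGrowthBestFitAllocatorV2 : public AutoGrowthBestFitAllocator {
 public:
  AutoGrowthBestFitAllocatorV2(
      const std::shared_ptr<Allocator> &underlying_allocator,
      size_t alignment,
      platform::CUDAPlace place,
      size_t chunk_size,
      bool allow_free_idle_chunk,
      int extra_padding_size = 0);

 protected:
  phi::Allocation *AllocateImpl(size_t unaligned_size) override;

 private:
  platform::CUDAPlace place_;
  bool is_first_switch_to_regular_{true};
};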
170 changes: 170 additions & 0 deletions paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_v2.cc
@@ -0,0 +1,170 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_v2.h"

#include <algorithm>
#include <mutex> // NOLINT

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/device_manager.h"

PD_DECLARE_bool(free_idle_chunk);
PD_DECLARE_bool(free_when_no_cache_hit);

namespace paddle {
namespace memory {
namespace allocation {

AutoGrowthBestFitAllocatorV2::AutoGrowthBestFitAllocatorV2(
const std::shared_ptr<Allocator> &underlying_allocator,
size_t alignment,
platform::CUDAPlace place,
size_t chunk_size,
bool allow_free_idle_chunk,
int extra_padding_size)
: AutoGrowthBestFitAllocator(underlying_allocator,
alignment,
chunk_size,
true,
extra_padding_size),
place_(place) {}

phi::Allocation *AutoGrowthBestFitAllocatorV2::AllocateImpl(
size_t unaligned_size) {
platform::RecordEvent record("AutoGrowthBestFitAllocatorV2::Allocate",
platform::TracerEventType::UserDefined,
9 /*level*/);

size_t size = AlignedSize(unaligned_size + extra_padding_size_, alignment_);

VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size
<< ", extra size " << extra_padding_size_;

std::lock_guard<SpinLock> guard(spinlock_);

BlockIt block_it;
if (AutoGrowthBestFitAllocatorV2State::GetInstance().IsWarmup()) {
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
if (iter != free_blocks_.end() && iter->second->size_ >= unaligned_size &&
iter->second->size_ <= size) {
block_it = iter->second;
free_blocks_.erase(iter);
block_it->is_free_ = false;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << " by strict_matching_state.";
} else {
size_t actual_avail, actual_total;
{
platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
auto result = hipMemGetInfo(&actual_avail, &actual_total);
#else
auto result = cudaMemGetInfo(&actual_avail, &actual_total);
#endif
if (result != gpuSuccess) {
actual_avail = 0;
}
}

if (actual_avail < size) {
FreeIdleChunks();
}

chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(size)));

auto *chunk = &(*chunks_.rbegin());
size = chunk->allocation_->size();
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;
blocks.emplace_back(p, size, false, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << size << "("
<< static_cast<void *>(p) << ") by strict_matching_state.";
}
} else {
if (is_first_switch_to_regular_) {
FreeIdleChunks();
is_first_switch_to_regular_ = false;
}
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));

if (iter != free_blocks_.end()) {
block_it = iter->second;
free_blocks_.erase(iter);
auto *chunk = block_it->chunk_;
size_t remaining_size = block_it->size_ - size;
VLOG(10) << "Allocate " << size << " bytes from chunk size "
<< block_it->size_ << ", remaining " << remaining_size;
if (remaining_size == 0) {
block_it->is_free_ = false;
} else {
auto remaining_free_block = chunk->blocks_.insert(
block_it, Block(block_it->ptr_, remaining_size, true, chunk));
free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
remaining_free_block);
block_it->ptr_ =
reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
block_it->size_ = size;
block_it->is_free_ = false;
}
} else {
if (FLAGS_free_when_no_cache_hit) {
FreeIdleChunks();
}
size_t realloc_size = std::max(size, chunk_size_);

try {
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(realloc_size)));
} catch (BadAlloc &ex) {
if (FLAGS_free_when_no_cache_hit) throw ex;
FreeIdleChunks();
chunks_.emplace_back(static_unique_ptr_cast<Allocation>(
underlying_allocator_->Allocate(realloc_size)));
}

auto *chunk = &(*chunks_.rbegin());
realloc_size = chunk->allocation_->size();
uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
auto &blocks = chunk->blocks_;

size_t remaining_size = realloc_size - size;
if (remaining_size > 0) {
blocks.emplace_back(p, remaining_size, true, chunk);
free_blocks_.emplace(std::make_pair(remaining_size, p),
--(blocks.end()));
}
blocks.emplace_back(p + remaining_size, size, false, chunk);
block_it = --(blocks.end());
VLOG(2) << "Not found and reallocate " << realloc_size << "("
<< static_cast<void *>(p) << "), and remaining "
<< remaining_size;
}
}
++total_alloc_times_;
total_alloc_size_ += size;
VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
return new BlockAllocation(block_it);
}

} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
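AllocateImpl branches on AutoGrowthBestFitAllocatorV2State::GetInstance().IsWarmup(), a singleton that is also declared in the unexpanded V2 header. A minimal shape consistent with that call site (an assumption inferred from usage, not the committed code) is:

// Minimal sketch of the warm-up state singleton; only GetInstance() and
// IsWarmup() are visible in this diff, the setter and the default value
// of is_warmup_ are assumptions.
class AutoGrowthBestFitAllocatorV2State {
 public:
  static AutoGrowthBestFitAllocatorV2State &GetInstance() {
    static AutoGrowthBestFitAllocatorV2State instance;
    return instance;
  }
  void SetWarmup(bool warmup) { is_warmup_ = warmup; }
  bool IsWarmup() const { return is_warmup_; }

 private:
  bool is_warmup_{true};
};

While IsWarmup() is true, the allocator reuses a cached free block only when its size is at least the raw request and no larger than the aligned request; otherwise it queries the device's free memory, releases idle chunks if there is not enough, and allocates a fresh chunk of exactly the aligned size. After warm-up, the first allocation frees idle chunks once and the allocator falls back to the regular best-fit path, splitting blocks and growing by max(size, chunk_size_) as the base class does. The whole path is opt-in: it takes effect only when FLAGS_use_auto_growth_v2 is enabled under the auto_growth allocator strategy.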