Skip to content

Commit

Permalink
[cherry-pick] Thread-local Allocator, test=release/2.0 (#24061)
Browse files Browse the repository at this point in the history
* cherry-pick of DeviceContext Split, test=develop (#23737)

* New feature: thread local allocator, test=develop (#23989)

* add the thread_local_allocator, test=develop

* refactor the thread_local_allocator, test=develop

* provides option setting strategy, test=develop

* add boost dependency to cuda_stream, test=develop

* declare the stream::Priority as enum class, test=develop

* deal with PADDLE_ENFORCE_CUDA_SUCCESS macro in pr #23816
  • Loading branch information
Shixiaowei02 authored Apr 23, 2020
1 parent c1b4d1c commit 597cc05
Show file tree
Hide file tree
Showing 14 changed files with 637 additions and 90 deletions.
4 changes: 3 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ endif()

if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()

cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
else ()
set(AllocatorFacadeDeps)
endif()
Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
Expand Down Expand Up @@ -80,6 +81,18 @@ class AllocatorFacadePrivate {
break;
}

case AllocatorStrategy::kThreadLocal: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
break;
}

default: {
PADDLE_THROW("Unsupported allocator strategy: %d",
static_cast<int>(strategy));
Expand Down Expand Up @@ -136,6 +149,10 @@ class AllocatorFacadePrivate {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}

// Registers a ThreadLocalCUDAAllocator for GPU place `p` in allocators_.
// The allocator itself is a thin facade; per-thread state is created lazily
// on first allocation from each thread.
void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
}

void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/memory/allocation/allocator_strategy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ static AllocatorStrategy GetStrategyFromFlag() {
return AllocatorStrategy::kAutoGrowth;
}

if (FLAGS_allocator_strategy == "thread_local") {
return AllocatorStrategy::kThreadLocal;
}

PADDLE_THROW("Unsupported allocator strategy: %s", FLAGS_allocator_strategy);
}

Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/allocator_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace paddle {
namespace memory {
namespace allocation {

enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth };
enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };

extern AllocatorStrategy GetAllocatorStrategy();

Expand Down
76 changes: 76 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/thread_local_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

// Builds the per-thread allocator backing store for place `p`.
// Only CUDA places are supported: the store is a BuddyAllocator wrapping a
// raw GPUAllocator for the place's device, sized by the global GPU chunk
// limits. Any other place kind is a fatal error.
ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
    : place_(p) {
  if (!platform::is_gpu_place(place_)) {
    LOG(FATAL) << "Thread local allocator only supports CUDAPlace now.";
  } else {
    const int device_id = boost::get<platform::CUDAPlace>(place_).device;
    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
        std::unique_ptr<memory::detail::SystemAllocator>(
            new memory::detail::GPUAllocator(device_id)),
        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
  }
}

// Returns the pool's allocator for `gpu_id`, creating it on first use.
// Creation is guarded by a per-device std::once_flag; since the pool itself
// is obtained via a thread_local Instance(), each thread ends up with its
// own allocator per device.
std::shared_ptr<ThreadLocalAllocatorImpl> ThreadLocalCUDAAllocatorPool::Get(
    int gpu_id) {
  const auto iter = std::find(devices_.begin(), devices_.end(), gpu_id);
  const auto pos = std::distance(devices_.begin(), iter);
  // pos == devices_.size() means gpu_id is not among the selected devices.
  PADDLE_ENFORCE_LT(
      pos, devices_.size(),
      platform::errors::InvalidArgument(
          "The position of device should be less than the size of devices."));
  std::call_once(*init_flags_[pos], [this, pos, gpu_id] {
    // Bind the creating thread to the target device before construction.
    platform::SetDeviceId(devices_[pos]);
    allocators_[pos].reset(
        new ThreadLocalAllocatorImpl(platform::CUDAPlace(gpu_id)));
  });
  return allocators_[pos];
}

// Sizes the per-device tables: one (initially empty) allocator slot and one
// once-flag per selected device. Allocators themselves are created lazily in
// Get().
ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
    : devices_(platform::GetSelectedDevices()) {
  const size_t device_count = devices_.size();
  allocators_.resize(device_count);
  init_flags_.reserve(device_count);
  for (size_t dev = 0; dev < device_count; ++dev) {
    init_flags_.emplace_back(new std::once_flag());
  }
}

// Carves `size` bytes out of this thread's buddy allocator and wraps the
// pointer in a ThreadLocalAllocation that shares ownership of this allocator,
// so the memory can be released after the creating scope (or thread) is gone.
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
  VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
  // NOTE(review): a nullptr from Alloc (e.g. OOM) is wrapped unchecked —
  // confirm callers tolerate a null-ptr allocation.
  auto* tl_allocation =
      new ThreadLocalAllocation(buddy_allocator_->Alloc(size), size, place_);
  tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
  return tl_allocation;
}

// Returns the allocation's memory to the buddy allocator that produced it,
// then destroys the wrapper object itself.
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
  VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
  void* raw_ptr = allocation->ptr();
  buddy_allocator_->Free(raw_ptr);
  delete allocation;
}

} // namespace allocation
} // namespace memory
} // namespace paddle
100 changes: 100 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace memory {
namespace allocation {

class ThreadLocalAllocatorImpl;

// An Allocation that additionally holds a shared_ptr to the allocator it came
// from, so that allocator stays alive at least as long as any of its
// allocations — allowing memory to be freed after the creating thread exits.
class ThreadLocalAllocation : public Allocation {
 public:
  ThreadLocalAllocation(void* ptr, size_t size, platform::Place place)
      : Allocation(ptr, size, place) {}

  // Records the owning allocator; set right after allocation.
  void SetThreadLocalAllocatorImpl(
      std::shared_ptr<ThreadLocalAllocatorImpl> allocator) {
    allocator_ = allocator;
  }

  // The allocator that must be used to free this allocation.
  std::shared_ptr<ThreadLocalAllocatorImpl> GetAllocator() {
    return allocator_;
  }

 private:
  // Shared ownership intentionally ties the allocator's lifetime to its
  // outstanding allocations.
  std::shared_ptr<ThreadLocalAllocatorImpl> allocator_;
};

// Per-thread, per-device allocator. Wraps a BuddyAllocator over GPU memory;
// enable_shared_from_this lets AllocateImpl hand each allocation a shared_ptr
// back to its creator (see ThreadLocalAllocation).
class ThreadLocalAllocatorImpl
    : public std::enable_shared_from_this<ThreadLocalAllocatorImpl> {
 public:
  // `p` must be a CUDA place; other places are fatal (see .cc).
  explicit ThreadLocalAllocatorImpl(const platform::Place& p);
  ThreadLocalAllocation* AllocateImpl(size_t size);
  void FreeImpl(ThreadLocalAllocation* allocation);

 private:
  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
  platform::Place place_;
};

// Lazily-initialized table of allocators, one slot per selected CUDA device.
// Instance() returns a thread_local singleton, so every thread owns an
// independent pool (and therefore independent allocators).
class ThreadLocalCUDAAllocatorPool {
 public:
  static ThreadLocalCUDAAllocatorPool& Instance() {
    static thread_local ThreadLocalCUDAAllocatorPool pool;
    return pool;
  }

  // Returns the allocator for `gpu_id`, creating it on first call.
  std::shared_ptr<ThreadLocalAllocatorImpl> Get(int gpu_id);

 private:
  ThreadLocalCUDAAllocatorPool();
  std::vector<int> devices_;
  // One once_flag per device guards lazy construction in Get().
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
  std::vector<std::shared_ptr<ThreadLocalAllocatorImpl>> allocators_;
};

// Allocator facade registered for a single CUDA place. Every call forwards to
// the calling thread's own ThreadLocalAllocatorImpl obtained from the
// thread_local pool, which is why IsAllocThreadSafe() can report true without
// any locking here.
class ThreadLocalCUDAAllocator : public Allocator {
 public:
  explicit ThreadLocalCUDAAllocator(const platform::CUDAPlace& p)
      : gpu_id_(p.device) {}

  bool IsAllocThreadSafe() const override { return true; }

 protected:
  Allocation* AllocateImpl(size_t size) override {
    auto allocator = ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_);
    return allocator->AllocateImpl(size);
  }
  void FreeImpl(Allocation* allocation) override {
    // Free through the allocator recorded at allocation time, which may
    // belong to a different (possibly already finished) thread.
    auto* tl_allocation = static_cast<ThreadLocalAllocation*>(allocation);
    auto allocator_impl = tl_allocation->GetAllocator();
    allocator_impl->FreeImpl(tl_allocation);
  }

 private:
  int gpu_id_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle
93 changes: 93 additions & 0 deletions paddle/fluid/memory/allocation/thread_local_allocator_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include <algorithm>
#include <condition_variable> // NOLINT
#include <functional>
#include <iostream>
#include <thread> // NOLINT
#include <utility>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/gpu_info.h"

DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

// Verifies two properties of the thread_local allocator strategy:
//   1. Each thread gets its OWN allocator per device (no two threads share an
//      impl pointer).
//   2. Allocations can be released after their creating threads have joined
//      (cross-scope release), without crashing the process.
TEST(ThreadLocalAllocator, cross_scope_release) {
  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
  FLAGS_allocator_strategy = "thread_local";

  const size_t thread_num = 5;
  const std::vector<int> devices = platform::GetSelectedDevices();

  // [device][thread] -> allocator impl address / allocation made by that thread.
  std::vector<std::vector<void *>> allocator_addresses(devices.size());
  std::vector<std::vector<AllocationPtr>> thread_allocations(devices.size());

  for (size_t i = 0; i < devices.size(); ++i) {
    allocator_addresses[i].resize(thread_num);
    thread_allocations[i].resize(thread_num);
  }

  std::vector<std::thread> threads(thread_num);
  std::mutex mutex;
  std::condition_variable cv;
  bool flag = false;  // start gate: released once all threads are spawned

  for (size_t i = 0; i < threads.size(); ++i) {
    threads[i] = std::thread([&, i]() {
      {
        // Block until the main thread opens the gate, so all threads
        // allocate concurrently.
        std::unique_lock<std::mutex> lock(mutex);
        cv.wait(lock, [&] { return flag; });
      }
      for (size_t j = 0; j < devices.size(); ++j) {
        thread_allocations[j][i] =
            memory::Alloc(platform::CUDAPlace(devices[j]), 10);
        // Record which impl served this thread+device pair.
        auto tl_allocator_impl =
            ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
        allocator_addresses[j][i] = tl_allocator_impl.get();
      }
    });
  }

  {
    // Open the gate for all worker threads at once.
    std::lock_guard<std::mutex> lock(mutex);
    flag = true;
    cv.notify_all();
  }

  for (auto &th : threads) {
    th.join();
  }

  // Property 1: per device, all recorded impl addresses are distinct —
  // sort, then assert no adjacent duplicates.
  for (auto &addresses : allocator_addresses) {
    std::sort(addresses.begin(), addresses.end());
    ASSERT_EQ(std::adjacent_find(addresses.begin(), addresses.end(),
                                 std::equal_to<void *>()),
              addresses.end());
  }

  // Property 2: releasing all allocations here — after their creating
  // threads have exited — must let the process exit cleanly (code 0).
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  ASSERT_EXIT(([&]() { thread_allocations.clear(); }(), exit(0)),
              ::testing::ExitedWithCode(0), ".*");
}

} // namespace allocation
} // namespace memory
} // namespace paddle
3 changes: 2 additions & 1 deletion paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

add_subdirectory(dynload)
add_subdirectory(stream)

cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
Expand All @@ -54,7 +55,7 @@ IF(WITH_DGC)
ENDIF()

IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

IF(WITH_MKLDNN)
Expand Down
Loading

0 comments on commit 597cc05

Please sign in to comment.