From 84d1c734ca2fe7a17e000467823d49891507cf0b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sun, 25 Jun 2017 15:40:45 -0700 Subject: [PATCH 01/16] add paddle/memory/detail/cpu_allocator* --- paddle/CMakeLists.txt | 1 + paddle/memory/CMakeLists.txt | 1 + paddle/memory/README.md | 14 ++--- paddle/memory/detail/CMakeLists.txt | 1 + paddle/memory/detail/cpu_allocator.h | 63 ++++++++++++++++++++++ paddle/memory/detail/cpu_allocator_test.cc | 32 +++++++++++ paddle/memory/memory.cc | 51 ++++++++++++++++++ paddle/memory/memory.h | 27 ++++++++++ paddle/platform/place.cc | 12 ++--- paddle/platform/place.h | 45 ++++++++++------ paddle/platform/place_test.cc | 14 ++--- 11 files changed, 224 insertions(+), 37 deletions(-) create mode 100644 paddle/memory/CMakeLists.txt create mode 100644 paddle/memory/detail/CMakeLists.txt create mode 100644 paddle/memory/detail/cpu_allocator.h create mode 100644 paddle/memory/detail/cpu_allocator_test.cc create mode 100644 paddle/memory/memory.cc create mode 100644 paddle/memory/memory.h diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 573bd937a351a..0cddb95244fcf 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(trainer) add_subdirectory(scripts) add_subdirectory(optimizer) add_subdirectory(strings) +add_subdirectory(memory) # Do not build go directory until go cmake is working smoothly. # if(CMAKE_Go_COMPILER) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt new file mode 100644 index 0000000000000..3943c3cfad31d --- /dev/null +++ b/paddle/memory/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(detail) diff --git a/paddle/memory/README.md b/paddle/memory/README.md index fd32d07ef40fc..e5f7880e4cad3 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -31,7 +31,7 @@ In `paddle/memory/memory.h` we have: namespace memory { template void* Alloc(Place, size_t); template void Free(Place, void*); -template void Used(Place); +template size_t Used(Place); } // namespace memory ``` @@ -39,7 +39,7 @@ These function templates have specializations on either `platform::CPUPlace` or ```cpp template<> -void Alloc(CPUPlace p, size_t size) { +void* Alloc(CPUPlace p, size_t size) { return GetCPUBuddyAllocator()->Alloc(size); } ``` @@ -102,15 +102,11 @@ class BuddyAllocator { }; ``` -#### System Allocators - -The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`. A system allocator holds information about a device, including the amount of memory has been allocated, so we can call +Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`. Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace. -- `GPUAllocator::Used()` and -- `CPUAllocator::Used()` - -to get the amount of memory that has been allocated so far. +#### System Allocators +The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`. ## Justification diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt new file mode 100644 index 0000000000000..fb8a11062da91 --- /dev/null +++ b/paddle/memory/detail/CMakeLists.txt @@ -0,0 +1 @@ +cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h new file mode 100644 index 0000000000000..8a872d3800d3d --- /dev/null +++ b/paddle/memory/detail/cpu_allocator.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for malloc and free +#include // for size_t + +namespace paddle { +namespace memory { +namespace detail { + +// CPUAllocator calls cudaMallocHost, which returns +// pinned and mlocked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the +// amount of memory available to the system for paging. So, by +// default, we should use CPUAllocator. +template +class CPUAllocator { +public: + void* Alloc(size_t size); + void Free(void* p); +}; + +template <> +class CPUAllocator { +public: + void* Alloc(size_t size) { return malloc(size); } + void Free(void* p) { free(p); } +}; + +// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the +// following specialization that depends on the CUDA library. +#ifdef WITH_GPU +template <> +class CPUAllocator { +public: + void* Alloc(size_t size) { + void* p; + if (cudaMallocHost(&p, size) != cudaSuccess) { + return NULL; + } + return *p; + } + + void Free(void* p) { cudaFreeHost(p); } +}; +#endif // WITH_GPU + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc new file mode 100644 index 0000000000000..0aa33a22fd0bc --- /dev/null +++ b/paddle/memory/detail/cpu_allocator_test.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/detail/cpu_allocator.h" +#include "gtest/gtest.h" + +TEST(CPUAllocator, NonStaging) { + paddle::memory::detail::CPUAllocator a; + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p); +} + +#ifdef WITH_GPU +TEST(CPUAllocator, Staging) { + paddle::memory::detail::CPUAllocator a; + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p); +} +#endif // WITH_GPU diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc new file mode 100644 index 0000000000000..5f1253ede6818 --- /dev/null +++ b/paddle/memory/memory.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" + +namespace paddle { +namespace memory { + +template <> +void* Alloc(CPUPlace, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void* Alloc(GPUPlace pl, size_t size) { + return GetGPUBuddyAllocator(pl.device)->Alloc(size); +} + +template <> +void Free(CPUPlace, void* p) { + return GetCPUBuddyAllocator()->Free(p); +} + +template <> +void* Alloc(GPUPlace pl, void* p) { + return GetGPUBuddyAllocator(pl.device)->Free(p); +} + +template <> +size_t Used(CPUPlace) { + return GetCPUBuddyAllocator()->Used(); +} + +template <> +size_t Alloc(GPUPlace pl) { + return GetGPUBuddyAllocator(pl.device)->Used(); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h new file mode 100644 index 0000000000000..ae8ac6ca523ad --- /dev/null +++ b/paddle/memory/memory.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/frameowork/place.h" + +namespace paddle { +namespace memory { + +typename void* Alloc(Place, size_t); +typename void Free(Place, void*); +typename size_t Used(Place); + +} // namespace memory +} // namespace paddle diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 1afd03c01169d..0704820aa0507 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -8,8 +8,8 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: PlacePrinter(std::ostream &os) : os_(os) {} - void operator()(const CpuPlace &) { os_ << "CpuPlace"; } - void operator()(const GpuPlace &p) { os_ << "GpuPlace(" << p.device << ")"; } + void operator()(const CPUPlace &) { os_ << "CPUPlace"; } + void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } private: std::ostream &os_; @@ -22,14 +22,14 @@ static Place the_default_place; void set_place(const Place &place) { the_default_place = place; } const Place &get_place() { return the_default_place; } -const GpuPlace default_gpu() { return GpuPlace(0); } -const CpuPlace default_cpu() { return CpuPlace(); } +const GPUPlace default_gpu() { return GPUPlace(0); } +const CPUPlace default_cpu() { return CPUPlace(); } bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsGpuPlace(), p); + return boost::apply_visitor(IsGPUPlace(), p); } bool is_cpu_place(const Place &p) { - return !boost::apply_visitor(IsGpuPlace(), p); + return !boost::apply_visitor(IsGPUPlace(), p); } bool places_are_same_class(const Place &p1, const Place &p2) { diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 489572c526e16..7cead183884bc 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -1,43 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once + #include #include namespace paddle { namespace platform { -struct CpuPlace { +struct CPUPlace { // WORKAROUND: for some reason, omitting this constructor // causes errors with boost 1.59 and OSX - CpuPlace() {} + CPUPlace() {} // needed for variant equality comparison - inline bool operator==(const CpuPlace &) const { return true; } - inline bool operator!=(const CpuPlace &) const { return false; } + inline bool operator==(const CPUPlace &) const { return true; } + inline bool operator!=(const CPUPlace &) const { return false; } }; -struct GpuPlace { - GpuPlace() : GpuPlace(0) {} - GpuPlace(int d) : device(d) {} +struct GPUPlace { + GPUPlace() : GPUPlace(0) {} + GPUPlace(int d) : device(d) {} // needed for variant equality comparison - inline bool operator==(const GpuPlace &o) const { return device == o.device; } - inline bool operator!=(const GpuPlace &o) const { return !(*this == o); } + inline bool operator==(const GPUPlace &o) const { return device == o.device; } + inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } int device; }; -struct IsGpuPlace : public boost::static_visitor { - bool operator()(const CpuPlace &) const { return false; } - bool operator()(const GpuPlace &gpu) const { return true; } +struct IsGPUPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const GPUPlace &gpu) const { return true; } }; -typedef boost::variant Place; +typedef boost::variant Place; void set_place(const Place &); const Place &get_place(); -const GpuPlace default_gpu(); -const CpuPlace default_cpu(); +const GPUPlace default_gpu(); +const CPUPlace default_cpu(); bool is_gpu_place(const Place &); bool is_cpu_place(const Place &); diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc index 73fccceedf691..33e2e5a439ce6 100644 --- a/paddle/platform/place_test.cc +++ b/paddle/platform/place_test.cc @@ -3,8 +3,8 @@ #include "gtest/gtest.h" TEST(Place, Equality) { - paddle::platform::CpuPlace cpu; - paddle::platform::GpuPlace g0(0), g1(1), gg0(0); + paddle::platform::CPUPlace cpu; + paddle::platform::GPUPlace g0(0), g1(1), gg0(0); EXPECT_EQ(cpu, cpu); EXPECT_EQ(g0, g0); @@ -22,19 +22,19 @@ TEST(Place, Default) { EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); - paddle::platform::set_place(paddle::platform::CpuPlace()); + paddle::platform::set_place(paddle::platform::CPUPlace()); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); } TEST(Place, Print) { { std::stringstream ss; - ss << paddle::platform::GpuPlace(1); - EXPECT_EQ("GpuPlace(1)", ss.str()); + ss << paddle::platform::GPUPlace(1); + EXPECT_EQ("GPUPlace(1)", ss.str()); } { std::stringstream ss; - ss << paddle::platform::CpuPlace(); - EXPECT_EQ("CpuPlace", ss.str()); + ss << paddle::platform::CPUPlace(); + EXPECT_EQ("CPUPlace", ss.str()); } } From db128c4586c3c925a6c53a9ae770cb07cdbea1bf Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sun, 25 Jun 2017 17:54:06 -0700 Subject: [PATCH 02/16] Pass cpu_allocator_test --- CMakeLists.txt | 2 +- cmake/generic.cmake | 4 ++++ paddle/memory/detail/CMakeLists.txt | 6 +++++- paddle/memory/detail/cpu_allocator.h | 13 +++++++++---- paddle/memory/detail/cpu_allocator_test.cc | 16 +++++++++++----- paddle/memory/memory.cc | 14 ++++++++++++-- paddle/memory/memory.h | 16 +++++++++++++--- 7 files changed, 55 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5d7f2c7ec76d..3c719d35eced2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,7 @@ if(ANDROID) "Disable RDMA when cross-compiling for Android" FORCE) endif(ANDROID) -set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") if (WITH_C_API AND WITH_PYTHON) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 69e8164a00d1f..840155750e1ac 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -78,6 +78,10 @@ # # cc_test(example_test SRCS example_test.cc DEPS example glog gflags) +if(WITH_GPU) + add_definitions(-DPADDLE_WITH_GPU) +endif() + if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index fb8a11062da91..c425e9f947d07 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1 +1,5 @@ -cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) +if(${WITH_GPU}) + nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but +else(${WITH_GPU}) + cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't. +endif(${WITH_GPU}) diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h index 8a872d3800d3d..0d8ea3f52b92f 100644 --- a/paddle/memory/detail/cpu_allocator.h +++ b/paddle/memory/detail/cpu_allocator.h @@ -17,6 +17,11 @@ limitations under the License. */ #include // for malloc and free #include // for size_t +#ifdef PADDLE_WITH_GPU +#include +#include +#endif // PADDLE_WITH_GPU + namespace paddle { namespace memory { namespace detail { @@ -40,9 +45,9 @@ class CPUAllocator { void Free(void* p) { free(p); } }; -// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the +// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the // following specialization that depends on the CUDA library. -#ifdef WITH_GPU +#ifdef PADDLE_WITH_GPU template <> class CPUAllocator { public: @@ -51,12 +56,12 @@ class CPUAllocator { if (cudaMallocHost(&p, size) != cudaSuccess) { return NULL; } - return *p; + return p; } void Free(void* p) { cudaFreeHost(p); } }; -#endif // WITH_GPU +#endif // PADDLE_WITH_GPU } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc index 0aa33a22fd0bc..464bc84e5c7b5 100644 --- a/paddle/memory/detail/cpu_allocator_test.cc +++ b/paddle/memory/detail/cpu_allocator_test.cc @@ -22,11 +22,17 @@ TEST(CPUAllocator, NonStaging) { a.Free(p); } -#ifdef WITH_GPU +#ifdef PADDLE_WITH_GPU TEST(CPUAllocator, Staging) { paddle::memory::detail::CPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p); + + int devices; + if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) { + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p); + } else { + EXPECT_EQ(a.Alloc(4096), nullptr); + } } -#endif // WITH_GPU +#endif // PADDLE_WITH_GPU diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5f1253ede6818..b617923731a4d 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -19,7 +19,11 @@ namespace memory { template <> void* Alloc(CPUPlace, size_t size) { - return GetCPUBuddyAllocator()->Alloc(size); + return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size); +} + +void* AllocStaging(CPUPlace, size_t size) { + return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size); } template <> @@ -29,9 +33,14 @@ void* Alloc(GPUPlace pl, size_t size) { template <> void Free(CPUPlace, void* p) { - return GetCPUBuddyAllocator()->Free(p); + return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p); +} + +void FreeStaging(CPUPlace, void* p) { + return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p); } +#ifdef PADDLE_WITH_GPU template <> void* Alloc(GPUPlace pl, void* p) { return GetGPUBuddyAllocator(pl.device)->Free(p); @@ -46,6 +55,7 @@ template <> size_t Alloc(GPUPlace pl) { return GetGPUBuddyAllocator(pl.device)->Used(); } +#endif // PADDLE_WITH_GPU } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index ae8ac6ca523ad..8c15a133bb4e9 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,19 @@ limitations under the License. */ namespace paddle { namespace memory { -typename void* Alloc(Place, size_t); -typename void Free(Place, void*); -typename size_t Used(Place); +template +void* Alloc(Place, size_t); +template +void Free(Place, void*); +template +size_t Used(Place); + +// Staging memory means "pinned" host memory that can be mapped into +// the CUDA memory space and accessed by the device rapidly. Don't +// allocate too much staging memory; otherwise system performance will +// degrade because the OS cannot find enough swap memory space. +void* AllocStaging(CPUPlace, size_t); +void* FreeStaging(CPUPlace, size_t); } // namespace memory } // namespace paddle From ce938ae5f9baea2b2d136154ee9a696b394929e1 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 26 Jun 2017 23:32:46 +0800 Subject: [PATCH 03/16] FIX: Pinned memory --- paddle/memory/README.md | 1 + paddle/memory/detail/CMakeLists.txt | 6 +--- paddle/memory/detail/cpu_allocator.h | 39 ++++++++++++---------- paddle/memory/detail/cpu_allocator_test.cc | 16 +++------ 4 files changed, 27 insertions(+), 35 deletions(-) diff --git a/paddle/memory/README.md b/paddle/memory/README.md index e5f7880e4cad3..96a331a486f57 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -97,6 +97,7 @@ class BuddyAllocator { struct Block { size_t size; Block* left, right; + size_t index; // allocator id }; ... }; diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index c425e9f947d07..fb8a11062da91 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,5 +1 @@ -if(${WITH_GPU}) - nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but -else(${WITH_GPU}) - cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't. -endif(${WITH_GPU}) +cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h index 0d8ea3f52b92f..a487fecef49b7 100644 --- a/paddle/memory/detail/cpu_allocator.h +++ b/paddle/memory/detail/cpu_allocator.h @@ -14,20 +14,19 @@ limitations under the License. */ #pragma once -#include // for malloc and free #include // for size_t +#include // for malloc and free -#ifdef PADDLE_WITH_GPU -#include -#include -#endif // PADDLE_WITH_GPU +#ifndef _WIN32 +#include // for mlock and munlock +#endif namespace paddle { namespace memory { namespace detail { -// CPUAllocator calls cudaMallocHost, which returns -// pinned and mlocked memory as staging areas for data exchange +// CPUAllocator calls mlock, which returns +// pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the // amount of memory available to the system for paging. So, by // default, we should use CPUAllocator. @@ -35,33 +34,37 @@ template class CPUAllocator { public: void* Alloc(size_t size); - void Free(void* p); + void Free(void* p, size_t size); }; template <> class CPUAllocator { public: - void* Alloc(size_t size) { return malloc(size); } - void Free(void* p) { free(p); } + void* Alloc(size_t size) { return std::malloc(size); } + void Free(void* p, size_t size) { std::free(p); } }; -// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the -// following specialization that depends on the CUDA library. -#ifdef PADDLE_WITH_GPU template <> class CPUAllocator { public: void* Alloc(size_t size) { - void* p; - if (cudaMallocHost(&p, size) != cudaSuccess) { - return NULL; + void* p = std::malloc(size); + if (p == nullptr) { + return p; } +#ifndef _WIN32 + mlock(p, size); +#endif return p; } - void Free(void* p) { cudaFreeHost(p); } + void Free(void* p, size_t size) { +#ifndef _WIN32 + munlock(p, size); +#endif + std::free(p); + } }; -#endif // PADDLE_WITH_GPU } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc index 464bc84e5c7b5..4e45266cd8ad8 100644 --- a/paddle/memory/detail/cpu_allocator_test.cc +++ b/paddle/memory/detail/cpu_allocator_test.cc @@ -19,20 +19,12 @@ TEST(CPUAllocator, NonStaging) { paddle::memory::detail::CPUAllocator a; void* p = a.Alloc(4096); EXPECT_NE(p, nullptr); - a.Free(p); + a.Free(p, 4096); } -#ifdef PADDLE_WITH_GPU TEST(CPUAllocator, Staging) { paddle::memory::detail::CPUAllocator a; - - int devices; - if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) { - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p); - } else { - EXPECT_EQ(a.Alloc(4096), nullptr); - } + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p, 4096); } -#endif // PADDLE_WITH_GPU From ce70df86b1e8c892cdde5312caa0c2699f368f7d Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 27 Jun 2017 00:15:36 +0800 Subject: [PATCH 04/16] Add gpu_allocator --- paddle/memory/.clang-format | 5 ++ paddle/memory/detail/CMakeLists.txt | 1 + paddle/memory/detail/cpu_allocator.h | 6 +- paddle/memory/detail/gpu_allocator.h | 92 ++++++++++++++++++++++ paddle/memory/detail/gpu_allocator_test.cc | 30 +++++++ 5 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 paddle/memory/.clang-format create mode 100644 paddle/memory/detail/gpu_allocator.h create mode 100644 paddle/memory/detail/gpu_allocator_test.cc diff --git a/paddle/memory/.clang-format b/paddle/memory/.clang-format new file mode 100644 index 0000000000000..29282dc87e2c4 --- /dev/null +++ b/paddle/memory/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index fb8a11062da91..81ca8a0bbf0ef 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1 +1,2 @@ cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) +nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc) diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h index a487fecef49b7..17753ccef718f 100644 --- a/paddle/memory/detail/cpu_allocator.h +++ b/paddle/memory/detail/cpu_allocator.h @@ -32,21 +32,21 @@ namespace detail { // default, we should use CPUAllocator. template class CPUAllocator { -public: + public: void* Alloc(size_t size); void Free(void* p, size_t size); }; template <> class CPUAllocator { -public: + public: void* Alloc(size_t size) { return std::malloc(size); } void Free(void* p, size_t size) { std::free(p); } }; template <> class CPUAllocator { -public: + public: void* Alloc(size_t size) { void* p = std::malloc(size); if (p == nullptr) { diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h new file mode 100644 index 0000000000000..9452c41fb8975 --- /dev/null +++ b/paddle/memory/detail/gpu_allocator.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for size_t + +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +inline void throw_on_error(cudaError_t e, const char* message) { + if (e) { + throw thrust::system_error(e, thrust::cuda_category(), message); + } +} + +// GPUAllocator calls cudaHostMalloc, which returns +// pinned and locked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the +// amount of memory available to the system for paging. So, by +// default, we should use GPUAllocator. +template +class GPUAllocator { +public: + void* Alloc(size_t size); + void Free(void* p, size_t size); +}; + +template <> +class GPUAllocator { +public: + void* Alloc(size_t size) { + void* p = 0; + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + return p; + } + // clear last error + cudaGetLastError(); + return nullptr; + } + + void Free(void* p, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + auto err = cudaFree(p); + if (err != cudaErrorCudartUnloading) { + throw_on_error(err, "cudaFree failed"); + } + } +}; + +template <> +class GPUAllocator { +public: + void* Alloc(size_t size) { + void* p = 0; + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + return p; + } + // clear last error + cudaGetLastError(); + return nullptr; + } + + void Free(void* p, size_t size) { + throw_on_error(cudaFreeHost(p), "cudaFreeHost failed"); + } +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/gpu_allocator_test.cc new file mode 100644 index 0000000000000..18c1c9ab43084 --- /dev/null +++ b/paddle/memory/detail/gpu_allocator_test.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/detail/gpu_allocator.h" +#include "gtest/gtest.h" + +TEST(GPUAllocator, NonStaging) { + paddle::memory::detail::GPUAllocator a; + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p, 4096); +} + +TEST(GPUAllocator, Staging) { + paddle::memory::detail::GPUAllocator a; + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p, 4096); +} From e02859c0f53dfe4616976b015d4fefd8aaa6eb39 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 26 Jun 2017 15:27:01 -0700 Subject: [PATCH 05/16] Replace {cpu,gpu}_allocator.h and {cpu,gpu}_allocator_test.cc by system_allocator{.h,_test.cc} --- paddle/memory/CMakeLists.txt | 6 ++ paddle/memory/detail/CMakeLists.txt | 3 +- paddle/memory/detail/cpu_allocator.h | 71 ----------------- paddle/memory/detail/cpu_allocator_test.cc | 30 ------- .../{gpu_allocator.h => system_allocator.h} | 79 +++++++++++-------- ...cator_test.cc => system_allocator_test.cc} | 20 ++++- paddle/memory/memory.cc | 67 +++++++--------- paddle/memory/memory.h | 16 +--- 8 files changed, 106 insertions(+), 186 deletions(-) delete mode 100644 paddle/memory/detail/cpu_allocator.h delete mode 100644 paddle/memory/detail/cpu_allocator_test.cc rename paddle/memory/detail/{gpu_allocator.h => system_allocator.h} (58%) rename paddle/memory/detail/{gpu_allocator_test.cc => system_allocator_test.cc} (69%) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad31d..86625124967d7 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,7 @@ add_subdirectory(detail) + +if(${WITH_GPU}) + nv_library(memory SRCS memory.cc) +else(${WITH_GPU}) + cc_library(memory SRCS memroy.cc) +endif(${WITH_GPU}) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 81ca8a0bbf0ef..3b5bbd7a12fab 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,2 +1 @@ -cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) -nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc) +cc_test(system_allocator_test SRCS system_allocator_test.cc) diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h deleted file mode 100644 index 17753ccef718f..0000000000000 --- a/paddle/memory/detail/cpu_allocator.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // for size_t -#include // for malloc and free - -#ifndef _WIN32 -#include // for mlock and munlock -#endif - -namespace paddle { -namespace memory { -namespace detail { - -// CPUAllocator calls mlock, which returns -// pinned and locked memory as staging areas for data exchange -// between host and device. Allocates too much would reduce the -// amount of memory available to the system for paging. So, by -// default, we should use CPUAllocator. -template -class CPUAllocator { - public: - void* Alloc(size_t size); - void Free(void* p, size_t size); -}; - -template <> -class CPUAllocator { - public: - void* Alloc(size_t size) { return std::malloc(size); } - void Free(void* p, size_t size) { std::free(p); } -}; - -template <> -class CPUAllocator { - public: - void* Alloc(size_t size) { - void* p = std::malloc(size); - if (p == nullptr) { - return p; - } -#ifndef _WIN32 - mlock(p, size); -#endif - return p; - } - - void Free(void* p, size_t size) { -#ifndef _WIN32 - munlock(p, size); -#endif - std::free(p); - } -}; - -} // namespace detail -} // namespace memory -} // namespace paddle diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc deleted file mode 100644 index 4e45266cd8ad8..0000000000000 --- a/paddle/memory/detail/cpu_allocator_test.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/memory/detail/cpu_allocator.h" -#include "gtest/gtest.h" - -TEST(CPUAllocator, NonStaging) { - paddle::memory::detail::CPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p, 4096); -} - -TEST(CPUAllocator, Staging) { - paddle::memory::detail::CPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p, 4096); -} diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/system_allocator.h similarity index 58% rename from paddle/memory/detail/gpu_allocator.h rename to paddle/memory/detail/system_allocator.h index 9452c41fb8975..0a64553188995 100644 --- a/paddle/memory/detail/gpu_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -14,20 +14,58 @@ limitations under the License. */ #pragma once -#include // for size_t +#include // for size_t +#include // for mlock and munlock +#include // for malloc and free -#include +#ifndef PADDLE_ONLY_CPU #include +#include +#endif // PADDLE_ONLY_CPU namespace paddle { namespace memory { namespace detail { +class SystemAllocator { + public: + virtual void* Alloc(size_t size) = 0; + virtual void* Free(void* p) = 0; +}; + +// CPUAllocator calls mlock, which returns pinned +// and locked memory as staging areas for data exchange between host +// and device. Allocates too much would reduce the amount of memory +// available to the system for paging. So, by default, we should use +// CPUAllocator. +template +class CPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t size) { + void* p = std::malloc(size); + if (p != nullptr && lock_memory) { + mlock(p, size); + } + return p; + } + + virtual void Free(void* p, size_t size) { + if (p != nullptr && lock_memory) { + munlock(p, size); + } + std::free(p); + } +}; + +#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. + +namespace { inline void throw_on_error(cudaError_t e, const char* message) { if (e) { throw thrust::system_error(e, thrust::cuda_category(), message); } } +} // namespace // GPUAllocator calls cudaHostMalloc, which returns // pinned and locked memory as staging areas for data exchange @@ -36,17 +74,11 @@ inline void throw_on_error(cudaError_t e, const char* message) { // default, we should use GPUAllocator. template class GPUAllocator { -public: - void* Alloc(size_t size); - void Free(void* p, size_t size); -}; - -template <> -class GPUAllocator { -public: + public: void* Alloc(size_t size) { void* p = 0; - cudaError_t result = cudaMalloc(&p, size); + cudaError_t result = + staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); if (result == cudaSuccess) { return p; } @@ -60,32 +92,15 @@ class GPUAllocator { // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if - // cudaFree succeeds. - auto err = cudaFree(p); + // cudaFree succeeds. + auto err = staging ? cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - throw_on_error(err, "cudaFree failed"); + throw_on_error(err, "cudaFree failed"); } } }; -template <> -class GPUAllocator { -public: - void* Alloc(size_t size) { - void* p = 0; - cudaError_t result = cudaMallocHost(&p, size); - if (result == cudaSuccess) { - return p; - } - // clear last error - cudaGetLastError(); - return nullptr; - } - - void Free(void* p, size_t size) { - throw_on_error(cudaFreeHost(p), "cudaFreeHost failed"); - } -}; +#endif // PADDLE_ONLY_CPU } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc similarity index 69% rename from paddle/memory/detail/gpu_allocator_test.cc rename to paddle/memory/detail/system_allocator_test.cc index 18c1c9ab43084..4e7b8018b6a07 100644 --- a/paddle/memory/detail/gpu_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -12,9 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/memory/detail/gpu_allocator.h" +#include "paddle/memory/detail/system_allocator.h" #include "gtest/gtest.h" +TEST(CPUAllocator, NoLockMem) { + paddle::memory::detail::CPUAllocator a; + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p, 4096); +} + +TEST(CPUAllocator, LockMem) { + paddle::memory::detail::CPUAllocator a; + void* p = a.Alloc(4096); + EXPECT_NE(p, nullptr); + a.Free(p, 4096); +} + +#ifndef PADDLE_ONLY_CPU + TEST(GPUAllocator, NonStaging) { paddle::memory::detail::GPUAllocator a; void* p = a.Alloc(4096); @@ -28,3 +44,5 @@ TEST(GPUAllocator, Staging) { EXPECT_NE(p, nullptr); a.Free(p, 4096); } + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index b617923731a4d..ca3c01ebdb035 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -14,48 +14,41 @@ limitations under the License. */ #include "paddle/memory/memory.h" +#include "paddle/memory/detail/cpu_allocator.h" +#include "paddle/memory/detail/gpu_allocator.h" + namespace paddle { namespace memory { -template <> -void* Alloc(CPUPlace, size_t size) { - return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size); -} - -void* AllocStaging(CPUPlace, size_t size) { - return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size); -} - -template <> -void* Alloc(GPUPlace pl, size_t size) { - return GetGPUBuddyAllocator(pl.device)->Alloc(size); -} - -template <> -void Free(CPUPlace, void* p) { - return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p); -} - -void FreeStaging(CPUPlace, void* p) { - return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p); -} - -#ifdef PADDLE_WITH_GPU -template <> -void* Alloc(GPUPlace pl, void* p) { - return GetGPUBuddyAllocator(pl.device)->Free(p); -} - -template <> -size_t Used(CPUPlace) { +void Alloc(paddle::platform::Place pl, size_t size) { +#ifndef PADDLE_ONLY_CPU + if (paddle::platform::is_gpu_place(pl)) { + return GetGPUBuddyAllocator(pl.device)->Alloc(size); + } +#endif // PADDLE_ONLY_CPU + PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); + return GetCPUBuddyAllocator()->Alloc(size); +} + +void Free(paddle::platform::Place pl, void* p) { +#ifndef PADDLE_ONLY_CPU + if (paddle::platform::is_gpu_place(pl)) { + GetGPUBuddyAllocator(pl.device)->Free(p); + } +#endif // PADDLE_ONLY_CPU + PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); + GetCPUBuddyAllocator()->Free(p); +} + +size_t Used(paddle::platform::Place pl) { +#ifndef PADDLE_ONLY_CPU + if (paddle::platform::is_gpu_place(pl)) { + return GetGPUBuddyAllocator(pl.device)->Used(); + } +#endif // PADDLE_ONLY_CPU + PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); return GetCPUBuddyAllocator()->Used(); } -template <> -size_t Alloc(GPUPlace pl) { - return GetGPUBuddyAllocator(pl.device)->Used(); -} -#endif // PADDLE_WITH_GPU - } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 8c15a133bb4e9..0bc609205eca2 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,19 +19,9 @@ limitations under the License. */ namespace paddle { namespace memory { -template -void* Alloc(Place, size_t); -template -void Free(Place, void*); -template -size_t Used(Place); - -// Staging memory means "pinned" host memory that can be mapped into -// the CUDA memory space and accessed by the device rapidly. Don't -// allocate too much staging memory; otherwise system performance will -// degrade because the OS cannot find enough swap memory space. -void* AllocStaging(CPUPlace, size_t); -void* FreeStaging(CPUPlace, size_t); +void* Alloc(paddle::framework::Place, size_t); +void Free(paddle::framework::Place, void*); +size_t Used(paddle::framework::Place); } // namespace memory } // namespace paddle From 6250d108bfd39afb3b2beba438ecb22eca8991bc Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 27 Jun 2017 09:51:55 +0800 Subject: [PATCH 06/16] FIX: clang-format --- paddle/memory/detail/gpu_allocator.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h index 9452c41fb8975..682afdf7d3349 100644 --- a/paddle/memory/detail/gpu_allocator.h +++ b/paddle/memory/detail/gpu_allocator.h @@ -16,8 +16,8 @@ limitations under the License. */ #include // for size_t -#include #include +#include namespace paddle { namespace memory { @@ -36,14 +36,14 @@ inline void throw_on_error(cudaError_t e, const char* message) { // default, we should use GPUAllocator. template class GPUAllocator { -public: + public: void* Alloc(size_t size); void Free(void* p, size_t size); }; template <> class GPUAllocator { -public: + public: void* Alloc(size_t size) { void* p = 0; cudaError_t result = cudaMalloc(&p, size); @@ -60,22 +60,22 @@ class GPUAllocator { // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if - // cudaFree succeeds. + // cudaFree succeeds. auto err = cudaFree(p); if (err != cudaErrorCudartUnloading) { - throw_on_error(err, "cudaFree failed"); + throw_on_error(err, "cudaFree failed"); } } }; template <> class GPUAllocator { -public: + public: void* Alloc(size_t size) { void* p = 0; cudaError_t result = cudaMallocHost(&p, size); if (result == cudaSuccess) { - return p; + return p; } // clear last error cudaGetLastError(); From f149d183f7d78fdaa171f2afabaf8a138596c8ff Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 26 Jun 2017 20:41:33 -0700 Subject: [PATCH 07/16] Add system_allocator --- paddle/memory/detail/CMakeLists.txt | 6 +- paddle/memory/detail/system_allocator.h | 84 ++++++++++++------- paddle/memory/detail/system_allocator_test.cc | 44 +++++----- 3 files changed, 81 insertions(+), 53 deletions(-) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 3b5bbd7a12fab..c16dfadeb2180 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1 +1,5 @@ -cc_test(system_allocator_test SRCS system_allocator_test.cc) +if(${WITH_GPU}) + nv_test(system_allocator_test SRCS system_allocator_test.cc) +else(${WITH_GPU}) + cc_test(system_allocator_test SRCS system_allocator_test.cc) +endif(${WITH_GPU}) diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 0a64553188995..1768f9a0da6c9 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -23,14 +23,31 @@ limitations under the License. */ #include #endif // PADDLE_ONLY_CPU +#include "paddle/platform/assert.h" + namespace paddle { namespace memory { namespace detail { -class SystemAllocator { +class CPUDeleter { public: - virtual void* Alloc(size_t size) = 0; - virtual void* Free(void* p) = 0; + CPUDeleter(void* ptr, size_t size, bool locked) + : ptr_(ptr), size_(size), locked_(locked) {} + + void* Ptr() { return ptr_; } + + void operator()(void* ptr) { + PADDLE_ASSERT(ptr == ptr_); + if (ptr_ != nullptr && locked_) { + munlock(ptr_, size_); + } + std::free(ptr_); + } + + private: + void* ptr_; + size_t size_; + bool locked_; }; // CPUAllocator calls mlock, which returns pinned @@ -39,21 +56,14 @@ class SystemAllocator { // available to the system for paging. So, by default, we should use // CPUAllocator. template -class CPUAllocator : public SystemAllocator { +class CPUAllocator { public: - virtual void* Alloc(size_t size) { + static CPUDeleter Alloc(size_t size) { void* p = std::malloc(size); if (p != nullptr && lock_memory) { mlock(p, size); } - return p; - } - - virtual void Free(void* p, size_t size) { - if (p != nullptr && lock_memory) { - munlock(p, size); - } - std::free(p); + return CPUDeleter(p, size, lock_memory); } }; @@ -67,6 +77,32 @@ inline void throw_on_error(cudaError_t e, const char* message) { } } // namespace +class GPUDeleter { + public: + GPUDeleter(void* ptr, size_t size, bool staging) + : ptr_(ptr), size_(size), staging_(staging) {} + + void* Ptr() { return ptr_; } + + void operator()(void* ptr) { + PADDLE_ASSERT(ptr == ptr_); + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr); + if (err != cudaErrorCudartUnloading) { + throw_on_error(err, "cudaFree{Host} failed"); + } + } + + private: + void* ptr_; + size_t size_; + bool staging_; +}; + // GPUAllocator calls cudaHostMalloc, which returns // pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the @@ -75,28 +111,14 @@ inline void throw_on_error(cudaError_t e, const char* message) { template class GPUAllocator { public: - void* Alloc(size_t size) { + static GPUDeleter Alloc(size_t size) { void* p = 0; cudaError_t result = staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result == cudaSuccess) { - return p; - } - // clear last error - cudaGetLastError(); - return nullptr; - } - - void Free(void* p, size_t size) { - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the - // driver has already shutdown. This happens only if the - // process is terminating, in which case we don't care if - // cudaFree succeeds. - auto err = staging ? cudaFreeHost(p) : cudaFree(p); - if (err != cudaErrorCudartUnloading) { - throw_on_error(err, "cudaFree failed"); + if (result != cudaSuccess) { + cudaGetLastError(); // clear error if there is any. } + return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging); } }; diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 4e7b8018b6a07..fec70a65b77d5 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -13,36 +13,38 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" + +#include +#include + #include "gtest/gtest.h" -TEST(CPUAllocator, NoLockMem) { - paddle::memory::detail::CPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p, 4096); +template +void TestAllocator() { + { + auto d = Allocator::Alloc(sizeof(int)); + EXPECT_NE(d.Ptr(), nullptr); + std::unique_ptr p(static_cast(d.Ptr()), d); + } + { + auto d = Allocator::Alloc(0); + EXPECT_EQ(d.Ptr(), nullptr); + std::unique_ptr p(static_cast(d.Ptr()), d); + } } +TEST(CPUAllocator, NoLockMem) { + TestAllocator>(); +} TEST(CPUAllocator, LockMem) { - paddle::memory::detail::CPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p, 4096); + TestAllocator>(); } #ifndef PADDLE_ONLY_CPU - -TEST(GPUAllocator, NonStaging) { - paddle::memory::detail::GPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p, 4096); +TEST(GPUAllocator, NoStaging) { + TestAllocator>(); } - TEST(GPUAllocator, Staging) { - paddle::memory::detail::GPUAllocator a; - void* p = a.Alloc(4096); - EXPECT_NE(p, nullptr); - a.Free(p, 4096); + TestAllocator>(); } - #endif // PADDLE_ONLY_CPU From dd08d337c0138c9def5f7ce95f88bae5599e5f92 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 01:30:57 +0800 Subject: [PATCH 08/16] FIX: fix cmake type error --- CMakeLists.txt | 2 ++ paddle/CMakeLists.txt | 10 +--------- paddle/memory/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c719d35eced2..b779caefb9a8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) +find_package(Boost QUIET) include(simd) @@ -109,6 +110,7 @@ include_directories("${PROJ_ROOT}") include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient") +include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS ${GFLAGS_LIBRARIES} diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 0cddb95244fcf..979b68e827218 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -10,17 +10,9 @@ add_subdirectory(trainer) add_subdirectory(scripts) add_subdirectory(optimizer) add_subdirectory(strings) -add_subdirectory(memory) - -# Do not build go directory until go cmake is working smoothly. -# if(CMAKE_Go_COMPILER) -# add_subdirectory(go) -# endif() - -find_package(Boost QUIET) if(Boost_FOUND) - include_directories(${Boost_INCLUDE_DIRS}) + add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) endif() diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 86625124967d7..e74ce75c9398d 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -3,5 +3,5 @@ add_subdirectory(detail) if(${WITH_GPU}) nv_library(memory SRCS memory.cc) else(${WITH_GPU}) - cc_library(memory SRCS memroy.cc) + cc_library(memory SRCS memory.cc) endif(${WITH_GPU}) From dde0da9e0ffee7a49510061a139ab2abc7ab55b9 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 01:31:24 +0800 Subject: [PATCH 09/16] ENH: Add cuda.h in platform --- paddle/platform/cuda.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 paddle/platform/cuda.h diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h new file mode 100644 index 0000000000000..864a5d3340a8f --- /dev/null +++ b/paddle/platform/cuda.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_ONLY_CPU + +#include +#include + +namespace paddle { +namespace platform { + +inline void throw_on_error(cudaError_t e, const char* message) { + if (e) { + throw thrust::system_error(e, thrust::cuda_category(), message); + } +} + +int GetDeviceCount(void) { + int count; + throw_on_error(cudaGetDeviceCount(&count), + "cudaGetDeviceCount failed"); + return count; +} + +} // namespace platform +} // namespace paddle + +#endif // PADDLE_ONLY_CPU From 29c7512b3ce13ca7b89d3ff3f4aea2c7d7f27478 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 01:31:46 +0800 Subject: [PATCH 10/16] FIX: fix memory.h/cc --- paddle/memory/memory.cc | 23 ++++++++++++++--------- paddle/memory/memory.h | 8 ++++---- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index ca3c01ebdb035..0d123d99e234a 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -13,41 +13,46 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/memory.h" +#include "paddle/memory/detail/buddy_allocator.h" +#include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" -#include "paddle/memory/detail/cpu_allocator.h" -#include "paddle/memory/detail/gpu_allocator.h" +#include namespace paddle { namespace memory { -void Alloc(paddle::platform::Place pl, size_t size) { +void* Alloc(platform::Place pl, size_t size) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { - return GetGPUBuddyAllocator(pl.device)->Alloc(size); + size_t gpu_id = boost::get(pl).device; + return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Alloc(size); + return detail::GetCPUBuddyAllocator()->Alloc(size); } void Free(paddle::platform::Place pl, void* p) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { - GetGPUBuddyAllocator(pl.device)->Free(p); + size_t gpu_id = boost::get(pl).device; + detail::GetGPUBuddyAllocator(gpu_id)->Free(p); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - GetCPUBuddyAllocator()->Free(p); + detail::GetCPUBuddyAllocator()->Free(p); } size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU if (paddle::platform::is_gpu_place(pl)) { - return GetGPUBuddyAllocator(pl.device)->Used(); + size_t gpu_id = boost::get(pl).device; + return detail::GetGPUBuddyAllocator(gpu_id)->Used(); } #endif // PADDLE_ONLY_CPU PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return GetCPUBuddyAllocator()->Used(); + return detail::GetCPUBuddyAllocator()->Used(); } } // namespace memory diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 0bc609205eca2..a33092bade65e 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -14,14 +14,14 @@ limitations under the License. */ #pragma once -#include "paddle/frameowork/place.h" +#include "paddle/platform/place.h" namespace paddle { namespace memory { -void* Alloc(paddle::framework::Place, size_t); -void Free(paddle::framework::Place, void*); -size_t Used(paddle::framework::Place); +void* Alloc(paddle::platform::Place, size_t); +void Free(paddle::platform::Place, void*); +size_t Used(paddle::platform::Place); } // namespace memory } // namespace paddle From b22dd12854150c31b9cb9e3e550bdee4b5df5977 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 01:32:06 +0800 Subject: [PATCH 11/16] ENH: Add buddy allocator draft --- paddle/memory/detail/CMakeLists.txt | 4 +- paddle/memory/detail/buddy_allocator.h | 79 ++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 paddle/memory/detail/buddy_allocator.h diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index c16dfadeb2180..cd5622203ff51 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,5 +1,5 @@ if(${WITH_GPU}) - nv_test(system_allocator_test SRCS system_allocator_test.cc) + nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog) else(${WITH_GPU}) - cc_test(system_allocator_test SRCS system_allocator_test.cc) + cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog) endif(${WITH_GPU}) diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h new file mode 100644 index 0000000000000..35e96fd50782a --- /dev/null +++ b/paddle/memory/detail/buddy_allocator.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/system_allocator.h" + +namespace paddle { +namespace memory { +namespace detail { + +template +class BuddyAllocator { + public: + // TODO(gangliao): This is a draft, add Buddy Allocator Algorithm soon + BuddyAllocator() {} + ~BuddyAllocator() {} + + public: + void* Alloc(size_t size) { + return Allocator::Alloc(size); + } + void Free(void*) { + // Because all info like size are stored in meta data, + // thus it's duplicate if add the parameter `size` in + // `Free(void*)` interface. + } + size_t Used(); + + public: + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + + private: + size_t min_alloc_size_; + size_t max_alloc_size_; + + private: + std::mutex mutex_; +}; + +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new BuddyAllocator(); + } + return a; +} + +#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetDeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = new BuddyAllocator(); + } + } + return as[gpu_id]; +} + +#endif // PADDLE_ONLY_CPU + +} // namespace detail +} // namespace memory +} // namespace paddle From 79373dabc8d2e4edc87fbef40efdfa1f54b35a9f Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 01:33:06 +0800 Subject: [PATCH 12/16] TEST: Add test for system allocator and deleter --- paddle/memory/detail/system_allocator.h | 108 ++++++------------ paddle/memory/detail/system_allocator_test.cc | 40 ++++--- 2 files changed, 60 insertions(+), 88 deletions(-) diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 1768f9a0da6c9..f411019854e1a 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -18,107 +18,69 @@ limitations under the License. */ #include // for mlock and munlock #include // for malloc and free -#ifndef PADDLE_ONLY_CPU -#include -#include -#endif // PADDLE_ONLY_CPU - +#include #include "paddle/platform/assert.h" +#include "paddle/platform/cuda.h" + +DEFINE_bool(uses_pinned_memory, false, + "If set, allocate cpu/gpu pinned memory."); namespace paddle { namespace memory { namespace detail { -class CPUDeleter { - public: - CPUDeleter(void* ptr, size_t size, bool locked) - : ptr_(ptr), size_(size), locked_(locked) {} - - void* Ptr() { return ptr_; } - - void operator()(void* ptr) { - PADDLE_ASSERT(ptr == ptr_); - if (ptr_ != nullptr && locked_) { - munlock(ptr_, size_); - } - std::free(ptr_); - } - - private: - void* ptr_; - size_t size_; - bool locked_; -}; - -// CPUAllocator calls mlock, which returns pinned -// and locked memory as staging areas for data exchange between host -// and device. Allocates too much would reduce the amount of memory -// available to the system for paging. So, by default, we should use -// CPUAllocator. -template +// If uses_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the amount +// of memory available to the system for paging. So, by default, we +// should set false to uses_pinned_memory. class CPUAllocator { public: - static CPUDeleter Alloc(size_t size) { + static void* Alloc(size_t size) { void* p = std::malloc(size); - if (p != nullptr && lock_memory) { + if (p != nullptr && FLAGS_uses_pinned_memory) { mlock(p, size); } - return CPUDeleter(p, size, lock_memory); + return p; } -}; - -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -namespace { -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} -} // namespace - -class GPUDeleter { - public: - GPUDeleter(void* ptr, size_t size, bool staging) - : ptr_(ptr), size_(size), staging_(staging) {} - - void* Ptr() { return ptr_; } - void operator()(void* ptr) { - PADDLE_ASSERT(ptr == ptr_); - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the - // driver has already shutdown. This happens only if the - // process is terminating, in which case we don't care if - // cudaFree succeeds. - cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr); - if (err != cudaErrorCudartUnloading) { - throw_on_error(err, "cudaFree{Host} failed"); + static void Free(void* p, size_t size) { + if (p != nullptr && FLAGS_uses_pinned_memory) { + munlock(p, size); } + std::free(p); } - - private: - void* ptr_; - size_t size_; - bool staging_; }; +#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. + // GPUAllocator calls cudaHostMalloc, which returns // pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the // amount of memory available to the system for paging. So, by // default, we should use GPUAllocator. -template class GPUAllocator { public: - static GPUDeleter Alloc(size_t size) { + static void* Alloc(size_t size) { void* p = 0; - cudaError_t result = - staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); + cudaError_t result = FLAGS_uses_pinned_memory ? cudaMallocHost(&p, size) + : cudaMalloc(&p, size); if (result != cudaSuccess) { cudaGetLastError(); // clear error if there is any. } - return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging); + return result == cudaSuccess ? p : nullptr; + } + + static void Free(void* p, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + cudaError_t err = FLAGS_uses_pinned_memory ? cudaFreeHost(p) : cudaFree(p); + if (err != cudaErrorCudartUnloading) { + platform::throw_on_error(err, "cudaFree{Host} failed"); + } } }; diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index fec70a65b77d5..829d3558ba4dd 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -17,34 +17,44 @@ limitations under the License. */ #include #include +#include "glog/logging.h" #include "gtest/gtest.h" template -void TestAllocator() { - { - auto d = Allocator::Alloc(sizeof(int)); - EXPECT_NE(d.Ptr(), nullptr); - std::unique_ptr p(static_cast(d.Ptr()), d); - } - { - auto d = Allocator::Alloc(0); - EXPECT_EQ(d.Ptr(), nullptr); - std::unique_ptr p(static_cast(d.Ptr()), d); - } +void TestAllocator(void* p) { + p = Allocator::Alloc(1024); + + int* i = static_cast(p); + std::shared_ptr ptr(i, [](int* p) { Allocator::Free(p, 1024); }); + + EXPECT_NE(p, nullptr); } TEST(CPUAllocator, NoLockMem) { - TestAllocator>(); + void* p = nullptr; + FLAGS_uses_pinned_memory = false; + TestAllocator(p); + EXPECT_EQ(p, nullptr); } + TEST(CPUAllocator, LockMem) { - TestAllocator>(); + void* p = nullptr; + FLAGS_uses_pinned_memory = true; + TestAllocator(p); + EXPECT_EQ(p, nullptr); } #ifndef PADDLE_ONLY_CPU TEST(GPUAllocator, NoStaging) { - TestAllocator>(); + void* p = nullptr; + FLAGS_uses_pinned_memory = false; + TestAllocator(p); + EXPECT_EQ(p, nullptr); } TEST(GPUAllocator, Staging) { - TestAllocator>(); + void* p = nullptr; + FLAGS_uses_pinned_memory = true; + TestAllocator(p); + EXPECT_EQ(p, nullptr); } #endif // PADDLE_ONLY_CPU From b8f5922d88e5f7949eb9a469f761ad49981d677a Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 27 Jun 2017 16:32:24 -0700 Subject: [PATCH 13/16] Make CPUAllocator and GPUAllocator subclasses of SystemAllocator --- paddle/memory/detail/CMakeLists.txt | 6 +- paddle/memory/detail/system_allocator.h | 80 +++++-------------- paddle/memory/detail/system_allocator_test.cc | 57 +++++++------ 3 files changed, 59 insertions(+), 84 deletions(-) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index cd5622203ff51..72d3749ad789e 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,5 +1,7 @@ if(${WITH_GPU}) - nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) + nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) else(${WITH_GPU}) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) + cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) endif(${WITH_GPU}) diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index f411019854e1a..184b383f7f782 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -14,76 +14,38 @@ limitations under the License. */ #pragma once -#include // for size_t -#include // for mlock and munlock -#include // for malloc and free - -#include -#include "paddle/platform/assert.h" -#include "paddle/platform/cuda.h" - -DEFINE_bool(uses_pinned_memory, false, - "If set, allocate cpu/gpu pinned memory."); +#include // for size_t namespace paddle { namespace memory { namespace detail { -// If uses_pinned_memory is true, CPUAllocator calls mlock, which -// returns pinned and locked memory as staging areas for data exchange -// between host and device. Allocates too much would reduce the amount -// of memory available to the system for paging. So, by default, we -// should set false to uses_pinned_memory. -class CPUAllocator { +// SystemAllocator is the parent class of CPUAllocator and +// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* +// pointing to the underlying system allocator. An alternative to +// this class hierarchy is to pass a system allocator class to +// BuddyAllocator as a template parameter. This approach makes +// BuddyAllocator a class template, and it's very complicated +// algorithm would make the buddy_allocator.h messy. +class SystemAllocator { public: - static void* Alloc(size_t size) { - void* p = std::malloc(size); - if (p != nullptr && FLAGS_uses_pinned_memory) { - mlock(p, size); - } - return p; - } - - static void Free(void* p, size_t size) { - if (p != nullptr && FLAGS_uses_pinned_memory) { - munlock(p, size); - } - std::free(p); - } + virtual ~SystemAllocator() {} + virtual void* Alloc(size_t size) = 0; + virtual void Free(void* p, size_t size) = 0; }; -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -// GPUAllocator calls cudaHostMalloc, which returns -// pinned and locked memory as staging areas for data exchange -// between host and device. Allocates too much would reduce the -// amount of memory available to the system for paging. So, by -// default, we should use GPUAllocator. -class GPUAllocator { +class CPUAllocator : public SystemAllocator { public: - static void* Alloc(size_t size) { - void* p = 0; - cudaError_t result = FLAGS_uses_pinned_memory ? cudaMallocHost(&p, size) - : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. - } - return result == cudaSuccess ? p : nullptr; - } - - static void Free(void* p, size_t size) { - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the - // driver has already shutdown. This happens only if the - // process is terminating, in which case we don't care if - // cudaFree succeeds. - cudaError_t err = FLAGS_uses_pinned_memory ? cudaFreeHost(p) : cudaFree(p); - if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); - } - } + virtual void* Alloc(size_t size); + virtual void Free(void* p, size_t size); }; +#ifndef PADDLE_ONLY_CPU +class GPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t size); + virtual void Free(void* p, size_t size); +}; #endif // PADDLE_ONLY_CPU } // namespace detail diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 829d3558ba4dd..c461d8ac6265a 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -17,44 +17,55 @@ limitations under the License. */ #include #include -#include "glog/logging.h" +#include "gflags/gflags.h" #include "gtest/gtest.h" -template -void TestAllocator(void* p) { - p = Allocator::Alloc(1024); +DECLARE_bool(use_pinned_memory); - int* i = static_cast(p); - std::shared_ptr ptr(i, [](int* p) { Allocator::Free(p, 1024); }); +void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { + bool freed = false; + { + void* p = a->Alloc(size); + if (size > 0) { + EXPECT_NE(p, nullptr); + } else { + EXPECT_EQ(p, nullptr); + } - EXPECT_NE(p, nullptr); + int* i = static_cast(p); + std::shared_ptr ptr(i, [&freed, a, size](void* p) { + freed = true; + a->Free(p, size); + }); + } + EXPECT_TRUE(freed); } TEST(CPUAllocator, NoLockMem) { - void* p = nullptr; - FLAGS_uses_pinned_memory = false; - TestAllocator(p); - EXPECT_EQ(p, nullptr); + FLAGS_use_pinned_memory = false; + paddle::memory::detail::CPUAllocator a; + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } TEST(CPUAllocator, LockMem) { - void* p = nullptr; - FLAGS_uses_pinned_memory = true; - TestAllocator(p); - EXPECT_EQ(p, nullptr); + FLAGS_use_pinned_memory = true; + paddle::memory::detail::CPUAllocator a; + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } #ifndef PADDLE_ONLY_CPU TEST(GPUAllocator, NoStaging) { - void* p = nullptr; - FLAGS_uses_pinned_memory = false; - TestAllocator(p); - EXPECT_EQ(p, nullptr); + FLAGS_use_pinned_memory = false; + paddle::memory::detail::GPUAllocator a; + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } TEST(GPUAllocator, Staging) { - void* p = nullptr; - FLAGS_uses_pinned_memory = true; - TestAllocator(p); - EXPECT_EQ(p, nullptr); + FLAGS_use_pinned_memory = true; + paddle::memory::detail::GPUAllocator a; + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } #endif // PADDLE_ONLY_CPU From 3e087f763e9c6c15a4f1d542fb3bdc327f7441c7 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Tue, 27 Jun 2017 16:48:25 -0700 Subject: [PATCH 14/16] Add buddy_allocator.cc and system_allocator.cc --- paddle/memory/detail/buddy_allocator.cc | 35 ++++++++ paddle/memory/detail/buddy_allocator.h | 76 ++++++++-------- paddle/memory/detail/system_allocator.cc | 90 +++++++++++++++++++ paddle/memory/detail/system_allocator_test.cc | 24 ++--- 4 files changed, 177 insertions(+), 48 deletions(-) create mode 100644 paddle/memory/detail/buddy_allocator.cc create mode 100644 paddle/memory/detail/system_allocator.cc diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc new file mode 100644 index 0000000000000..895bf319d778a --- /dev/null +++ b/paddle/memory/detail/buddy_allocator.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/system_allocator.h" + +namespace paddle { +namespace memory { +namespace detail { + +BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, + SystemAllocator* system_allocator) + : pool_size_(pool_size), + max_pools_(max_pools), + system_allocator_(system_allocator) { + PADDLE_ASSERT(pool_size > 0); + PADDLE_ASSERT(max_pools > 0); + PADDLE_ASSERT(system_allocator != nullptr); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 35e96fd50782a..129b137ed7386 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #pragma once @@ -20,34 +20,38 @@ namespace paddle { namespace memory { namespace detail { -template class BuddyAllocator { - public: - // TODO(gangliao): This is a draft, add Buddy Allocator Algorithm soon - BuddyAllocator() {} - ~BuddyAllocator() {} - - public: - void* Alloc(size_t size) { - return Allocator::Alloc(size); - } - void Free(void*) { - // Because all info like size are stored in meta data, - // thus it's duplicate if add the parameter `size` in - // `Free(void*)` interface. - } - size_t Used(); + public: + BuddyAllocator(size_t pool_size, size_t max_pools, + SystemAllocator* system_allocator); + ~BuddyAllocator(); + + void* Alloc(size_t size); + void Free(void*); + size_t Used(); + + private: + struct Block { + size_t size_; + Block* left_; // left buddy + Block* right_; // right buddy + }; + + // Initially, there is only one pool. If a Alloc founds not enough + // memory from that pool, and there has not been max_num_pools_, + // create a new pool by calling system_allocator_.Alloc(pool_size_). + std::vector pools_; + + size_t pool_size_; // the size of each pool; + size_t max_num_pools_; // the size of all pools; - public: - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; + SystemAllocator* system_allocator_; - private: - size_t min_alloc_size_; - size_t max_alloc_size_; + std::mutex mutex_; - private: - std::mutex mutex_; + // Disable copy and assignment. + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; }; BuddyAllocator* GetCPUBuddyAllocator() { @@ -63,16 +67,16 @@ BuddyAllocator* GetCPUBuddyAllocator() { BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { static BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); + int gpu_num = platform::GetDeviceCount(); as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); + as[gpu] = new BuddyAllocator(); } } return as[gpu_id]; } -#endif // PADDLE_ONLY_CPU +#endif // PADDLE_ONLY_CPU } // namespace detail } // namespace memory diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc new file mode 100644 index 0000000000000..50bec926f83de --- /dev/null +++ b/paddle/memory/detail/system_allocator.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/detail/system_allocator.h" + +#include // for malloc and free +#include // for mlock and munlock + +#include "gflags/gflags.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cuda.h" + +// If use_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocates too much would reduce the amount +// of memory available to the system for paging. So, by default, we +// should set false to use_pinned_memory. +DEFINE_bool(use_pinned_memory, false, + "If set, allocate cpu/gpu pinned memory."); + +namespace paddle { +namespace memory { +namespace detail { + +void* CPUAllocator::Alloc(size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. + if (size <= 0) return nullptr; + + void* p = malloc(size); + if (p != nullptr && FLAGS_use_pinned_memory) { + mlock(p, size); + } + return p; +} + +void CPUAllocator::Free(void* p, size_t size) { + if (p != nullptr && FLAGS_use_pinned_memory) { + munlock(p, size); + } + free(p); +} + +#ifndef PADDLE_ONLY_CPU + +void* GPUAllocator::Alloc(size_t size) { + // CUDA documentation doesn't explain if cudaMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) { + return nullptr; + } + + void* p = 0; + cudaError_t result = + FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); + if (result != cudaSuccess) { + cudaGetLastError(); // clear error if there is any. + } + return result == cudaSuccess ? p : nullptr; +} + +void GPUAllocator::Free(void* p, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p); + if (err != cudaErrorCudartUnloading) { + platform::throw_on_error(err, "cudaFree{Host} failed"); + } +} + +#endif // PADDLE_ONLY_CPU + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index c461d8ac6265a..9bd5706a4e4d1 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -22,10 +22,10 @@ limitations under the License. */ DECLARE_bool(use_pinned_memory); -void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { +void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a->Alloc(size); + void* p = a.Alloc(size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -33,9 +33,9 @@ void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { } int* i = static_cast(p); - std::shared_ptr ptr(i, [&freed, a, size](void* p) { + std::shared_ptr ptr(i, [&](void* p) { freed = true; - a->Free(p, size); + a.Free(p, size); }); } EXPECT_TRUE(freed); @@ -44,28 +44,28 @@ void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { TEST(CPUAllocator, NoLockMem) { FLAGS_use_pinned_memory = false; paddle::memory::detail::CPUAllocator a; - TestAllocator(&a, 2048); - TestAllocator(&a, 0); + TestAllocator(a, 2048); + TestAllocator(a, 0); } TEST(CPUAllocator, LockMem) { FLAGS_use_pinned_memory = true; paddle::memory::detail::CPUAllocator a; - TestAllocator(&a, 2048); - TestAllocator(&a, 0); + TestAllocator(a, 2048); + TestAllocator(a, 0); } #ifndef PADDLE_ONLY_CPU TEST(GPUAllocator, NoStaging) { FLAGS_use_pinned_memory = false; paddle::memory::detail::GPUAllocator a; - TestAllocator(&a, 2048); - TestAllocator(&a, 0); + TestAllocator(a, 2048); + TestAllocator(a, 0); } TEST(GPUAllocator, Staging) { FLAGS_use_pinned_memory = true; paddle::memory::detail::GPUAllocator a; - TestAllocator(&a, 2048); - TestAllocator(&a, 0); + TestAllocator(a, 2048); + TestAllocator(a, 0); } #endif // PADDLE_ONLY_CPU From 3e9aa7fd8bfac7434057afcdd6ae62ea7a92bff1 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 10:42:54 +0800 Subject: [PATCH 15/16] FIX: Pass CI --- cmake/generic.cmake | 4 ---- paddle/memory/CMakeLists.txt | 6 ------ paddle/memory/detail/buddy_allocator.cc | 2 +- paddle/memory/detail/buddy_allocator.h | 3 +++ 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 840155750e1ac..69e8164a00d1f 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -78,10 +78,6 @@ # # cc_test(example_test SRCS example_test.cc DEPS example glog gflags) -if(WITH_GPU) - add_definitions(-DPADDLE_WITH_GPU) -endif() - if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index e74ce75c9398d..3943c3cfad31d 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,7 +1 @@ add_subdirectory(detail) - -if(${WITH_GPU}) - nv_library(memory SRCS memory.cc) -else(${WITH_GPU}) - cc_library(memory SRCS memory.cc) -endif(${WITH_GPU}) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 895bf319d778a..ebe680f5eea49 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -14,7 +14,7 @@ #pragma once -#include "paddle/memory/detail/system_allocator.h" +#include "paddle/memory/detail/buddy_allocator.h" namespace paddle { namespace memory { diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 129b137ed7386..702c7d28ee5f9 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -16,6 +16,9 @@ #include "paddle/memory/detail/system_allocator.h" +#include +#include + namespace paddle { namespace memory { namespace detail { From 9490d243dd0255021b288f9c2e43c57b30264b9b Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Jun 2017 10:46:14 +0800 Subject: [PATCH 16/16] ENH: clang-format --- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/platform/cuda.h | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 702c7d28ee5f9..82e6aaedc7199 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -16,8 +16,8 @@ #include "paddle/memory/detail/system_allocator.h" -#include #include +#include namespace paddle { namespace memory { diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h index 864a5d3340a8f..8fe891f9ce6c3 100644 --- a/paddle/platform/cuda.h +++ b/paddle/platform/cuda.h @@ -29,13 +29,12 @@ inline void throw_on_error(cudaError_t e, const char* message) { } int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), - "cudaGetDeviceCount failed"); - return count; + int count; + throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); + return count; } } // namespace platform } // namespace paddle -#endif // PADDLE_ONLY_CPU +#endif // PADDLE_ONLY_CPU