From 84d1c734ca2fe7a17e000467823d49891507cf0b Mon Sep 17 00:00:00 2001
From: Yi Wang <yi.wang.2005@gmail.com>
Date: Sun, 25 Jun 2017 15:40:45 -0700
Subject: [PATCH 01/16] add paddle/memory/detail/cpu_allocator*

---
 paddle/CMakeLists.txt                      |  1 +
 paddle/memory/CMakeLists.txt               |  1 +
 paddle/memory/README.md                    | 14 ++---
 paddle/memory/detail/CMakeLists.txt        |  1 +
 paddle/memory/detail/cpu_allocator.h       | 63 ++++++++++++++++++++++
 paddle/memory/detail/cpu_allocator_test.cc | 32 +++++++++++
 paddle/memory/memory.cc                    | 51 ++++++++++++++++++
 paddle/memory/memory.h                     | 27 ++++++++++
 paddle/platform/place.cc                   | 12 ++---
 paddle/platform/place.h                    | 45 ++++++++++------
 paddle/platform/place_test.cc              | 14 ++---
 11 files changed, 224 insertions(+), 37 deletions(-)
 create mode 100644 paddle/memory/CMakeLists.txt
 create mode 100644 paddle/memory/detail/CMakeLists.txt
 create mode 100644 paddle/memory/detail/cpu_allocator.h
 create mode 100644 paddle/memory/detail/cpu_allocator_test.cc
 create mode 100644 paddle/memory/memory.cc
 create mode 100644 paddle/memory/memory.h
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 573bd937a351a..0cddb95244fcf 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(trainer)
 add_subdirectory(scripts)
 add_subdirectory(optimizer)
 add_subdirectory(strings)
+add_subdirectory(memory)
 
 # Do not build go directory until go cmake is working smoothly.
 # if(CMAKE_Go_COMPILER)
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
new file mode 100644
index 0000000000000..3943c3cfad31d
--- /dev/null
+++ b/paddle/memory/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(detail)
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index fd32d07ef40fc..e5f7880e4cad3 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -31,7 +31,7 @@ In `paddle/memory/memory.h` we have:
 namespace memory {
 template <typename Place> void* Alloc(Place, size_t);
 template <typename Place> void Free(Place, void*);
-template <typename Place> void Used(Place);
+template <typename Place> size_t Used(Place);
 }  // namespace memory
 ```
 
@@ -39,7 +39,7 @@ These function templates have specializations on either `platform::CPUPlace` or
 
 ```cpp
 template<>
-void Alloc<CPUPlace>(CPUPlace p, size_t size) {
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
   return GetCPUBuddyAllocator()->Alloc(size);
 }
 ```
@@ -102,15 +102,11 @@ class BuddyAllocator {
 };
 ```
 
-#### System Allocators
-
-The `GPUAllocator` and `CPUAllocator` are calls *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.  A system allocator holds information about a device, including the amount of memory has been allocated, so we can call
+Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`.  Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace.
 
-- `GPUAllocator::Used()` and
-- `CPUAllocator::Used()`
-
-to get the amount of memory that has been allocated so far.
+#### System Allocators
 
+The `GPUAllocator` and `CPUAllocator` are calls *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
 
 ## Justification
 
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
new file mode 100644
index 0000000000000..fb8a11062da91
--- /dev/null
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -0,0 +1 @@
+cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
new file mode 100644
index 0000000000000..8a872d3800d3d
--- /dev/null
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <malloc.h>  // for malloc and free
+#include <stddef.h>  // for size_t
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// CPUAllocator<staging=true> calls cudaMallocHost, which returns
+// pinned and mlocked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use CPUAllocator<staging=false>.
+template <bool staging>
+class CPUAllocator {
+public:
+  void* Alloc(size_t size);
+  void Free(void* p);
+};
+
+template <>
+class CPUAllocator<false> {
+public:
+  void* Alloc(size_t size) { return malloc(size); }
+  void Free(void* p) { free(p); }
+};
+
+// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the
+// following specialization that depends on the CUDA library.
+#ifdef WITH_GPU
+template <>
+class CPUAllocator<true> {
+public:
+  void* Alloc(size_t size) {
+    void* p;
+    if (cudaMallocHost(&p, size) != cudaSuccess) {
+      return NULL;
+    }
+    return *p;
+  }
+
+  void Free(void* p) { cudaFreeHost(p); }
+};
+#endif  // WITH_GPU
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
new file mode 100644
index 0000000000000..0aa33a22fd0bc
--- /dev/null
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/cpu_allocator.h"
+#include "gtest/gtest.h"
+
+TEST(CPUAllocator, NonStaging) {
+  paddle::memory::detail::CPUAllocator<false> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p);
+}
+
+#ifdef WITH_GPU
+TEST(CPUAllocator, Staging) {
+  paddle::memory::detail::CPUAllocator<true> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p);
+}
+#endif  // WITH_GPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
new file mode 100644
index 0000000000000..5f1253ede6818
--- /dev/null
+++ b/paddle/memory/memory.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+
+namespace paddle {
+namespace memory {
+
+template <>
+void* Alloc<CPUPlace>(CPUPlace, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+template <>
+void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
+  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+}
+
+template <>
+void Free<CPUPlace>(CPUPlace, void* p) {
+  return GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
+  return GetGPUBuddyAllocator(pl.device)->Free(p);
+}
+
+template <>
+size_t Used<CPUPlace>(CPUPlace) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
+template <>
+size_t Alloc<GPUPlace>(GPUPlace pl) {
+  return GetGPUBuddyAllocator(pl.device)->Used();
+}
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
new file mode 100644
index 0000000000000..ae8ac6ca523ad
--- /dev/null
+++ b/paddle/memory/memory.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/frameowork/place.h"
+
+namespace paddle {
+namespace memory {
+
+typename<typename paddle::framework::Place> void* Alloc(Place, size_t);
+typename<typename paddle::framework::Place> void Free(Place, void*);
+typename<typename paddle::framework::Place> size_t Used(Place);
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index 1afd03c01169d..0704820aa0507 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -8,8 +8,8 @@ namespace detail {
 class PlacePrinter : public boost::static_visitor<> {
  public:
   PlacePrinter(std::ostream &os) : os_(os) {}
-  void operator()(const CpuPlace &) { os_ << "CpuPlace"; }
-  void operator()(const GpuPlace &p) { os_ << "GpuPlace(" << p.device << ")"; }
+  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
+  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }
 
  private:
   std::ostream &os_;
@@ -22,14 +22,14 @@ static Place the_default_place;
 void set_place(const Place &place) { the_default_place = place; }
 const Place &get_place() { return the_default_place; }
 
-const GpuPlace default_gpu() { return GpuPlace(0); }
-const CpuPlace default_cpu() { return CpuPlace(); }
+const GPUPlace default_gpu() { return GPUPlace(0); }
+const CPUPlace default_cpu() { return CPUPlace(); }
 
 bool is_gpu_place(const Place &p) {
-  return boost::apply_visitor(IsGpuPlace(), p);
+  return boost::apply_visitor(IsGPUPlace(), p);
 }
 bool is_cpu_place(const Place &p) {
-  return !boost::apply_visitor(IsGpuPlace(), p);
+  return !boost::apply_visitor(IsGPUPlace(), p);
 }
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 489572c526e16..7cead183884bc 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -1,43 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
+
 #include <boost/variant.hpp>
 #include <iostream>
 
 namespace paddle {
 namespace platform {
 
-struct CpuPlace {
+struct CPUPlace {
   // WORKAROUND: for some reason, omitting this constructor
   // causes errors with boost 1.59 and OSX
-  CpuPlace() {}
+  CPUPlace() {}
 
   // needed for variant equality comparison
-  inline bool operator==(const CpuPlace &) const { return true; }
-  inline bool operator!=(const CpuPlace &) const { return false; }
+  inline bool operator==(const CPUPlace &) const { return true; }
+  inline bool operator!=(const CPUPlace &) const { return false; }
 };
 
-struct GpuPlace {
-  GpuPlace() : GpuPlace(0) {}
-  GpuPlace(int d) : device(d) {}
+struct GPUPlace {
+  GPUPlace() : GPUPlace(0) {}
+  GPUPlace(int d) : device(d) {}
 
   // needed for variant equality comparison
-  inline bool operator==(const GpuPlace &o) const { return device == o.device; }
-  inline bool operator!=(const GpuPlace &o) const { return !(*this == o); }
+  inline bool operator==(const GPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
 
   int device;
 };
 
-struct IsGpuPlace : public boost::static_visitor<bool> {
-  bool operator()(const CpuPlace &) const { return false; }
-  bool operator()(const GpuPlace &gpu) const { return true; }
+struct IsGPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const GPUPlace &gpu) const { return true; }
 };
 
-typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<GPUPlace, CPUPlace> Place;
 
 void set_place(const Place &);
 const Place &get_place();
 
-const GpuPlace default_gpu();
-const CpuPlace default_cpu();
+const GPUPlace default_gpu();
+const CPUPlace default_cpu();
 
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc
index 73fccceedf691..33e2e5a439ce6 100644
--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
@@ -3,8 +3,8 @@
 #include "gtest/gtest.h"
 
 TEST(Place, Equality) {
-  paddle::platform::CpuPlace cpu;
-  paddle::platform::GpuPlace g0(0), g1(1), gg0(0);
+  paddle::platform::CPUPlace cpu;
+  paddle::platform::GPUPlace g0(0), g1(1), gg0(0);
 
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
@@ -22,19 +22,19 @@ TEST(Place, Default) {
   EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
 
-  paddle::platform::set_place(paddle::platform::CpuPlace());
+  paddle::platform::set_place(paddle::platform::CPUPlace());
   EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
 }
 
 TEST(Place, Print) {
   {
     std::stringstream ss;
-    ss << paddle::platform::GpuPlace(1);
-    EXPECT_EQ("GpuPlace(1)", ss.str());
+    ss << paddle::platform::GPUPlace(1);
+    EXPECT_EQ("GPUPlace(1)", ss.str());
   }
   {
     std::stringstream ss;
-    ss << paddle::platform::CpuPlace();
-    EXPECT_EQ("CpuPlace", ss.str());
+    ss << paddle::platform::CPUPlace();
+    EXPECT_EQ("CPUPlace", ss.str());
   }
 }

From db128c4586c3c925a6c53a9ae770cb07cdbea1bf Mon Sep 17 00:00:00 2001
From: Yi Wang <yi.wang.2005@gmail.com>
Date: Sun, 25 Jun 2017 17:54:06 -0700
Subject: [PATCH 02/16] Pass cpu_allocator_test

---
 CMakeLists.txt                             |  2 +-
 cmake/generic.cmake                        |  4 ++++
 paddle/memory/detail/CMakeLists.txt        |  6 +++++-
 paddle/memory/detail/cpu_allocator.h       | 13 +++++++++----
 paddle/memory/detail/cpu_allocator_test.cc | 16 +++++++++++-----
 paddle/memory/memory.cc                    | 14 ++++++++++++--
 paddle/memory/memory.h                     | 16 +++++++++++++---
 7 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5d7f2c7ec76d..3c719d35eced2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,7 +71,7 @@ if(ANDROID)
         "Disable RDMA when cross-compiling for Android" FORCE)
 endif(ANDROID)
 
-set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
 if (WITH_C_API AND WITH_PYTHON)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 69e8164a00d1f..840155750e1ac 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -78,6 +78,10 @@
 #
 #   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
 
+if(WITH_GPU)
+  add_definitions(-DPADDLE_WITH_GPU)
+endif()
+
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index fb8a11062da91..c425e9f947d07 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,5 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but
+else(${WITH_GPU})
+  cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't.
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index 8a872d3800d3d..0d8ea3f52b92f 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -17,6 +17,11 @@ limitations under the License. */
 #include <malloc.h>  // for malloc and free
 #include <stddef.h>  // for size_t
 
+#ifdef PADDLE_WITH_GPU
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#endif  // PADDLE_WITH_GPU
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -40,9 +45,9 @@ class CPUAllocator<false> {
   void Free(void* p) { free(p); }
 };
 
-// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the
+// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the
 // following specialization that depends on the CUDA library.
-#ifdef WITH_GPU
+#ifdef PADDLE_WITH_GPU
 template <>
 class CPUAllocator<true> {
 public:
@@ -51,12 +56,12 @@ class CPUAllocator<true> {
     if (cudaMallocHost(&p, size) != cudaSuccess) {
       return NULL;
     }
-    return *p;
+    return p;
   }
 
   void Free(void* p) { cudaFreeHost(p); }
 };
-#endif  // WITH_GPU
+#endif  // PADDLE_WITH_GPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
index 0aa33a22fd0bc..464bc84e5c7b5 100644
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -22,11 +22,17 @@ TEST(CPUAllocator, NonStaging) {
   a.Free(p);
 }
 
-#ifdef WITH_GPU
+#ifdef PADDLE_WITH_GPU
 TEST(CPUAllocator, Staging) {
   paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p);
+
+  int devices;
+  if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) {
+    void* p = a.Alloc(4096);
+    EXPECT_NE(p, nullptr);
+    a.Free(p);
+  } else {
+    EXPECT_EQ(a.Alloc(4096), nullptr);
+  }
 }
-#endif  // WITH_GPU
+#endif  // PADDLE_WITH_GPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5f1253ede6818..b617923731a4d 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -19,7 +19,11 @@ namespace memory {
 
 template <>
 void* Alloc<CPUPlace>(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator()->Alloc(size);
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
+}
+
+void* AllocStaging(CPUPlace, size_t size) {
+  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
 }
 
 template <>
@@ -29,9 +33,14 @@ void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
 
 template <>
 void Free<CPUPlace>(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator()->Free(p);
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
+}
+
+void FreeStaging(CPUPlace, void* p) {
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
 }
 
+#ifdef PADDLE_WITH_GPU
 template <>
 void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
   return GetGPUBuddyAllocator(pl.device)->Free(p);
@@ -46,6 +55,7 @@ template <>
 size_t Alloc<GPUPlace>(GPUPlace pl) {
   return GetGPUBuddyAllocator(pl.device)->Used();
 }
+#endif  // PADDLE_WITH_GPU
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index ae8ac6ca523ad..8c15a133bb4e9 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,9 +19,19 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-typename<typename paddle::framework::Place> void* Alloc(Place, size_t);
-typename<typename paddle::framework::Place> void Free(Place, void*);
-typename<typename paddle::framework::Place> size_t Used(Place);
+template <typename paddle::framework::Place>
+void* Alloc(Place, size_t);
+template <typename paddle::framework::Place>
+void Free(Place, void*);
+template <typename paddle::framework::Place>
+size_t Used(Place);
+
+// Staging memory means "pinned" host memory that can be mapped into
+// the CUDA memory space and accessed by the device rapidly.  Don't
+// allocate too much staging memory; otherwise system performance will
+// degrade because the OS cannot find enough swap memory space.
+void* AllocStaging(CPUPlace, size_t);
+void* FreeStaging(CPUPlace, size_t);
 
 }  // namespace memory
 }  // namespace paddle

From ce938ae5f9baea2b2d136154ee9a696b394929e1 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 26 Jun 2017 23:32:46 +0800
Subject: [PATCH 03/16] FIX: Pinned memory

---
 paddle/memory/README.md                    |  1 +
 paddle/memory/detail/CMakeLists.txt        |  6 +---
 paddle/memory/detail/cpu_allocator.h       | 39 ++++++++++++----------
 paddle/memory/detail/cpu_allocator_test.cc | 16 +++------
 4 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index e5f7880e4cad3..96a331a486f57 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -97,6 +97,7 @@ class BuddyAllocator {
   struct Block {
     size_t size;
     Block* left, right;
+    size_t index; // allocator id
   };
   ...
 };
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index c425e9f947d07..fb8a11062da91 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,5 +1 @@
-if(${WITH_GPU})
-  nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # nv_test links CUDA, but
-else(${WITH_GPU})
-  cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc) # cc_test doesn't.
-endif(${WITH_GPU})
+cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index 0d8ea3f52b92f..a487fecef49b7 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -14,20 +14,19 @@ limitations under the License. */
 
 #pragma once
 
-#include <malloc.h>  // for malloc and free
 #include <stddef.h>  // for size_t
+#include <cstdlib>   // for malloc and free
 
-#ifdef PADDLE_WITH_GPU
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#endif  // PADDLE_WITH_GPU
+#ifndef _WIN32
+#include <sys/mman.h>  // for mlock and munlock
+#endif
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// CPUAllocator<staging=true> calls cudaMallocHost, which returns
-// pinned and mlocked memory as staging areas for data exchange
+// CPUAllocator<staging=true> calls mlock, which returns
+// pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
 // amount of memory available to the system for paging.  So, by
 // default, we should use CPUAllocator<staging=false>.
@@ -35,33 +34,37 @@ template <bool staging>
 class CPUAllocator {
 public:
   void* Alloc(size_t size);
-  void Free(void* p);
+  void Free(void* p, size_t size);
 };
 
 template <>
 class CPUAllocator<false> {
 public:
-  void* Alloc(size_t size) { return malloc(size); }
-  void Free(void* p) { free(p); }
+  void* Alloc(size_t size) { return std::malloc(size); }
+  void Free(void* p, size_t size) { std::free(p); }
 };
 
-// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the
-// following specialization that depends on the CUDA library.
-#ifdef PADDLE_WITH_GPU
 template <>
 class CPUAllocator<true> {
 public:
   void* Alloc(size_t size) {
-    void* p;
-    if (cudaMallocHost(&p, size) != cudaSuccess) {
-      return NULL;
+    void* p = std::malloc(size);
+    if (p == nullptr) {
+      return p;
     }
+#ifndef _WIN32
+    mlock(p, size);
+#endif
     return p;
   }
 
-  void Free(void* p) { cudaFreeHost(p); }
+  void Free(void* p, size_t size) {
+#ifndef _WIN32
+    munlock(p, size);
+#endif
+    std::free(p);
+  }
 };
-#endif  // PADDLE_WITH_GPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
index 464bc84e5c7b5..4e45266cd8ad8 100644
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -19,20 +19,12 @@ TEST(CPUAllocator, NonStaging) {
   paddle::memory::detail::CPUAllocator<false> a;
   void* p = a.Alloc(4096);
   EXPECT_NE(p, nullptr);
-  a.Free(p);
+  a.Free(p, 4096);
 }
 
-#ifdef PADDLE_WITH_GPU
 TEST(CPUAllocator, Staging) {
   paddle::memory::detail::CPUAllocator<true> a;
-
-  int devices;
-  if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) {
-    void* p = a.Alloc(4096);
-    EXPECT_NE(p, nullptr);
-    a.Free(p);
-  } else {
-    EXPECT_EQ(a.Alloc(4096), nullptr);
-  }
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
 }
-#endif  // PADDLE_WITH_GPU

From ce70df86b1e8c892cdde5312caa0c2699f368f7d Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 27 Jun 2017 00:15:36 +0800
Subject: [PATCH 04/16] Add gpu_allocator

---
 paddle/memory/.clang-format                |  5 ++
 paddle/memory/detail/CMakeLists.txt        |  1 +
 paddle/memory/detail/cpu_allocator.h       |  6 +-
 paddle/memory/detail/gpu_allocator.h       | 92 ++++++++++++++++++++++
 paddle/memory/detail/gpu_allocator_test.cc | 30 +++++++
 5 files changed, 131 insertions(+), 3 deletions(-)
 create mode 100644 paddle/memory/.clang-format
 create mode 100644 paddle/memory/detail/gpu_allocator.h
 create mode 100644 paddle/memory/detail/gpu_allocator_test.cc

diff --git a/paddle/memory/.clang-format b/paddle/memory/.clang-format
new file mode 100644
index 0000000000000..29282dc87e2c4
--- /dev/null
+++ b/paddle/memory/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index fb8a11062da91..81ca8a0bbf0ef 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,2 @@
 cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
+nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index a487fecef49b7..17753ccef718f 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -32,21 +32,21 @@ namespace detail {
 // default, we should use CPUAllocator<staging=false>.
 template <bool staging>
 class CPUAllocator {
-public:
+ public:
   void* Alloc(size_t size);
   void Free(void* p, size_t size);
 };
 
 template <>
 class CPUAllocator<false> {
-public:
+ public:
   void* Alloc(size_t size) { return std::malloc(size); }
   void Free(void* p, size_t size) { std::free(p); }
 };
 
 template <>
 class CPUAllocator<true> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = std::malloc(size);
     if (p == nullptr) {
diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h
new file mode 100644
index 0000000000000..9452c41fb8975
--- /dev/null
+++ b/paddle/memory/detail/gpu_allocator.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>  // for size_t
+
+#include <thrust/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
+// pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use GPUAllocator<staging=false>.
+template <bool staging>
+class GPUAllocator {
+public:
+  void* Alloc(size_t size);
+  void Free(void* p, size_t size);
+};
+
+template <>
+class GPUAllocator<false> {
+public:
+  void* Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      return p;
+    }
+    // clear last error
+    cudaGetLastError();
+    return nullptr;
+  }
+
+  void Free(void* p, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds. 
+    auto err = cudaFree(p);
+    if (err != cudaErrorCudartUnloading) {
+        throw_on_error(err, "cudaFree failed");
+    }
+  }
+};
+
+template <>
+class GPUAllocator<true> {
+public:
+  void* Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result = cudaMallocHost(&p, size);
+    if (result == cudaSuccess) {
+        return p;
+    }
+    // clear last error
+    cudaGetLastError();
+    return nullptr;
+  }
+
+  void Free(void* p, size_t size) {
+    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
+  }
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/gpu_allocator_test.cc
new file mode 100644
index 0000000000000..18c1c9ab43084
--- /dev/null
+++ b/paddle/memory/detail/gpu_allocator_test.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/gpu_allocator.h"
+#include "gtest/gtest.h"
+
+TEST(GPUAllocator, NonStaging) {
+  paddle::memory::detail::GPUAllocator<false> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}
+
+TEST(GPUAllocator, Staging) {
+  paddle::memory::detail::GPUAllocator<true> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}

From e02859c0f53dfe4616976b015d4fefd8aaa6eb39 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Mon, 26 Jun 2017 15:27:01 -0700
Subject: [PATCH 05/16] Replace {cpu,gpu}_allocator.h and
 {cpu,gpu}_allocator_test.cc by system_allocator{.h,_test.cc}

---
 paddle/memory/CMakeLists.txt                  |  6 ++
 paddle/memory/detail/CMakeLists.txt           |  3 +-
 paddle/memory/detail/cpu_allocator.h          | 71 -----------------
 paddle/memory/detail/cpu_allocator_test.cc    | 30 -------
 .../{gpu_allocator.h => system_allocator.h}   | 79 +++++++++++--------
 ...cator_test.cc => system_allocator_test.cc} | 20 ++++-
 paddle/memory/memory.cc                       | 67 +++++++---------
 paddle/memory/memory.h                        | 16 +---
 8 files changed, 106 insertions(+), 186 deletions(-)
 delete mode 100644 paddle/memory/detail/cpu_allocator.h
 delete mode 100644 paddle/memory/detail/cpu_allocator_test.cc
 rename paddle/memory/detail/{gpu_allocator.h => system_allocator.h} (58%)
 rename paddle/memory/detail/{gpu_allocator_test.cc => system_allocator_test.cc} (69%)

diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 3943c3cfad31d..86625124967d7 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1 +1,7 @@
 add_subdirectory(detail)
+
+if(${WITH_GPU})
+  nv_library(memory SRCS memory.cc)
+else(${WITH_GPU})
+  cc_library(memory SRCS memroy.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 81ca8a0bbf0ef..3b5bbd7a12fab 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,2 +1 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
-nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
+cc_test(system_allocator_test SRCS system_allocator_test.cc)
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
deleted file mode 100644
index 17753ccef718f..0000000000000
--- a/paddle/memory/detail/cpu_allocator.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-#include <cstdlib>   // for malloc and free
-
-#ifndef _WIN32
-#include <sys/mman.h>  // for mlock and munlock
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-// CPUAllocator<staging=true> calls mlock, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use CPUAllocator<staging=false>.
-template <bool staging>
-class CPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class CPUAllocator<false> {
- public:
-  void* Alloc(size_t size) { return std::malloc(size); }
-  void Free(void* p, size_t size) { std::free(p); }
-};
-
-template <>
-class CPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p == nullptr) {
-      return p;
-    }
-#ifndef _WIN32
-    mlock(p, size);
-#endif
-    return p;
-  }
-
-  void Free(void* p, size_t size) {
-#ifndef _WIN32
-    munlock(p, size);
-#endif
-    std::free(p);
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
deleted file mode 100644
index 4e45266cd8ad8..0000000000000
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(CPUAllocator, NonStaging) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(CPUAllocator, Staging) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/system_allocator.h
similarity index 58%
rename from paddle/memory/detail/gpu_allocator.h
rename to paddle/memory/detail/system_allocator.h
index 9452c41fb8975..0a64553188995 100644
--- a/paddle/memory/detail/gpu_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -14,20 +14,58 @@ limitations under the License. */
 
 #pragma once
 
-#include <stddef.h>  // for size_t
+#include <stddef.h>    // for size_t
+#include <sys/mman.h>  // for mlock and munlock
+#include <cstdlib>     // for malloc and free
 
-#include <thrust/system_error.h>
+#ifndef PADDLE_ONLY_CPU
 #include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif  // PADDLE_ONLY_CPU
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
+class SystemAllocator {
+ public:
+  virtual void* Alloc(size_t size) = 0;
+  virtual void* Free(void* p) = 0;
+};
+
+// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
+// and locked memory as staging areas for data exchange between host
+// and device.  Allocates too much would reduce the amount of memory
+// available to the system for paging.  So, by default, we should use
+// CPUAllocator<staging=false>.
+template <bool lock_memory>
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t size) {
+    void* p = std::malloc(size);
+    if (p != nullptr && lock_memory) {
+      mlock(p, size);
+    }
+    return p;
+  }
+
+  virtual void Free(void* p, size_t size) {
+    if (p != nullptr && lock_memory) {
+      munlock(p, size);
+    }
+    std::free(p);
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+namespace {
 inline void throw_on_error(cudaError_t e, const char* message) {
   if (e) {
     throw thrust::system_error(e, thrust::cuda_category(), message);
   }
 }
+}  // namespace
 
 // GPUAllocator<staging=true> calls cudaHostMalloc, which returns
 // pinned and locked memory as staging areas for data exchange
@@ -36,17 +74,11 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 // default, we should use GPUAllocator<staging=false>.
 template <bool staging>
 class GPUAllocator {
-public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class GPUAllocator<false> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = 0;
-    cudaError_t result = cudaMalloc(&p, size);
+    cudaError_t result =
+        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
     if (result == cudaSuccess) {
       return p;
     }
@@ -60,32 +92,15 @@ class GPUAllocator<false> {
     // that is returned if you ever call cudaFree after the
     // driver has already shutdown. This happens only if the
     // process is terminating, in which case we don't care if
-    // cudaFree succeeds. 
-    auto err = cudaFree(p);
+    // cudaFree succeeds.
+    auto err = staging ? cudaFreeHost(p) : cudaFree(p);
     if (err != cudaErrorCudartUnloading) {
-        throw_on_error(err, "cudaFree failed");
+      throw_on_error(err, "cudaFree failed");
     }
   }
 };
 
-template <>
-class GPUAllocator<true> {
-public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMallocHost(&p, size);
-    if (result == cudaSuccess) {
-        return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
-  }
-};
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
similarity index 69%
rename from paddle/memory/detail/gpu_allocator_test.cc
rename to paddle/memory/detail/system_allocator_test.cc
index 18c1c9ab43084..4e7b8018b6a07 100644
--- a/paddle/memory/detail/gpu_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -12,9 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/gpu_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
 #include "gtest/gtest.h"
 
+TEST(CPUAllocator, NoLockMem) {
+  paddle::memory::detail::CPUAllocator<false> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}
+
+TEST(CPUAllocator, LockMem) {
+  paddle::memory::detail::CPUAllocator<true> a;
+  void* p = a.Alloc(4096);
+  EXPECT_NE(p, nullptr);
+  a.Free(p, 4096);
+}
+
+#ifndef PADDLE_ONLY_CPU
+
 TEST(GPUAllocator, NonStaging) {
   paddle::memory::detail::GPUAllocator<false> a;
   void* p = a.Alloc(4096);
@@ -28,3 +44,5 @@ TEST(GPUAllocator, Staging) {
   EXPECT_NE(p, nullptr);
   a.Free(p, 4096);
 }
+
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index b617923731a4d..ca3c01ebdb035 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -14,48 +14,41 @@ limitations under the License. */
 
 #include "paddle/memory/memory.h"
 
+#include "paddle/memory/detail/cpu_allocator.h"
+#include "paddle/memory/detail/gpu_allocator.h"
+
 namespace paddle {
 namespace memory {
 
-template <>
-void* Alloc<CPUPlace>(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
-}
-
-void* AllocStaging(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
-}
-
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
-  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
-}
-
-template <>
-void Free<CPUPlace>(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-void FreeStaging(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-#ifdef PADDLE_WITH_GPU
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
-  return GetGPUBuddyAllocator(pl.device)->Free(p);
-}
-
-template <>
-size_t Used<CPUPlace>(CPUPlace) {
+void Alloc(paddle::platform::Place pl, size_t size) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+void Free(paddle::platform::Place pl, void* p) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    GetGPUBuddyAllocator(pl.device)->Free(p);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+size_t Used(paddle::platform::Place pl) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Used();
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
   return GetCPUBuddyAllocator()->Used();
 }
 
-template <>
-size_t Alloc<GPUPlace>(GPUPlace pl) {
-  return GetGPUBuddyAllocator(pl.device)->Used();
-}
-#endif  // PADDLE_WITH_GPU
-
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 8c15a133bb4e9..0bc609205eca2 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,19 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-template <typename paddle::framework::Place>
-void* Alloc(Place, size_t);
-template <typename paddle::framework::Place>
-void Free(Place, void*);
-template <typename paddle::framework::Place>
-size_t Used(Place);
-
-// Staging memory means "pinned" host memory that can be mapped into
-// the CUDA memory space and accessed by the device rapidly.  Don't
-// allocate too much staging memory; otherwise system performance will
-// degrade because the OS cannot find enough swap memory space.
-void* AllocStaging(CPUPlace, size_t);
-void* FreeStaging(CPUPlace, size_t);
+void* Alloc(paddle::framework::Place, size_t);
+void Free(paddle::framework::Place, void*);
+size_t Used(paddle::framework::Place);
 
 }  // namespace memory
 }  // namespace paddle

From 6250d108bfd39afb3b2beba438ecb22eca8991bc Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 27 Jun 2017 09:51:55 +0800
Subject: [PATCH 06/16] FIX: clang-format

---
 paddle/memory/detail/gpu_allocator.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h
index 9452c41fb8975..682afdf7d3349 100644
--- a/paddle/memory/detail/gpu_allocator.h
+++ b/paddle/memory/detail/gpu_allocator.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <stddef.h>  // for size_t
 
-#include <thrust/system_error.h>
 #include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
 
 namespace paddle {
 namespace memory {
@@ -36,14 +36,14 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 // default, we should use GPUAllocator<staging=false>.
 template <bool staging>
 class GPUAllocator {
-public:
+ public:
   void* Alloc(size_t size);
   void Free(void* p, size_t size);
 };
 
 template <>
 class GPUAllocator<false> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = 0;
     cudaError_t result = cudaMalloc(&p, size);
@@ -60,22 +60,22 @@ class GPUAllocator<false> {
     // that is returned if you ever call cudaFree after the
     // driver has already shutdown. This happens only if the
     // process is terminating, in which case we don't care if
-    // cudaFree succeeds. 
+    // cudaFree succeeds.
     auto err = cudaFree(p);
     if (err != cudaErrorCudartUnloading) {
-        throw_on_error(err, "cudaFree failed");
+      throw_on_error(err, "cudaFree failed");
     }
   }
 };
 
 template <>
 class GPUAllocator<true> {
-public:
+ public:
   void* Alloc(size_t size) {
     void* p = 0;
     cudaError_t result = cudaMallocHost(&p, size);
     if (result == cudaSuccess) {
-        return p;
+      return p;
     }
     // clear last error
     cudaGetLastError();

From f149d183f7d78fdaa171f2afabaf8a138596c8ff Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Mon, 26 Jun 2017 20:41:33 -0700
Subject: [PATCH 07/16] Add system_allocator

---
 paddle/memory/detail/CMakeLists.txt           |  6 +-
 paddle/memory/detail/system_allocator.h       | 84 ++++++++++++-------
 paddle/memory/detail/system_allocator_test.cc | 44 +++++-----
 3 files changed, 81 insertions(+), 53 deletions(-)

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 3b5bbd7a12fab..c16dfadeb2180 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,5 @@
-cc_test(system_allocator_test SRCS system_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+else(${WITH_GPU})
+  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 0a64553188995..1768f9a0da6c9 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -23,14 +23,31 @@ limitations under the License. */
 #include <thrust/system_error.h>
 #endif  // PADDLE_ONLY_CPU
 
+#include "paddle/platform/assert.h"
+
 namespace paddle {
 namespace memory {
 namespace detail {
 
-class SystemAllocator {
+class CPUDeleter {
  public:
-  virtual void* Alloc(size_t size) = 0;
-  virtual void* Free(void* p) = 0;
+  CPUDeleter(void* ptr, size_t size, bool locked)
+      : ptr_(ptr), size_(size), locked_(locked) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    if (ptr_ != nullptr && locked_) {
+      munlock(ptr_, size_);
+    }
+    std::free(ptr_);
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool locked_;
 };
 
 // CPUAllocator<lock_memory=true> calls mlock, which returns pinned
@@ -39,21 +56,14 @@ class SystemAllocator {
 // available to the system for paging.  So, by default, we should use
 // CPUAllocator<staging=false>.
 template <bool lock_memory>
-class CPUAllocator : public SystemAllocator {
+class CPUAllocator {
  public:
-  virtual void* Alloc(size_t size) {
+  static CPUDeleter Alloc(size_t size) {
     void* p = std::malloc(size);
     if (p != nullptr && lock_memory) {
       mlock(p, size);
     }
-    return p;
-  }
-
-  virtual void Free(void* p, size_t size) {
-    if (p != nullptr && lock_memory) {
-      munlock(p, size);
-    }
-    std::free(p);
+    return CPUDeleter(p, size, lock_memory);
   }
 };
 
@@ -67,6 +77,32 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 }
 }  // namespace
 
+class GPUDeleter {
+ public:
+  GPUDeleter(void* ptr, size_t size, bool staging)
+      : ptr_(ptr), size_(size), staging_(staging) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      throw_on_error(err, "cudaFree{Host} failed");
+    }
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool staging_;
+};
+
 // GPUAllocator<staging=true> calls cudaHostMalloc, which returns
 // pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
@@ -75,28 +111,14 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 template <bool staging>
 class GPUAllocator {
  public:
-  void* Alloc(size_t size) {
+  static GPUDeleter Alloc(size_t size) {
     void* p = 0;
     cudaError_t result =
         staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown. This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    auto err = staging ? cudaFreeHost(p) : cudaFree(p);
-    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree failed");
+    if (result != cudaSuccess) {
+      cudaGetLastError();  // clear error if there is any.
     }
+    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
   }
 };
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 4e7b8018b6a07..fec70a65b77d5 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -13,36 +13,38 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>
+#include <vector>
+
 #include "gtest/gtest.h"
 
-TEST(CPUAllocator, NoLockMem) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+template <typename Allocator>
+void TestAllocator() {
+  {
+    auto d = Allocator::Alloc(sizeof(int));
+    EXPECT_NE(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
+  {
+    auto d = Allocator::Alloc(0);
+    EXPECT_EQ(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
 }
 
+TEST(CPUAllocator, NoLockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+}
 TEST(CPUAllocator, LockMem) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
 }
 
 #ifndef PADDLE_ONLY_CPU
-
-TEST(GPUAllocator, NonStaging) {
-  paddle::memory::detail::GPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+TEST(GPUAllocator, NoStaging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
 }
-
 TEST(GPUAllocator, Staging) {
-  paddle::memory::detail::GPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
 }
-
 #endif  // PADDLE_ONLY_CPU

From dd08d337c0138c9def5f7ce95f88bae5599e5f92 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:30:57 +0800
Subject: [PATCH 08/16] FIX: fix cmake type error

---
 CMakeLists.txt               |  2 ++
 paddle/CMakeLists.txt        | 10 +---------
 paddle/memory/CMakeLists.txt |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c719d35eced2..b779caefb9a8c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
+find_package(Boost QUIET)
 
 include(simd)
 
@@ -109,6 +110,7 @@ include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
+include_directories(${Boost_INCLUDE_DIRS})
 
 set(EXTERNAL_LIBS
     ${GFLAGS_LIBRARIES}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 0cddb95244fcf..979b68e827218 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -10,17 +10,9 @@ add_subdirectory(trainer)
 add_subdirectory(scripts)
 add_subdirectory(optimizer)
 add_subdirectory(strings)
-add_subdirectory(memory)
-
-# Do not build go directory until go cmake is working smoothly.
-# if(CMAKE_Go_COMPILER)
-#   add_subdirectory(go)
-# endif()
-
-find_package(Boost QUIET)
 
 if(Boost_FOUND)
-  include_directories(${Boost_INCLUDE_DIRS})
+  add_subdirectory(memory)
   add_subdirectory(platform)
   add_subdirectory(framework)
 endif()
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 86625124967d7..e74ce75c9398d 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -3,5 +3,5 @@ add_subdirectory(detail)
 if(${WITH_GPU})
   nv_library(memory SRCS memory.cc)
 else(${WITH_GPU})
-  cc_library(memory SRCS memroy.cc)
+  cc_library(memory SRCS memory.cc)
 endif(${WITH_GPU})

From dde0da9e0ffee7a49510061a139ab2abc7ab55b9 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:31:24 +0800
Subject: [PATCH 09/16] ENH: Add cuda.h in platform

---
 paddle/platform/cuda.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 paddle/platform/cuda.h

diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
new file mode 100644
index 0000000000000..864a5d3340a8f
--- /dev/null
+++ b/paddle/platform/cuda.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+namespace paddle {
+namespace platform {
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+int GetDeviceCount(void) {
+    int count;
+    throw_on_error(cudaGetDeviceCount(&count),
+                   "cudaGetDeviceCount failed");
+    return count;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_ONLY_CPU 

From 29c7512b3ce13ca7b89d3ff3f4aea2c7d7f27478 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:31:46 +0800
Subject: [PATCH 10/16] FIX: fix memory.h/cc

---
 paddle/memory/memory.cc | 23 ++++++++++++++---------
 paddle/memory/memory.h  |  8 ++++----
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index ca3c01ebdb035..0d123d99e234a 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -13,41 +13,46 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/memory.h"
+#include "paddle/memory/detail/buddy_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
 
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "paddle/memory/detail/gpu_allocator.h"
+#include <boost/variant.hpp>
 
 namespace paddle {
 namespace memory {
 
-void Alloc(paddle::platform::Place pl, size_t size) {
+void* Alloc(platform::Place pl, size_t size) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
-    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
+    return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size);
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return GetCPUBuddyAllocator()->Alloc(size);
+  return detail::GetCPUBuddyAllocator()->Alloc(size);
 }
 
 void Free(paddle::platform::Place pl, void* p) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
-    GetGPUBuddyAllocator(pl.device)->Free(p);
+    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
+    detail::GetGPUBuddyAllocator(gpu_id)->Free(p);
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  GetCPUBuddyAllocator()->Free(p);
+  detail::GetCPUBuddyAllocator()->Free(p);
 }
 
 size_t Used(paddle::platform::Place pl) {
 #ifndef PADDLE_ONLY_CPU
   if (paddle::platform::is_gpu_place(pl)) {
-    return GetGPUBuddyAllocator(pl.device)->Used();
+    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
+    return detail::GetGPUBuddyAllocator(gpu_id)->Used();
   }
 #endif  // PADDLE_ONLY_CPU
   PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return GetCPUBuddyAllocator()->Used();
+  return detail::GetCPUBuddyAllocator()->Used();
 }
 
 }  // namespace memory
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 0bc609205eca2..a33092bade65e 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,14 +14,14 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/frameowork/place.h"
+#include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-void* Alloc(paddle::framework::Place, size_t);
-void Free(paddle::framework::Place, void*);
-size_t Used(paddle::framework::Place);
+void* Alloc(paddle::platform::Place, size_t);
+void Free(paddle::platform::Place, void*);
+size_t Used(paddle::platform::Place);
 
 }  // namespace memory
 }  // namespace paddle

From b22dd12854150c31b9cb9e3e550bdee4b5df5977 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:32:06 +0800
Subject: [PATCH 11/16] ENH: Add buddy allocator draft

---
 paddle/memory/detail/CMakeLists.txt    |  4 +-
 paddle/memory/detail/buddy_allocator.h | 79 ++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 paddle/memory/detail/buddy_allocator.h

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index c16dfadeb2180..cd5622203ff51 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(${WITH_GPU})
-  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
 else(${WITH_GPU})
-  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
 endif(${WITH_GPU})
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
new file mode 100644
index 0000000000000..35e96fd50782a
--- /dev/null
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/system_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+template<typename Allocator>
+class BuddyAllocator {
+  public:
+    // TODO(gangliao): This is a draft, add Buddy Allocator Algorithm soon
+    BuddyAllocator() {}
+    ~BuddyAllocator() {}
+
+  public:
+    void* Alloc(size_t size) {
+        return Allocator::Alloc(size); 
+    }
+    void Free(void*) {
+      // Because all info like size are stored in meta data,
+      // thus it's duplicate if add the parameter `size` in
+      // `Free(void*)` interface.
+    }
+    size_t Used();
+
+  public:
+    BuddyAllocator(const BuddyAllocator&) = delete;
+    BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
+  private:
+    size_t min_alloc_size_;
+    size_t max_alloc_size_;
+
+  private:
+    std::mutex mutex_;
+};
+
+BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
+  static BuddyAllocator<CPUAllocator>* a = nullptr;
+  if (a == nullptr) {
+    a = new BuddyAllocator<CPUAllocator>();
+  }
+  return a;
+}
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator<GPUAllocator>** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetDeviceCount(); 
+    as = new BuddyAllocator<GPUAllocator>*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+        as[gpu] = new BuddyAllocator<GPUAllocator>();
+    }
+  }
+  return as[gpu_id];
+}
+
+#endif  // PADDLE_ONLY_CPU 
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle

From 79373dabc8d2e4edc87fbef40efdfa1f54b35a9f Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 01:33:06 +0800
Subject: [PATCH 12/16] TEST: Add test for system allocator and deleter

---
 paddle/memory/detail/system_allocator.h       | 108 ++++++------------
 paddle/memory/detail/system_allocator_test.cc |  40 ++++---
 2 files changed, 60 insertions(+), 88 deletions(-)

diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 1768f9a0da6c9..f411019854e1a 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -18,107 +18,69 @@ limitations under the License. */
 #include <sys/mman.h>  // for mlock and munlock
 #include <cstdlib>     // for malloc and free
 
-#ifndef PADDLE_ONLY_CPU
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-#endif  // PADDLE_ONLY_CPU
-
+#include <gflags/gflags.h>
 #include "paddle/platform/assert.h"
+#include "paddle/platform/cuda.h"
+
+DEFINE_bool(uses_pinned_memory, false,
+            "If set, allocate cpu/gpu pinned memory.");
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-class CPUDeleter {
- public:
-  CPUDeleter(void* ptr, size_t size, bool locked)
-      : ptr_(ptr), size_(size), locked_(locked) {}
-
-  void* Ptr() { return ptr_; }
-
-  void operator()(void* ptr) {
-    PADDLE_ASSERT(ptr == ptr_);
-    if (ptr_ != nullptr && locked_) {
-      munlock(ptr_, size_);
-    }
-    std::free(ptr_);
-  }
-
- private:
-  void* ptr_;
-  size_t size_;
-  bool locked_;
-};
-
-// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
-// and locked memory as staging areas for data exchange between host
-// and device.  Allocates too much would reduce the amount of memory
-// available to the system for paging.  So, by default, we should use
-// CPUAllocator<staging=false>.
-template <bool lock_memory>
+// If uses_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the amount
+// of memory available to the system for paging.  So, by default, we
+// should set false to uses_pinned_memory.
 class CPUAllocator {
  public:
-  static CPUDeleter Alloc(size_t size) {
+  static void* Alloc(size_t size) {
     void* p = std::malloc(size);
-    if (p != nullptr && lock_memory) {
+    if (p != nullptr && FLAGS_uses_pinned_memory) {
       mlock(p, size);
     }
-    return CPUDeleter(p, size, lock_memory);
+    return p;
   }
-};
-
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-namespace {
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-}  // namespace
-
-class GPUDeleter {
- public:
-  GPUDeleter(void* ptr, size_t size, bool staging)
-      : ptr_(ptr), size_(size), staging_(staging) {}
-
-  void* Ptr() { return ptr_; }
 
-  void operator()(void* ptr) {
-    PADDLE_ASSERT(ptr == ptr_);
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown. This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
-    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree{Host} failed");
+  static void Free(void* p, size_t size) {
+    if (p != nullptr && FLAGS_uses_pinned_memory) {
+      munlock(p, size);
     }
+    std::free(p);
   }
-
- private:
-  void* ptr_;
-  size_t size_;
-  bool staging_;
 };
 
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
 // GPUAllocator<staging=true> calls cudaHostMalloc, which returns
 // pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the
 // amount of memory available to the system for paging.  So, by
 // default, we should use GPUAllocator<staging=false>.
-template <bool staging>
 class GPUAllocator {
  public:
-  static GPUDeleter Alloc(size_t size) {
+  static void* Alloc(size_t size) {
     void* p = 0;
-    cudaError_t result =
-        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+    cudaError_t result = FLAGS_uses_pinned_memory ? cudaMallocHost(&p, size)
+                                                  : cudaMalloc(&p, size);
     if (result != cudaSuccess) {
       cudaGetLastError();  // clear error if there is any.
     }
-    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
+    return result == cudaSuccess ? p : nullptr;
+  }
+
+  static void Free(void* p, size_t size) {
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown. This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    cudaError_t err = FLAGS_uses_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+    if (err != cudaErrorCudartUnloading) {
+      platform::throw_on_error(err, "cudaFree{Host} failed");
+    }
   }
 };
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index fec70a65b77d5..829d3558ba4dd 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -17,34 +17,44 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
+#include "glog/logging.h"
 #include "gtest/gtest.h"
 
 template <typename Allocator>
-void TestAllocator() {
-  {
-    auto d = Allocator::Alloc(sizeof(int));
-    EXPECT_NE(d.Ptr(), nullptr);
-    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
-  }
-  {
-    auto d = Allocator::Alloc(0);
-    EXPECT_EQ(d.Ptr(), nullptr);
-    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
-  }
+void TestAllocator(void* p) {
+  p = Allocator::Alloc(1024);
+
+  int* i = static_cast<int*>(p);
+  std::shared_ptr<int> ptr(i, [](int* p) { Allocator::Free(p, 1024); });
+
+  EXPECT_NE(p, nullptr);
 }
 
 TEST(CPUAllocator, NoLockMem) {
-  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = false;
+  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
+
 TEST(CPUAllocator, LockMem) {
-  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = true;
+  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(GPUAllocator, NoStaging) {
-  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = false;
+  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
 TEST(GPUAllocator, Staging) {
-  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
+  void* p = nullptr;
+  FLAGS_uses_pinned_memory = true;
+  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
+  EXPECT_EQ(p, nullptr);
 }
 #endif  // PADDLE_ONLY_CPU

From b8f5922d88e5f7949eb9a469f761ad49981d677a Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Tue, 27 Jun 2017 16:32:24 -0700
Subject: [PATCH 13/16] Make CPUAllocator and GPUAllocator subclasses of
 SystemAllocator

---
 paddle/memory/detail/CMakeLists.txt           |  6 +-
 paddle/memory/detail/system_allocator.h       | 80 +++++--------------
 paddle/memory/detail/system_allocator_test.cc | 57 +++++++------
 3 files changed, 59 insertions(+), 84 deletions(-)

diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index cd5622203ff51..72d3749ad789e 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,5 +1,7 @@
 if(${WITH_GPU})
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 else(${WITH_GPU})
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS gflags glog)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 endif(${WITH_GPU})
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index f411019854e1a..184b383f7f782 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -14,76 +14,38 @@ limitations under the License. */
 
 #pragma once
 
-#include <stddef.h>    // for size_t
-#include <sys/mman.h>  // for mlock and munlock
-#include <cstdlib>     // for malloc and free
-
-#include <gflags/gflags.h>
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"
-
-DEFINE_bool(uses_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+#include <stddef.h>  // for size_t
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// If uses_pinned_memory is true, CPUAllocator calls mlock, which
-// returns pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the amount
-// of memory available to the system for paging.  So, by default, we
-// should set false to uses_pinned_memory.
-class CPUAllocator {
+// SystemAllocator is the parent class of CPUAllocator and
+// GPUAllocator.  A BuddyAllocator object uses a SystemAllocator*
+// pointing to the underlying system allocator.  An alternative to
+// this class hierarchy is to pass a system allocator class to
+// BuddyAllocator as a template parameter.  This approach makes
+// BuddyAllocator a class template, and it's very complicated
+// algorithm would make the buddy_allocator.h messy.
+class SystemAllocator {
  public:
-  static void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p != nullptr && FLAGS_uses_pinned_memory) {
-      mlock(p, size);
-    }
-    return p;
-  }
-
-  static void Free(void* p, size_t size) {
-    if (p != nullptr && FLAGS_uses_pinned_memory) {
-      munlock(p, size);
-    }
-    std::free(p);
-  }
+  virtual ~SystemAllocator() {}
+  virtual void* Alloc(size_t size) = 0;
+  virtual void Free(void* p, size_t size) = 0;
 };
 
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use GPUAllocator<staging=false>.
-class GPUAllocator {
+class CPUAllocator : public SystemAllocator {
  public:
-  static void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = FLAGS_uses_pinned_memory ? cudaMallocHost(&p, size)
-                                                  : cudaMalloc(&p, size);
-    if (result != cudaSuccess) {
-      cudaGetLastError();  // clear error if there is any.
-    }
-    return result == cudaSuccess ? p : nullptr;
-  }
-
-  static void Free(void* p, size_t size) {
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown. This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    cudaError_t err = FLAGS_uses_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
-    if (err != cudaErrorCudartUnloading) {
-      platform::throw_on_error(err, "cudaFree{Host} failed");
-    }
-  }
+  virtual void* Alloc(size_t size);
+  virtual void Free(void* p, size_t size);
 };
 
+#ifndef PADDLE_ONLY_CPU
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t size);
+  virtual void Free(void* p, size_t size);
+};
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 829d3558ba4dd..c461d8ac6265a 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -17,44 +17,55 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
-#include "glog/logging.h"
+#include "gflags/gflags.h"
 #include "gtest/gtest.h"
 
-template <typename Allocator>
-void TestAllocator(void* p) {
-  p = Allocator::Alloc(1024);
+DECLARE_bool(use_pinned_memory);
 
-  int* i = static_cast<int*>(p);
-  std::shared_ptr<int> ptr(i, [](int* p) { Allocator::Free(p, 1024); });
+void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
+  bool freed = false;
+  {
+    void* p = a->Alloc(size);
+    if (size > 0) {
+      EXPECT_NE(p, nullptr);
+    } else {
+      EXPECT_EQ(p, nullptr);
+    }
 
-  EXPECT_NE(p, nullptr);
+    int* i = static_cast<int*>(p);
+    std::shared_ptr<int> ptr(i, [&freed, a, size](void* p) {
+      freed = true;
+      a->Free(p, size);
+    });
+  }
+  EXPECT_TRUE(freed);
 }
 
 TEST(CPUAllocator, NoLockMem) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = false;
-  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = false;
+  paddle::memory::detail::CPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 
 TEST(CPUAllocator, LockMem) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = true;
-  TestAllocator<paddle::memory::detail::CPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = true;
+  paddle::memory::detail::CPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(GPUAllocator, NoStaging) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = false;
-  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = false;
+  paddle::memory::detail::GPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 TEST(GPUAllocator, Staging) {
-  void* p = nullptr;
-  FLAGS_uses_pinned_memory = true;
-  TestAllocator<paddle::memory::detail::GPUAllocator>(p);
-  EXPECT_EQ(p, nullptr);
+  FLAGS_use_pinned_memory = true;
+  paddle::memory::detail::GPUAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 #endif  // PADDLE_ONLY_CPU

From 3e087f763e9c6c15a4f1d542fb3bdc327f7441c7 Mon Sep 17 00:00:00 2001
From: Yi Wang <yiwang01@baidu.com>
Date: Tue, 27 Jun 2017 16:48:25 -0700
Subject: [PATCH 14/16] Add buddy_allocator.cc and system_allocator.cc

---
 paddle/memory/detail/buddy_allocator.cc       | 35 ++++++++
 paddle/memory/detail/buddy_allocator.h        | 76 ++++++++--------
 paddle/memory/detail/system_allocator.cc      | 90 +++++++++++++++++++
 paddle/memory/detail/system_allocator_test.cc | 24 ++---
 4 files changed, 177 insertions(+), 48 deletions(-)
 create mode 100644 paddle/memory/detail/buddy_allocator.cc
 create mode 100644 paddle/memory/detail/system_allocator.cc

diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
new file mode 100644
index 0000000000000..895bf319d778a
--- /dev/null
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/system_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
+                               SystemAllocator* system_allocator)
+    : pool_size_(pool_size),
+      max_pools_(max_pools),
+      system_allocator_(system_allocator) {
+  PADDLE_ASSERT(pool_size > 0);
+  PADDLE_ASSERT(max_pools > 0);
+  PADDLE_ASSERT(system_allocator != nullptr);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 35e96fd50782a..129b137ed7386 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #pragma once
 
@@ -20,34 +20,38 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-template<typename Allocator>
 class BuddyAllocator {
-  public:
-    // TODO(gangliao): This is a draft, add Buddy Allocator Algorithm soon
-    BuddyAllocator() {}
-    ~BuddyAllocator() {}
-
-  public:
-    void* Alloc(size_t size) {
-        return Allocator::Alloc(size); 
-    }
-    void Free(void*) {
-      // Because all info like size are stored in meta data,
-      // thus it's duplicate if add the parameter `size` in
-      // `Free(void*)` interface.
-    }
-    size_t Used();
+ public:
+  BuddyAllocator(size_t pool_size, size_t max_pools,
+                 SystemAllocator* system_allocator);
+  ~BuddyAllocator();
+
+  void* Alloc(size_t size);
+  void Free(void*);
+  size_t Used();
+
+ private:
+  struct Block {
+    size_t size_;
+    Block* left_;   // left buddy
+    Block* right_;  // right buddy
+  };
+
+  // Initially, there is only one pool.  If a Alloc founds not enough
+  // memory from that pool, and there has not been max_num_pools_,
+  // create a new pool by calling system_allocator_.Alloc(pool_size_).
+  std::vector<void*> pools_;
+
+  size_t pool_size_;      // the size of each pool;
+  size_t max_num_pools_;  // the size of all pools;
 
-  public:
-    BuddyAllocator(const BuddyAllocator&) = delete;
-    BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+  SystemAllocator* system_allocator_;
 
-  private:
-    size_t min_alloc_size_;
-    size_t max_alloc_size_;
+  std::mutex mutex_;
 
-  private:
-    std::mutex mutex_;
+  // Disable copy and assignment.
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
 };
 
 BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
@@ -63,16 +67,16 @@ BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
 BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
   static BuddyAllocator<GPUAllocator>** as = NULL;
   if (as == NULL) {
-    int gpu_num = platform::GetDeviceCount(); 
+    int gpu_num = platform::GetDeviceCount();
     as = new BuddyAllocator<GPUAllocator>*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
-        as[gpu] = new BuddyAllocator<GPUAllocator>();
+      as[gpu] = new BuddyAllocator<GPUAllocator>();
     }
   }
   return as[gpu_id];
 }
 
-#endif  // PADDLE_ONLY_CPU 
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
new file mode 100644
index 0000000000000..50bec926f83de
--- /dev/null
+++ b/paddle/memory/detail/system_allocator.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <stdlib.h>    // for malloc and free
+#include <sys/mman.h>  // for mlock and munlock
+
+#include "gflags/gflags.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cuda.h"
+
+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// returns pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the amount
+// of memory available to the system for paging.  So, by default, we
+// should set false to use_pinned_memory.
+DEFINE_bool(use_pinned_memory, false,
+            "If set, allocate cpu/gpu pinned memory.");
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void* CPUAllocator::Alloc(size_t size) {
+  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
+  // malloc might not return nullptr if size is zero, but the returned
+  // pointer shall not be dereferenced -- so we make it nullptr.
+  if (size <= 0) return nullptr;
+
+  void* p = malloc(size);
+  if (p != nullptr && FLAGS_use_pinned_memory) {
+    mlock(p, size);
+  }
+  return p;
+}
+
+void CPUAllocator::Free(void* p, size_t size) {
+  if (p != nullptr && FLAGS_use_pinned_memory) {
+    munlock(p, size);
+  }
+  free(p);
+}
+
+#ifndef PADDLE_ONLY_CPU
+
+void* GPUAllocator::Alloc(size_t size) {
+  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
+  // if size is 0.  We just make sure it does.
+  if (size <= 0) {
+    return nullptr;
+  }
+
+  void* p = 0;
+  cudaError_t result =
+      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+  if (result != cudaSuccess) {
+    cudaGetLastError();  // clear error if there is any.
+  }
+  return result == cudaSuccess ? p : nullptr;
+}
+
+void GPUAllocator::Free(void* p, size_t size) {
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFree after the
+  // driver has already shutdown. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFree succeeds.
+  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
+  if (err != cudaErrorCudartUnloading) {
+    platform::throw_on_error(err, "cudaFree{Host} failed");
+  }
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index c461d8ac6265a..9bd5706a4e4d1 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -22,10 +22,10 @@ limitations under the License. */
 
 DECLARE_bool(use_pinned_memory);
 
-void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
+void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
   bool freed = false;
   {
-    void* p = a->Alloc(size);
+    void* p = a.Alloc(size);
     if (size > 0) {
       EXPECT_NE(p, nullptr);
     } else {
@@ -33,9 +33,9 @@ void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
     }
 
     int* i = static_cast<int*>(p);
-    std::shared_ptr<int> ptr(i, [&freed, a, size](void* p) {
+    std::shared_ptr<int> ptr(i, [&](void* p) {
       freed = true;
-      a->Free(p, size);
+      a.Free(p, size);
     });
   }
   EXPECT_TRUE(freed);
@@ -44,28 +44,28 @@ void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
 TEST(CPUAllocator, NoLockMem) {
   FLAGS_use_pinned_memory = false;
   paddle::memory::detail::CPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 
 TEST(CPUAllocator, LockMem) {
   FLAGS_use_pinned_memory = true;
   paddle::memory::detail::CPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(GPUAllocator, NoStaging) {
   FLAGS_use_pinned_memory = false;
   paddle::memory::detail::GPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 TEST(GPUAllocator, Staging) {
   FLAGS_use_pinned_memory = true;
   paddle::memory::detail::GPUAllocator a;
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
 }
 #endif  // PADDLE_ONLY_CPU

From 3e9aa7fd8bfac7434057afcdd6ae62ea7a92bff1 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 10:42:54 +0800
Subject: [PATCH 15/16] FIX: Pass CI

---
 cmake/generic.cmake                     | 4 ----
 paddle/memory/CMakeLists.txt            | 6 ------
 paddle/memory/detail/buddy_allocator.cc | 2 +-
 paddle/memory/detail/buddy_allocator.h  | 3 +++
 4 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 840155750e1ac..69e8164a00d1f 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -78,10 +78,6 @@
 #
 #   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
 
-if(WITH_GPU)
-  add_definitions(-DPADDLE_WITH_GPU)
-endif()
-
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index e74ce75c9398d..3943c3cfad31d 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1 @@
 add_subdirectory(detail)
-
-if(${WITH_GPU})
-  nv_library(memory SRCS memory.cc)
-else(${WITH_GPU})
-  cc_library(memory SRCS memory.cc)
-endif(${WITH_GPU})
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index 895bf319d778a..ebe680f5eea49 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/memory/detail/buddy_allocator.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 129b137ed7386..702c7d28ee5f9 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -16,6 +16,9 @@
 
 #include "paddle/memory/detail/system_allocator.h"
 
+#include <vector>
+#include <mutex>
+
 namespace paddle {
 namespace memory {
 namespace detail {

From 9490d243dd0255021b288f9c2e43c57b30264b9b Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 28 Jun 2017 10:46:14 +0800
Subject: [PATCH 16/16] ENH: clang-format

---
 paddle/memory/detail/buddy_allocator.h | 2 +-
 paddle/platform/cuda.h                 | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 702c7d28ee5f9..82e6aaedc7199 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -16,8 +16,8 @@
 
 #include "paddle/memory/detail/system_allocator.h"
 
-#include <vector>
 #include <mutex>
+#include <vector>
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
index 864a5d3340a8f..8fe891f9ce6c3 100644
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
@@ -29,13 +29,12 @@ inline void throw_on_error(cudaError_t e, const char* message) {
 }
 
 int GetDeviceCount(void) {
-    int count;
-    throw_on_error(cudaGetDeviceCount(&count),
-                   "cudaGetDeviceCount failed");
-    return count;
+  int count;
+  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
+  return count;
 }
 
 }  // namespace platform
 }  // namespace paddle
 
-#endif  // PADDLE_ONLY_CPU 
+#endif  // PADDLE_ONLY_CPU