Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"add nccl cmake enforce" #4818

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl) # download nccl1

include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration
Expand Down
67 changes: 67 additions & 0 deletions cmake/external/nccl.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NCCL requires CUDA; without GPU support we simply skip the external
# project (nothing is "forced" -- the old warning text was misleading).
if(NOT WITH_GPU)
  message(WARNING
      "NCCL is only supported when WITH_GPU=ON. "
      "Skipping the nccl external project.")
  return()
endif()

include(ExternalProject)

set(NCCL_SOURCES_DIR ${THIRD_PARTY_PATH}/nccl)
set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
set(NCCL_INCLUDE_DIR "${NCCL_INSTALL_DIR}/include" CACHE PATH "nccl include directory." FORCE)

include_directories(${NCCL_SOURCES_DIR}/src/extern_nccl/build/include)
include_directories(${NCCL_INCLUDE_DIR})

# Platform-specific library name.  The original unconditionally overwrote
# the WIN32 value with the .a path; guard both settings with if/else.
if(WIN32)
  set(NCCL_LIBRARIES "${NCCL_INSTALL_DIR}/lib/libnccl.lib" CACHE FILEPATH "nccl library." FORCE)
else()
  set(NCCL_LIBRARIES "${NCCL_INSTALL_DIR}/lib/libnccl.a" CACHE FILEPATH "nccl library." FORCE)
endif()

# Currently, nccl2 is not supported in docker, so we use nccl1.
# The progress of nccl2 can be tracked in https://gitlab.com/nvidia/cuda/issues/10
ExternalProject_Add(
    extern_nccl
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX            ${NCCL_SOURCES_DIR}
    DOWNLOAD_DIR      ${NCCL_SOURCES_DIR}
    GIT_REPOSITORY    "https://github.com/NVIDIA/nccl.git"
    GIT_TAG           "v1.3.4-1"
    # nccl v1 has a plain Makefile, no configure step; CONFIGURE_COMMAND ""
    # disables it (the previous CMAKE_COMMAND "" is not a valid option here).
    CONFIGURE_COMMAND ""
    UPDATE_COMMAND    ""
    BUILD_IN_SOURCE   1
    BUILD_COMMAND     make -j 8
    # nccl's Makefile installs to the PREFIX make variable; INSTALL_DIR alone
    # does not redirect it, so pass PREFIX explicitly.
    INSTALL_COMMAND   make install PREFIX=${NCCL_INSTALL_DIR}
    INSTALL_DIR       ${NCCL_INSTALL_DIR}
    TEST_COMMAND      ""
)

message(STATUS "nccl include: ${NCCL_INCLUDE_DIR}")
message(STATUS "nccl source: ${NCCL_SOURCES_DIR}")
message(STATUS "nccl library: ${NCCL_LIBRARIES}")

# Interface target so dependents can say `DEPS nccl` and get the build
# ordering against the external project (add_dependencies orders the build;
# linking still goes through NCCL_LIBRARIES).
add_library(nccl INTERFACE)
add_dependencies(nccl extern_nccl)

list(APPEND external_project_dependencies nccl)
2 changes: 1 addition & 1 deletion paddle/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog nccl)

cc_library(place SRCS place.cc)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
Expand Down
9 changes: 7 additions & 2 deletions paddle/platform/dynload/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader)
# dynamic_loader implements the dlopen/dlsym helpers.  In GPU builds it
# also depends on the nccl target so the external project is fetched and
# built before anything tries to dlopen libnccl at runtime.
IF(WITH_GPU)
nv_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags nccl)
ELSE()
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
ENDIF()

# nccl.cc defines the dynload wrapper objects for the nccl routines.
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc DEPS dynamic_loader)
13 changes: 13 additions & 0 deletions paddle/platform/dynload/dynamic_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");

DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");

// Command-line flag giving an explicit directory for libnccl.  The help
// text previously referred to libcublas/libcurand and "search cuda" -- a
// copy-paste from the cublas flag; it now describes nccl.
DEFINE_string(nccl_dir, "",
              "Specify path for loading nccl library, such as libnccl.so. "
              "For instance, /usr/local/cuda/lib64. If default, "
              "dlopen will search nccl from LD_LIBRARY_PATH");

namespace paddle {
namespace platform {
namespace dynload {
Expand Down Expand Up @@ -157,6 +162,14 @@ void GetLapackDsoHandle(void** dso_handle) {
#endif
}

// Loads the nccl shared library: searches FLAGS_nccl_dir first, then the
// system search path (e.g. LD_LIBRARY_PATH).  The handle is returned via
// the out-parameter.
void GetNcclDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  const char* nccl_lib_name = "libnccl.dylib";
#else
  const char* nccl_lib_name = "libnccl.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, nccl_lib_name, dso_handle);
}

} // namespace dynload
} // namespace platform
} // namespace paddle
8 changes: 8 additions & 0 deletions paddle/platform/dynload/dynamic_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ void GetWarpCTCDsoHandle(void** dso_handle);
*/
void GetLapackDsoHandle(void** dso_handle);

/**
 * @brief Load the dynamic shared object of NVIDIA nccl.
 *
 * Searches FLAGS_nccl_dir first, falling back to the system library
 * search path (libnccl.so on Linux, libnccl.dylib on macOS).
 *
 * @param **dso_handle out-parameter that receives the dlopen handle
 *
 */
void GetNcclDsoHandle(void** dso_handle);

} // namespace dynload
} // namespace platform
} // namespace paddle
30 changes: 30 additions & 0 deletions paddle/platform/dynload/nccl.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/platform/dynload/nccl.h"

namespace paddle {
namespace platform {
namespace dynload {

// One-time guard and dlopen handle for libnccl; declared extern in
// nccl.h and shared by every wrapper defined below.
std::once_flag nccl_dso_flag;
void *nccl_dso_handle;

// Define the singleton wrapper object that nccl.h declared extern for
// each routine (DynLoad__ncclAllReduce ncclAllReduce; etc.).
#define DEFINE_WRAP(__name) DynLoad__##__name __name

NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);

} // namespace dynload
} // namespace platform
} // namespace paddle
72 changes: 72 additions & 0 deletions paddle/platform/dynload/nccl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <dlfcn.h>
#include <nccl.h>
#include <mutex>
#include "paddle/platform/dynload/dynamic_loader.h"

namespace paddle {
namespace platform {
namespace dynload {

// One-time guard and dlopen handle for libnccl (defined in nccl.cc).
extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;

#ifdef PADDLE_USE_DSO
// Dynamic-loading wrapper: on first use, dlopen libnccl via
// GetNcclDsoHandle, then resolve the routine with dlsym and forward the
// call.  The trailing decltype keeps the wrapper's return type identical
// to the wrapped routine's.
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                     \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    auto operator()(Args... args) -> decltype(__name(args...)) {   \
      using nccl_func = decltype(__name(args...)) (*)(Args...);    \
      std::call_once(nccl_dso_flag,                                \
                     paddle::platform::dynload::GetNcclDsoHandle,  \
                     &nccl_dso_handle);                            \
      void* p_##__name = dlsym(nccl_dso_handle, #__name);          \
      return reinterpret_cast<nccl_func>(p_##__name)(args...);     \
    }                                                              \
  };                                                               \
  extern DynLoad__##__name __name
#else
// Statically-linked wrapper: forward directly to the real symbol.
// NOTE: the return type must be decltype(__name(args...)), not a
// hard-coded ncclResult_t -- ncclGetErrorString returns const char*, so
// forcing ncclResult_t breaks its wrapper.
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                     \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    auto operator()(Args... args) -> decltype(__name(args...)) {   \
      return __name(args...);                                      \
    }                                                              \
  };                                                               \
  extern DynLoad__##__name __name
#endif

// Every nccl routine exposed through the dynload layer.
#define NCCL_RAND_ROUTINE_EACH(__macro) \
  __macro(ncclCommInitAll);             \
  __macro(ncclGetUniqueId);             \
  __macro(ncclCommInitRank);            \
  __macro(ncclCommDestroy);             \
  __macro(ncclCommCount);               \
  __macro(ncclCommCuDevice);            \
  __macro(ncclCommUserRank);            \
  __macro(ncclAllReduce);               \
  __macro(ncclBcast);                   \
  __macro(ncclAllGather);               \
  __macro(ncclReduce);                  \
  __macro(ncclGetErrorString);

NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)

} // namespace dynload
} // namespace platform
} // namespace paddle
14 changes: 14 additions & 0 deletions paddle/platform/enforce.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,14 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif

#include <glog/logging.h>

#ifdef PADDLE_WITH_CUDA

#include "paddle/platform/dynload/cublas.h"
#include "paddle/platform/dynload/cudnn.h"
#include "paddle/platform/dynload/curand.h"
#include "paddle/platform/dynload/nccl.h"

#include <cublas_v2.h>
#include <cudnn.h>
Expand Down Expand Up @@ -172,6 +175,17 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
throw std::runtime_error(err + string::Sprintf(args...));
}

// Overload of throw_on_error for nccl status codes: no-op on ncclSuccess,
// otherwise throws std::runtime_error carrying nccl's own error message
// followed by the caller-formatted context.
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
    ncclResult_t stat, const Args&... args) {
  if (stat == ncclSuccess) return;
  std::string msg = platform::dynload::ncclGetErrorString(stat);
  throw std::runtime_error(msg + string::Sprintf(args...));
}

#endif // PADDLE_ONLY_CPU

template <typename T>
Expand Down
1 change: 1 addition & 0 deletions paddle/platform/place.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct GPUPlace {
GPUPlace() : GPUPlace(0) {}
explicit GPUPlace(int d) : device(d) {}

// Accessor for the device ordinal this GPUPlace refers to.
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const GPUPlace &o) const { return device == o.device; }
inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
Expand Down