PaddlePaddle · liupluswei · Apr 20, 2020 · Apr 14, 2020 · Apr 15, 2020 · Apr 16, 2020
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
@@ -135,6 +135,12 @@ copy(inference_lib_dist
         SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
         DSTS ${dst_dir})
 
+set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+copy(inference_lib_dist
+        SRCS ${cudaerror_INCLUDE_DIR}  # set by file_download_and_uncompress()
+        DSTS ${dst_dir})
+
+# CMakeCache Info
 copy(inference_lib_dist
         SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
         DSTS ${FLUID_INFERENCE_INSTALL_DIR})
@@ -184,7 +190,7 @@ copy(fluid_lib_dist
         )
 
 set(module "framework")
-set(framework_lib_deps framework_proto)
+set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
 add_dependencies(fluid_lib_dist ${framework_lib_deps})
 copy(fluid_lib_dist
         SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
@@ -204,11 +210,11 @@ copy(fluid_lib_dist
         )
 
 set(module "platform")
-set(platform_lib_deps profiler_proto)
+set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
 copy(fluid_lib_dist
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
+        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
         )
 
 set(module "string")
@@ -249,6 +255,7 @@ copy(inference_lib_dist
         SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
         DSTS ${dst_dir} ${dst_dir}/lib)
 
+
 # CMakeCache Info
 copy(fluid_lib_dist
         SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+include(ExternalProject)
 # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)
 
 set(THIRD_PARTY_PATH  "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH     "${CMAKE_SOURCE_DIR}"    CACHE STRING
     "A path cache third party source code to avoid repeated download.")
 
 set(THIRD_PARTY_BUILD_TYPE Release)
+set(third_party_deps)
 
 # cache funciton to avoid repeat download code of third_party.
 # This function has 4 parameters, URL / REPOSITOR / TAG / DIR:
@@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
     UNSET(${VAR_NAME})
 ENDMACRO()
 
+# Registers an ExternalProject that downloads and unpacks an archive at build
+# time. Being a macro, it sets ${NAME}_INCLUDE_DIR and appends the new target
+# to third_party_deps directly in the caller's scope. Parameters:
+# 1. URL:   Download URL of the third-party archive
+# 2. NAME:  Short name; determines the target and directory names
+macro(file_download_and_uncompress URL NAME)
+  message(STATUS "Download dependence[${NAME}] from ${URL}")
+  set(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
+  set(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
+  ExternalProject_Add(
+      ${EXTERNAL_PROJECT_NAME}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      URL                   ${URL}
+      PREFIX                ${THIRD_PARTY_PATH}/${NAME}
+      SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
+      DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
+      DOWNLOAD_NO_PROGRESS  1
+      UPDATE_COMMAND        ""
+      CONFIGURE_COMMAND     ""
+      BUILD_COMMAND         ""
+      INSTALL_COMMAND       ""
+    )
+  list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
+endmacro()
+
+
 # Correction of flags on different Platform(WIN/MAC) and Print Warning Message
 if (APPLE)
     if(WITH_MKL)
@@ -178,10 +206,13 @@ include(external/dlpack)    # download dlpack
 include(external/xxhash)    # download, build, install xxhash
 include(external/warpctc)   # download, build, install warpctc
 
-set(third_party_deps)
 list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
 list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
 
+# CUDA error-message archive; pass -DCUDAERROR_URL=<url> to use a mirror.
+set(CUDAERROR_URL  "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "URL of the CUDA error message archive")
+file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")
+
 if(WITH_AMD_GPU)
     include(external/rocprim)   # download, build, install rocprim
     list(APPEND third_party_deps extern_rocprim)
@@ -274,4 +305,4 @@ if (WITH_LITE)
     include(external/lite)
 endif (WITH_LITE)
 
-add_custom_target(third_party DEPENDS ${third_party_deps})
+add_custom_target(third_party ALL DEPENDS ${third_party_deps})
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
@@ -1,6 +1,6 @@
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
 proto_library(error_codes_proto SRCS error_codes.proto)
-
+proto_library(cuda_error_proto SRCS cuda_error.proto)
 
 if (WITH_PYTHON)
   py_proto_compile(profiler_py_proto SRCS profiler.proto)
@@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
 cc_library(errors SRCS errors.cc DEPS error_codes_proto)
 cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)
 
-cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors)
+cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
 
 set(CPU_INFO_DEPS gflags glog enforce)

diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/cuda_error.proto
@@ -0,0 +1,35 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.platform.proto;
+
+message MessageDesc {  // One CUDA error code and the text shown for it.
+  // Numeric error code; enforce.h compares it against the cudaError_t value.
+  required int32 errorCode = 1;
+  // Text that enforce.h prints after "Recommended Solution: ".
+  required string errorMessage = 2;
+}
+
+message AllMessageDesc {  // All error descriptions for one CUDA version.
+  // CUDA version; enforce.h looks up 90 or 100, i.e. major * 10 + minor.
+  required int32 version = 1;
+  // Error messages for the individual error codes of this version.
+  repeated MessageDesc Messages = 2;
+}
+
+message cudaerrorDesc {  // Root message parsed from cudaErrorMessage.pb.
+  // Error messages of the different CUDA versions (9.0/10.0/10.2).
+  repeated AllMessageDesc AllMessages = 2;  // NOTE(review): tag 2, not 1?
+}
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
@@ -18,6 +18,13 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__
 
+#if !defined(_WIN32)
+#include <dlfcn.h>    // dladdr
+#else                 // _WIN32
+#define NOMINMAX      // msvc max/min macro conflict with std::min/max
+#include <windows.h>  // GetModuleFileName
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -38,6 +45,7 @@ limitations under the License. */
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "glog/logging.h"
+#include "paddle/fluid/platform/cuda_error.pb.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/port.h"
@@ -464,18 +472,112 @@ struct EOFException : public std::exception {
   } while (0)
 
 /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/
-
 #ifdef PADDLE_WITH_CUDA
 
+inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
+  std::ostringstream webstr;
+  webstr << "https://docs.nvidia.com/cuda";
+  if (cuda_version != -1) {  // -1 (version unknown) links the unversioned docs
+    double version = cuda_version / 10.0;  // keep minor digit: 102 -> 10.2
+    webstr << "/archive/" << std::fixed << std::setprecision(1) << version;
+  }
+  webstr << "/cuda-runtime-api/group__CUDART__TYPES.html"
+            "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
+  return webstr.str();
+}
+
+// Formats the runtime's description of `e`, then appends the matching advice
+// from cudaErrorMessage.pb (located relative to this module, lazily parsed)
+// or, when none is found, a pointer to NVIDIA's error-code reference page.
+inline std::string GetCudaErrorMessage(cudaError_t e) {
+#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
+  int32_t cuda_version = 100;
+#elif CUDA_VERSION >= 9000
+  int32_t cuda_version = 90;
+#else
+  int32_t cuda_version = -1;
+#endif
+  std::ostringstream sout;
+  sout << " CUDA runtime error(" << e << "): " << cudaGetErrorString(e) << ".";
+  static platform::proto::cudaerrorDesc cudaerror;
+  static bool _initSucceed = false;
+  if (cudaerror.ByteSizeLong() == 0) {
+    std::string filePath;
+#if !defined(_WIN32)
+    Dl_info info;
+    if (dladdr(reinterpret_cast<void*>(GetCudaErrorMessage), &info)) {
+      std::string strModule(info.dli_fname);
+      // Size guard: substr() throws std::out_of_range on paths under 6 chars.
+      const bool is_avx = strModule.size() >= 6 &&
+                          strModule.substr(strModule.size() - 6) == "avx.so";
+      // Keep only the directory part (no-op when there is no '/').
+      strModule = strModule.substr(0, strModule.find_last_of("/"));
+      if (is_avx) {
+        filePath = strModule +
+                   "/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
+      } else {
+        filePath =
+            strModule + "/../../third_party/cudaerror/data/cudaErrorMessage.pb";
+      }
+    }
+#else
+    // MAX_PATH, zero-filled: 100 bytes truncated longer module paths.
+    char buf[MAX_PATH] = {0};
+    MEMORY_BASIC_INFORMATION mbi;
+    HMODULE h_module =
+        (::VirtualQuery(GetCudaErrorMessage, &mbi, sizeof(mbi)) != 0)
+            ? (HMODULE)mbi.AllocationBase
+            : NULL;
+    // Explicit ANSI variant: buf is char even when UNICODE is defined.
+    GetModuleFileNameA(h_module, buf, MAX_PATH - 1);
+    std::string strModule(buf);
+    const bool is_avx = strModule.size() >= 7 &&
+                        strModule.substr(strModule.size() - 7) == "avx.pyd";
+    strModule = strModule.substr(0, strModule.find_last_of("\\"));
+    if (is_avx) {
+      filePath =
+          strModule +
+          "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
+    } else {
+      filePath =
+          strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
+    }
+#endif
+    std::ifstream fin(filePath, std::ios::in | std::ios::binary);
+    _initSucceed = fin.is_open() && cudaerror.ParseFromIstream(&fin);
+  }
+  if (_initSucceed) {
+    for (const auto& version_desc : cudaerror.allmessages()) {
+      if (version_desc.version() != cuda_version) continue;
+      for (const auto& desc : version_desc.messages()) {
+        if (desc.errorcode() != e) continue;
+        sout << "\n\nRecommended Solution: " << desc.errormessage() << " ";
+        return sout.str();
+      }
+    }
+  }
+  sout << "\n\nRecommended Solution: Please search for the error code[" << e
+       << "] on website[" << GetCudaErrorWebsite(cuda_version)
+       << "] to get Nvidia's official solution about CUDA Error. ";
+  return sout.str();
+}
+
 inline bool is_error(cudaError_t e) { return e != cudaSuccess; }
 
 inline std::string build_ex_string(cudaError_t e, const std::string& msg) {
-  return msg;
+  // Note(zhouwei): when the developer supplied no error message, `msg` holds
+  // only the default text, which is not needed here, so it is replaced.
+  // A better approach would be to refactor class ErrorSummary or
+  // PADDLE_ENFORCE_CUDA_SUCCESS.
+  if (msg.find("An error occurred here") != std::string::npos) {
+    return platform::errors::External(GetCudaErrorMessage(e)).ToString();
+  }
+  return msg + GetCudaErrorMessage(e);
 }
 
 inline void throw_on_error(cudaError_t e, const std::string& msg) {
 #ifndef REPLACE_ENFORCE_GLOG
-  throw thrust::system_error(e, thrust::cuda_category(), msg);
+  throw std::runtime_error(msg);
 #else
   LOG(FATAL) << msg;
 #endif

diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h
@@ -33,6 +33,9 @@ class ErrorSummary {
   // Note(chenweihang): Final deprecated constructor
   //   This constructor is only used to be compatible with
   //   current existing no error message PADDLE_ENFORCE_*
+  // Note(zhouwei): for PADDLE_ENFORCE_CUDA_SUCCESS the error message
+  //   can be obtained from the CUDA API or Nvidia's official website,
+  //   so an error message from the developer is not necessary.
   ErrorSummary() {
     code_ = paddle::platform::error::LEGACY;
     msg_ =