Skip to content

Commit

Permalink
Optimize the error messages of paddle CUDA API, test=develop
Browse files Browse the repository at this point in the history
  • Loading branch information
zhwesky2010 committed Apr 14, 2020
1 parent 8f63a3e commit aa4ba28
Show file tree
Hide file tree
Showing 11 changed files with 447 additions and 139 deletions.
15 changes: 11 additions & 4 deletions cmake/inference_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})

# cudaerror: ship the CUDA error-message data found at ${cudaerror_INCLUDE_DIR}
# with the inference library, under third_party/cudaerror/data.
# NOTE(review): copy() is a project helper defined outside this excerpt; whether
# it copies the directory itself or only its contents should be confirmed there.
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR}
DSTS ${dst_dir})

# CMakeCache Info
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
Expand Down Expand Up @@ -184,7 +190,7 @@ copy(fluid_lib_dist
)

set(module "framework")
# The copy step that follows ships framework.pb.h, data_feed.pb.h and
# trainer_desc.pb.h, so the proto targets that (presumably) generate them must
# all be built before fluid_lib_dist. The list is assigned exactly once; an
# earlier assignment holding only framework_proto was a dead store.
set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
Expand All @@ -204,11 +210,11 @@ copy(fluid_lib_dist
)

set(module "platform")
# Proto targets of the platform module; they are built before fluid_lib_dist so
# that the *.pb.h headers collected below exist.
set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
# A single SRCS/DSTS pair with four entries each (presumably matched by
# position inside the project's copy() helper). The *.pb.h glob picks up every
# generated proto header of the module instead of naming them one by one.
copy(fluid_lib_dist
  SRCS ${src_dir}/${module}/*.h
       ${src_dir}/${module}/dynload/*.h
       ${src_dir}/${module}/details/*.h
       ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
  DSTS ${dst_dir}/${module}
       ${dst_dir}/${module}/dynload
       ${dst_dir}/${module}/details
       ${dst_dir}/${module}
)

set(module "string")
Expand Down Expand Up @@ -249,6 +255,7 @@ copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)


# CMakeCache Info
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
Expand Down
35 changes: 33 additions & 2 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

include(ExternalProject)
# Create a target named "third_party" that builds the external dependencies on all platforms (Windows/Linux/macOS)

set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
Expand All @@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING
"A path cache third party source code to avoid repeated download.")

set(THIRD_PARTY_BUILD_TYPE Release)
set(third_party_deps)

# Cache function that avoids downloading the third-party source code repeatedly.
# This function has 4 parameters, URL / REPOSITOR / TAG / DIR:
Expand Down Expand Up @@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME})
ENDMACRO()

# Download an archive while the project is being built (via ExternalProject).
#
# Parameters:
#   URL  - download location of the third-party archive.
#   NAME - short name of the dependency; it selects the working directory
#          ${THIRD_PARTY_PATH}/${NAME}.
#
# This is a macro, so the following end up in the caller's scope:
#   EXTERNAL_PROJECT_NAME - "extern_download_${NAME}", the created target.
#   ${NAME}_INCLUDE_DIR   - ${THIRD_PARTY_PATH}/${NAME}/data.
#   third_party_deps      - has the created target appended.
macro(file_download_and_uncompress URL NAME)
  message(STATUS "Download dependence[${NAME}] from ${URL}")
  set(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
  set(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
  # Only the download step does any work: the update, configure, build and
  # install commands are all set to the empty string.
  ExternalProject_Add(
    ${EXTERNAL_PROJECT_NAME}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    URL                  ${URL}
    PREFIX               ${THIRD_PARTY_PATH}/${NAME}
    DOWNLOAD_DIR         ${THIRD_PARTY_PATH}/${NAME}/data/
    SOURCE_DIR           ${THIRD_PARTY_PATH}/${NAME}/data/
    DOWNLOAD_NO_PROGRESS 1
    UPDATE_COMMAND       ""
    CONFIGURE_COMMAND    ""
    BUILD_COMMAND        ""
    INSTALL_COMMAND      ""
  )
  list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
endmacro()


# Correction of flags on different Platform(WIN/MAC) and Print Warning Message
if (APPLE)
if(WITH_MKL)
Expand Down Expand Up @@ -178,10 +206,13 @@ include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc

# third_party_deps is initialised once near the top of this file. It is only
# appended to here, so targets registered earlier (for example by
# file_download_and_uncompress) are not discarded by a second reset.
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)

# Download the archive that holds the CUDA error-message data.
# CUDAERROR_URL is a cache entry without FORCE, so a mirror can be selected
# with -DCUDAERROR_URL=<url>; FORCE would silently overwrite that choice.
set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "Download URL of the CUDA error message archive")
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")

if(WITH_AMD_GPU)
include(external/rocprim) # download, build, install rocprim
list(APPEND third_party_deps extern_rocprim)
Expand Down Expand Up @@ -274,4 +305,4 @@ if (WITH_LITE)
include(external/lite)
endif (WITH_LITE)

# Aggregate target for every external dependency collected in third_party_deps.
# ALL adds it to the default build target. It is declared exactly once: a
# second add_custom_target() with the same name is a configure-time error.
add_custom_target(third_party ALL DEPENDS ${third_party_deps})
4 changes: 2 additions & 2 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Protobuf targets for the .proto files of the platform module
# (proto_library is a project helper defined outside this excerpt).
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool)
proto_library(error_codes_proto SRCS error_codes.proto)

# Target for cuda_error.proto; the enforce library lists it in its DEPS.
proto_library(cuda_error_proto SRCS cuda_error.proto)

if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
Expand Down Expand Up @@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags)
cc_library(errors SRCS errors.cc DEPS error_codes_proto)
cc_test(errors_test SRCS errors_test.cc DEPS errors enforce)

# enforce.h includes cuda_error.pb.h, so enforce depends on cuda_error_proto.
# The target is declared once; a second cc_library(enforce ...) without that
# dependency would define the same target twice.
cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto)
cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)

set(CPU_INFO_DEPS gflags glog enforce)
Expand Down
35 changes: 35 additions & 0 deletions paddle/fluid/platform/cuda_error.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.platform.proto;

// One error code together with the text stored for it.
message MessageDesc {
// Numeric error code; enforce.h compares it with the cudaError_t value
// that was returned by the failing call.
required int32 errorCode = 1;
// Text stored for this error code; enforce.h prints it after
// "Recommended Solution: ".
required string errorMessage = 2;
}

// All error messages that belong to one version of the CUDA API.
message AllMessageDesc {
// Version of the CUDA API. enforce.h matches it against the integers 90 or
// 100, which it derives from CUDA_VERSION.
required int32 version = 1;
// One entry per error code of this CUDA version
repeated MessageDesc Messages = 2;
}

// Top-level message; enforce.h parses the file cudaErrorMessage.pb into it.
message cudaerrorDesc {
// Error messages of different cuda versions(9.0/10.0/10.2), one
// AllMessageDesc per version.
// NOTE(review): the only field uses number 2; number 1 is unused in this
// message. Do not renumber, it would change the wire format.
repeated AllMessageDesc AllMessages = 2;
}
108 changes: 105 additions & 3 deletions paddle/fluid/platform/enforce.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__

#if !defined(_WIN32)
#include <dlfcn.h> // dladdr
#else // _WIN32
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h> // GetModuleFileName
#endif

#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
Expand All @@ -38,6 +45,7 @@ limitations under the License. */

#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h"
#include "paddle/fluid/platform/cuda_error.pb.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/port.h"
Expand Down Expand Up @@ -464,18 +472,112 @@ struct EOFException : public std::exception {
} while (0)

/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/

#ifdef PADDLE_WITH_CUDA

// Returns the URL of NVIDIA's documentation page for the cudaError enum.
//
// cuda_version is the CUDA version multiplied by ten (90 -> 9.0, 102 -> 10.2)
// and selects the matching page of the documentation archive. The value -1
// means "unknown version" and yields the non-archived URL instead.
inline std::string GetCudaErrorWebsite(int32_t cuda_version) {
  std::ostringstream webstr;
  webstr << "https://docs.nvidia.com/cuda/";
  if (cuda_version != -1) {
    // Divide in floating point: integer division would drop the minor
    // version (102 / 10 == 10) and point 10.2 users at the 10.0 archive.
    const double version = cuda_version / 10.0;
    webstr << "archive/" << std::fixed << std::setprecision(1) << version
           << "/";
  }
  // The base URL already ends with '/', so no leading slash here; otherwise
  // the non-archive URL would contain "cuda//cuda-runtime-api".
  webstr << "cuda-runtime-api/group__CUDART__TYPES.html"
            "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038";
  return webstr.str();
}

// Builds the description of a CUDA runtime error.
//
// The result always starts with the error code and cudaGetErrorString(e).
// It is followed by a "Recommended Solution": either the text stored for this
// code and CUDA version in cudaErrorMessage.pb, or, when no such entry can be
// found, a pointer to NVIDIA's documentation page.
//
// cudaErrorMessage.pb is looked up relative to the module (shared library)
// that contains this function and is parsed into a function-local static.
// NOTE(review): the lazy initialisation of that static is not synchronised;
// confirm whether concurrent first calls are possible.
inline std::string GetCudaErrorMessage(cudaError_t e) {
#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000
  int32_t cuda_version = 100;
#elif CUDA_VERSION >= 9000 && CUDA_VERSION < 10000
  int32_t cuda_version = 90;
#else
  // Neither a 9.x nor a 10.x toolkit: do not pretend it is CUDA 9.0, fall
  // back to the generic documentation link below.
  int32_t cuda_version = -1;
#endif
  std::ostringstream sout;
  sout << " CUDA runtime error(" << e << "): " << cudaGetErrorString(e) << ".";
  static platform::proto::cudaerrorDesc cudaerror;
  static bool _initSucceed = false;
  if (cudaerror.ByteSizeLong() == 0) {
    std::string filePath;
#if !defined(_WIN32)
    Dl_info info;
    if (dladdr(reinterpret_cast<void*>(GetCudaErrorMessage), &info) &&
        info.dli_fname != nullptr) {
      std::string strModule(info.dli_fname);
      // Suffix test with an explicit length check: substr(length() - 6) on a
      // shorter path would wrap around and throw std::out_of_range.
      const std::string avx_suffix("avx.so");
      const bool is_avx_module =
          strModule.length() >= avx_suffix.length() &&
          strModule.compare(strModule.length() - avx_suffix.length(),
                            avx_suffix.length(), avx_suffix) == 0;
      // Keep only the directory part of the module path.
      const size_t last_slash_idx = strModule.find_last_of("/");
      if (std::string::npos != last_slash_idx) {
        strModule.erase(last_slash_idx, std::string::npos);
      }
      if (is_avx_module) {
        filePath = strModule +
                   "/../include/third_party/cudaerror/data/cudaErrorMessage.pb";
      } else {
        // "third_party" (not "thirl_party"): with the typo the file was never
        // found for modules other than *avx.so.
        filePath =
            strModule + "/../../third_party/cudaerror/data/cudaErrorMessage.pb";
      }
    }
#else
    // MAX_PATH instead of 100 bytes so ordinary install paths are not
    // truncated; zero-initialised so a failed query yields an empty string.
    char buf[MAX_PATH] = {0};
    MEMORY_BASIC_INFORMATION mbi;
    HMODULE h_module =
        (::VirtualQuery(GetCudaErrorMessage, &mbi, sizeof(mbi)) != 0)
            ? (HMODULE)mbi.AllocationBase
            : NULL;
    // The ANSI variant matches the char buffer regardless of UNICODE.
    const DWORD path_len = GetModuleFileNameA(h_module, buf, MAX_PATH);
    std::string strModule(buf, path_len);
    // Same length-checked suffix test as on the POSIX branch.
    const std::string avx_suffix("avx.pyd");
    const bool is_avx_module =
        strModule.length() >= avx_suffix.length() &&
        strModule.compare(strModule.length() - avx_suffix.length(),
                          avx_suffix.length(), avx_suffix) == 0;
    const size_t last_slash_idx = strModule.find_last_of("\\");
    if (std::string::npos != last_slash_idx) {
      strModule.erase(last_slash_idx, std::string::npos);
    }
    if (is_avx_module) {
      filePath =
          strModule +
          "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
    } else {
      filePath =
          strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb";
    }
#endif
    std::ifstream fin(filePath, std::ios::in | std::ios::binary);
    _initSucceed = cudaerror.ParseFromIstream(&fin);
  }
  if (_initSucceed) {
    // Linear search: first the table of the matching CUDA version, then the
    // entry of the matching error code.
    for (int i = 0; i < cudaerror.allmessages_size(); ++i) {
      if (cuda_version == cudaerror.allmessages(i).version()) {
        for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) {
          if (e == cudaerror.allmessages(i).messages(j).errorcode()) {
            sout << "\n\nRecommended Solution: "
                 << cudaerror.allmessages(i).messages(j).errormessage() << " ";
            return sout.str();
          }
        }
      }
    }
  }
  // No stored message available: refer to NVIDIA's documentation instead.
  sout << "\n\nRecommended Solution: Please search for the error code[" << e
       << "] on website[" << GetCudaErrorWebsite(cuda_version)
       << "] to get Nvidia's official solution about CUDA Error. ";
  return sout.str();
}

// Tells whether a CUDA runtime status denotes a failure, i.e. is anything
// other than cudaSuccess.
inline bool is_error(cudaError_t e) {
  const bool succeeded = (e == cudaSuccess);
  return !succeeded;
}

// Builds the message reported for a failed CUDA runtime call.
//
// e   - status returned by the CUDA runtime.
// msg - message passed in by the enforce macro.
// Returns msg followed by GetCudaErrorMessage(e); when msg is only the default
// placeholder, the CUDA description alone is returned, wrapped as an External
// error.
inline std::string build_ex_string(cudaError_t e, const std::string& msg) {
  // note(zhouwei): "An error occurred here" identifies the default message
  // used when the developer supplied none; it carries no information, so it is
  // dropped. A better method would be to refactor class ErrorSummary or
  // PADDLE_ENFORCE_CUDA_SUCCESS.
  // (There must be no unconditional `return msg;` before this point; it would
  // make everything below unreachable.)
  if (msg.find("An error occurred here") != std::string::npos) {
    return platform::errors::External(GetCudaErrorMessage(e)).ToString();
  }
  return msg + GetCudaErrorMessage(e);
}

inline void throw_on_error(cudaError_t e, const std::string& msg) {
#ifndef REPLACE_ENFORCE_GLOG
throw thrust::system_error(e, thrust::cuda_category(), msg);
throw std::runtime_error(msg);
#else
LOG(FATAL) << msg;
#endif
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/platform/errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ class ErrorSummary {
// Note(chenweihang): Final deprecated constructor
// This constructor is only used to be compatible with
// current existing no error message PADDLE_ENFORCE_*
// Note(zhouwei): The PADDLE_ENFORCE_CUDA_SUCCESS error message
// can be obtained from the CUDA API or NVIDIA's official website,
// so an error message from the developer is not necessary
ErrorSummary() {
code_ = paddle::platform::error::LEGACY;
msg_ =
Expand Down
Loading

0 comments on commit aa4ba28

Please sign in to comment.