From 6f59331f52beb5cbd6df1b5dcc65a71b100441bf Mon Sep 17 00:00:00 2001 From: zhouwei25 Date: Wed, 22 Apr 2020 00:34:00 +0800 Subject: [PATCH] Revert 'Optimize the error messages of paddle CUDA API (#23816)',test=develop --- cmake/inference_lib.cmake | 15 +- cmake/third_party.cmake | 35 +-- .../framework/details/nan_inf_utils_detail.cu | 4 +- .../tensorrt/plugin/split_op_plugin.cu | 27 +- .../cuda_device_context_allocator.h | 15 +- paddle/fluid/operators/argsort_op.cu | 14 +- .../operators/fused/fused_bn_activation_op.cu | 90 ++++-- .../fusion_transpose_flatten_concat_op.cu.cc | 43 ++- .../operators/grid_sampler_cudnn_op.cu.cc | 15 +- paddle/fluid/operators/math/blas_impl.cu.h | 16 +- paddle/fluid/operators/mean_op.cu | 10 +- .../fluid/operators/reader/buffered_reader.cc | 18 +- .../fluid/operators/sync_batch_norm_op.cu.h | 18 +- paddle/fluid/platform/CMakeLists.txt | 4 +- paddle/fluid/platform/cuda_error.proto | 35 --- paddle/fluid/platform/cuda_helper.h | 9 +- paddle/fluid/platform/cuda_resource_pool.cc | 18 +- paddle/fluid/platform/device_context.h | 13 +- paddle/fluid/platform/enforce.h | 290 +++++------------- paddle/fluid/platform/enforce_test.cc | 32 +- paddle/fluid/platform/gpu_info.cc | 196 ++++++++---- paddle/fluid/platform/profiler_helper.h | 5 +- paddle/fluid/platform/stream/cuda_stream.cc | 16 +- paddle/fluid/platform/stream/cuda_stream.h | 12 +- python/setup.py.in | 5 - tools/check_api_approvals.sh | 4 +- tools/count_invalid_enforce.sh | 4 +- tools/cudaError/README.md | 22 -- tools/cudaError/spider.py | 124 -------- tools/cudaError/start.sh | 32 -- 30 files changed, 496 insertions(+), 645 deletions(-) delete mode 100644 paddle/fluid/platform/cuda_error.proto delete mode 100644 tools/cudaError/README.md delete mode 100644 tools/cudaError/spider.py delete mode 100644 tools/cudaError/start.sh diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a52d91741aff7..30e96b8e5f69e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -135,12 +135,6 @@ copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) -set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") -copy(inference_lib_dist - SRCS ${cudaerror_INCLUDE_DIR} - DSTS ${dst_dir}) - -# CMakeCache Info copy(inference_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${FLUID_INFERENCE_INSTALL_DIR}) @@ -190,7 +184,7 @@ copy(fluid_lib_dist ) set(module "framework") -set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto) +set(framework_lib_deps framework_proto) add_dependencies(fluid_lib_dist ${framework_lib_deps}) copy(fluid_lib_dist SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h @@ -210,11 +204,11 @@ copy(fluid_lib_dist ) set(module "platform") -set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto) +set(platform_lib_deps profiler_proto) add_dependencies(fluid_lib_dist ${platform_lib_deps}) copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h 
${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "string") @@ -255,7 +249,6 @@ copy(inference_lib_dist SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) - # CMakeCache Info copy(fluid_lib_dist SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 9c8a9e0af1c12..3b3a43a69a9ef 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -include(ExternalProject) # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -22,7 +21,6 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING "A path cache third party source code to avoid repeated download.") set(THIRD_PARTY_BUILD_TYPE Release) -set(third_party_deps) # cache funciton to avoid repeat download code of third_party. # This function has 4 parameters, URL / REPOSITOR / TAG / DIR: @@ -102,32 +100,6 @@ MACRO(UNSET_VAR VAR_NAME) UNSET(${VAR_NAME}) ENDMACRO() -# Funciton to Download the dependencies during compilation -# This function has 2 parameters, URL / DIRNAME: -# 1. URL: The download url of 3rd dependencies -# 2. NAME: The name of file, that determin the dirname -# -MACRO(file_download_and_uncompress URL NAME) - MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}") - SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}") - SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data) - ExternalProject_Add( - ${EXTERNAL_PROJECT_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) - list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME}) -ENDMACRO() - - # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) @@ -206,13 +178,10 @@ include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc +set(third_party_deps) list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) -# download file -set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) -file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") - if(WITH_AMD_GPU) include(external/rocprim) # download, build, install rocprim list(APPEND third_party_deps extern_rocprim) @@ -305,4 +274,4 @@ if (WITH_LITE) include(external/lite) endif (WITH_LITE) -add_custom_target(third_party ALL DEPENDS ${third_party_deps}) +add_custom_target(third_party DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index f9f91680e3401..44668e491eb29 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ 
b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -152,7 +152,9 @@ void TensorCheckerVisitor::apply( PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, - cudaMemcpyHostToDevice, dev_ctx->stream())); + cudaMemcpyHostToDevice, dev_ctx->stream()), + platform::errors::External( + "Async cudaMemcpy op_var info to gpu failed.")); } else { // get auto iter = op_var2gpu_str.find(op_var); PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 9eefb925d2061..7a032acef676b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -124,9 +124,12 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, float const* input_ptr = reinterpret_cast(inputs[0]); float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( - output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(output_ptrs, h_odatas, + d_output_ptrs_.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream), + platform::errors::External( + "CUDA Memcpy failed during split plugin run.")); int outer_rows = outer_rows_ * batchSize; @@ -241,9 +244,12 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( - output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(output_ptrs, h_odatas, + d_output_ptrs.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream), + platform::errors::External( + "CUDA Memcpy failed during split plugin run.")); split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, @@ -257,9 +263,12 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, half* const* h_odatas = reinterpret_cast(outputs); half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( - output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*), - cudaMemcpyHostToDevice, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(output_ptrs, h_odatas, + d_output_ptrs.size() * sizeof(half*), + cudaMemcpyHostToDevice, stream), + platform::errors::External( + "CUDA Memcpy failed during split plugin run.")); split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 2163562a6080b..0997f575acc4e 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -80,13 +80,17 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreate(&event_, cudaEventDisableTiming)); + cudaEventCreate(&event_, cudaEventDisableTiming), + platform::errors::External( + "Create event failed in 
CUDADeviceContextAllocator")); } ~CUDADeviceContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventDestroy(event_), + "Destory event failed in CUDADeviceContextAllocator destroctor"); } } @@ -99,9 +103,12 @@ class CUDADeviceContextAllocator : public Allocator { auto allocation = new CUDADeviceContextAllocation(memory::Alloc(place_, size)); // Wait for the event on stream - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + cudaEventRecord(event_, default_stream_), + "Failed to record event in CUDADeviceContextAllocator"); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(default_stream_, event_, 0), + "Failed to wait event in CUDADeviceContextAllocator"); return allocation; } diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index cbd7e33bc6b72..006bf559195aa 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -141,7 +141,12 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, + "ArgSortOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate" + "temp_storage_bytes, status:%s.", + temp_storage_bytes, cudaGetErrorString(err)); Tensor temp_storage; temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); @@ -160,7 +165,12 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, + "ArgSortOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " + "temp_storage_bytes:%d status:%s.", + temp_storage_bytes, cudaGetErrorString(err)); } template diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 32eaf1180977a..2e308657936c0 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -108,21 +108,32 @@ class FusedBatchNormActKernel cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&bn_param_desc_).")); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data()), + platform::errors::External( + "The error has happened when calling cudnnSetTensorNdDescriptor.")); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + data_desc_, mode_), + platform::errors::External("The error has happened when calling " + "cudnnDeriveBNTensorDescriptor.")); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; @@ -155,7 +166,10 @@ class FusedBatchNormActKernel /*yDesc=*/data_desc_, /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); + /*sizeInBytes=*/&workspace_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize.")); // -------------- cudnn batchnorm reserve space -------------- PADDLE_ENFORCE_CUDA_SUCCESS( @@ -165,7 +179,10 @@ class FusedBatchNormActKernel /*bnOps=*/bnOps_, /*activationDesc=*/activation_desc_, /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); + /*sizeInBytes=*/&reserve_space_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize.")); reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), reserve_space_size); @@ -187,13 +204,22 @@ class FusedBatchNormActKernel saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); + reserve_space_size), + platform::errors::External( + "The error has happened when calling " + "cudnnBatchNormalizationForwardTrainingEx.")); // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(bn_param_desc_).")); } }; @@ -272,9 +298,15 @@ class FusedBatchNormActGradKernel cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&bn_param_desc_).")); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " @@ -282,12 +314,17 @@ class FusedBatchNormActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()), + platform::errors::External( + "The error has happened when calling cudnnSetTensorNdDescriptor.")); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + data_desc_, mode_), + platform::errors::External("The error has happened when calling " + "cudnnDeriveBNTensorDescriptor.")); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -317,7 +354,10 @@ class FusedBatchNormActGradKernel /*dxDesc=*/data_desc_, /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); + /*sizeInBytes=*/&workspace_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationBackwardExWorkspaceSize.")); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); @@ -355,13 +395,21 @@ class FusedBatchNormActGradKernel /*workspace=*/workspace_ptr, /*workSpaceSizeInBytes=*/workspace_size, /*reserveSpace=*/const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); + /*reserveSpaceSizeInBytes=*/reserve_space_size), + platform::errors::External("The error has happened when calling " + "cudnnBatchNormalizationBackwardEx.")); // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(bn_param_desc_).")); } }; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 17cb4556d45ef..b61ef8e566b77 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -46,9 +46,13 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); + platform::dynload::cudnnCreateTensorDescriptor(&in_desc), + platform::errors::External("Create cudnn tensor descriptor failed in " + "transpose_flatten_concat_fusion op.")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); + platform::dynload::cudnnCreateTensorDescriptor(&out_desc), + platform::errors::External("Create cudnn tensor descriptor failed in " + "transpose_flatten_concat_fusion op.")); cudnnDataType_t cudnn_dtype = CudnnDataType::type; auto& dev_ctx = ctx.template device_context(); @@ -87,15 +91,24 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { dims_y[i] = 1; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); - 
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); - - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( - handle, CudnnDataType::kOne(), in_desc, - static_cast(ins[k]->data()), - CudnnDataType::kZero(), out_desc, static_cast(odata))); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()), + platform::errors::External("Create cudnn tensorNd descriptor failed " + "in transpose_flatten_concat op.")); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()), + platform::errors::External("Create cudnn tensorNd descriptor failed " + "in transpose_flatten_concat op.")); + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), in_desc, + static_cast(ins[k]->data()), + CudnnDataType::kZero(), out_desc, static_cast(odata)), + platform::errors::External("Create cudnn transform tensor failed in " + "transpose_flatten_concat op.")); if (concat_axis == 0) { odata += osize; } else { @@ -104,9 +117,13 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { } } PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); + platform::dynload::cudnnDestroyTensorDescriptor(in_desc), + platform::errors::External( + "Destory cudnn descriptor failed in transpose_flatten_concat op.")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); + platform::dynload::cudnnDestroyTensorDescriptor(out_desc), + platform::errors::External( + "Destory cudnn descriptor failed in transpose_flatten_concat op.")); } }; diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 3bf34fc685ee8..c266b0d32b14a 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -60,10 +60,13 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( - handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, - input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, - output_data)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSpatialTfSamplerForward( + handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, + input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, + output_data), + platform::errors::InvalidArgument( + "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed")); } }; @@ -119,7 +122,9 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, output_grad_data, grid_data, CudnnDataType::kZero(), - grid_grad_data)); + grid_grad_data), + platform::errors::InvalidArgument( + "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed")); } }; diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 8e903a4eccc74..c0ab35b0e753c 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -41,12 +41,16 @@ struct 
CUBlas { template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasSscal(args...), + platform::errors::External("dynload cublasSscal lib failed")); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasScopy(args...), + platform::errors::External("dynload cublasScopy lib failed")); } template @@ -104,12 +108,16 @@ struct CUBlas { template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasDscal(args...), + platform::errors::External("dynload cublasDscal lib failed")); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasDcopy(args...), + platform::errors::External("dynload cublasDcopy lib failed")); } template diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 081c077ab73c2..d2b01fafb731f 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -59,14 +59,20 @@ class MeanCUDAKernel : public framework::OpKernel { auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, platform::errors::External( + "MeanOP failed to get reduce workspace size %s.", + cudaGetErrorString(err))); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), context.GetPlace()); err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, platform::errors::External( + "MeanOP failed to run CUDA reduce computation: %s.", + cudaGetErrorString(err))); } }; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index e72820611d3a9..b237df130abcc 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -104,9 +104,13 @@ void BufferedReader::ReadAsync(size_t i) { // gpu memory immediately without waiting gpu kernel ends platform::SetDeviceId(boost::get(place_).device); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(events_[i].get(), compute_stream_)); + cudaEventRecord(events_[i].get(), compute_stream_), + platform::errors::Fatal( + "cudaEventRecord raises unexpected exception")); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); + cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0), + platform::errors::Fatal( + "cudaStreamWaitEvent raises unexpected exception")); platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { @@ -134,11 +138,17 @@ void BufferedReader::ReadAsync(size_t i) { size); memory::Copy(boost::get(place_), gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, size, stream_.get()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamSynchronize(stream_.get()), + platform::errors::Fatal( + "cudaStreamSynchronize raises unexpected exception")); } gpu[i].set_lod(cpu[i].lod()); } - 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamSynchronize(stream_.get()), + platform::errors::Fatal( + "cudaStreamSynchronize raises unexpected exception")); } #endif return i; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index cfb9e16942c25..083d22aa2a38a 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -191,9 +191,12 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, if (comm) { int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( - stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, - comm, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1, + static_cast(dtype), + ncclSum, comm, stream), + platform::errors::InvalidArgument( + "ncclAllReduce in Op(sync_batch_norm) failed")); } #endif @@ -465,9 +468,12 @@ void SyncBatchNormGradFunctor( if (comm) { int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( - stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, - comm, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1, + static_cast(dtype), + ncclSum, comm, stream), + platform::errors::InvalidArgument( + "ncclAllReduce in Op(sync_batch_norm) failed")); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d0d74f6ea87f1..ddf3035a92754 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,6 +1,6 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(error_codes_proto SRCS error_codes.proto) -proto_library(cuda_error_proto SRCS cuda_error.proto) + if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags) cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto) +cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors) cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) set(CPU_INFO_DEPS gflags glog enforce) diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/cuda_error.proto deleted file mode 100644 index b55e0af81ee6f..0000000000000 --- a/paddle/fluid/platform/cuda_error.proto +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -syntax = "proto2"; -package paddle.platform.proto; - -message MessageDesc { - // Indicates the type of error - required int32 errorCode = 1; - // Indicates the message of error - required string errorMessage = 2; -} - -message AllMessageDesc { - // Version of cuda API - required int32 version = 1; - // Error messages of different errortype - repeated MessageDesc Messages = 2; -} - -message cudaerrorDesc { - // Error messages of different cuda versions(9.0/10.0/10.2) - repeated AllMessageDesc AllMessages = 2; -} \ No newline at end of file diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 74cf5545239f1..54f5e911e3d0f 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -29,7 +29,14 @@ namespace platform { class CublasHandleHolder { public: CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cublasCreate(&handle_), + platform::errors::External( + "The cuBLAS library was not initialized. This is usually caused by " + "an error in the CUDA Runtime API called by the cuBLAS routine, or " + "an error in the hardware setup.\n" + "To correct: check that the hardware, an appropriate version of " + "the driver, and the cuBLAS library are correctly installed.")); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); #if CUDA_VERSION >= 9000 if (math_type == CUBLAS_TENSOR_OP_MATH) { diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc index 65c8b96028ace..1828f0760a79a 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/cuda_resource_pool.cc @@ -27,13 +27,18 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); cudaStream_t stream; PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), + platform::errors::Fatal( + "cudaStreamCreateWithFlags raises unexpected exception")); return stream; }; auto deleter = [dev_idx](cudaStream_t stream) { platform::SetDeviceId(dev_idx); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamDestroy(stream), + platform::errors::Fatal( + "cudaStreamDestroy raises unexpected exception")); }; pool_.emplace_back( @@ -67,13 +72,18 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); cudaEvent_t event; PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + cudaEventCreateWithFlags(&event, cudaEventDisableTiming), + platform::errors::Fatal( + "cudaEventCreateWithFlags raises unexpected exception")); return event; }; auto deleter = [dev_idx](cudaEvent_t event) { platform::SetDeviceId(dev_idx); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventDestroy(event), + platform::errors::Fatal( + "cudaEventDestroy raises unexpected exception")); }; pool_.emplace_back(ResourcePool::Create(creator, deleter)); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 529992b47e273..e32c8d4ea6a52 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -162,9 +162,14 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible CUDNN " "version."; } - 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetStream(cudnn_handle_, RawStream())); + dynload::cudnnCreate(&cudnn_handle_), + platform::errors::Fatal( + "Failed to create Cudnn handle in DeviceContext")); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnSetStream(cudnn_handle_, RawStream()), + platform::errors::Fatal( + "Failed to set stream for Cudnn handle in DeviceContext")); } else { cudnn_handle_ = nullptr; } @@ -172,7 +177,9 @@ class CUDAContext { void DestoryCuDNNContext() { if (cudnn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroy(cudnn_handle_), + platform::errors::Fatal("Failed to destory Cudnn handle")); } cudnn_handle_ = nullptr; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f2e0c52170b60..99f83d9732029 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,13 +18,6 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ -#if !defined(_WIN32) -#include // dladdr -#else // _WIN32 -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#include // GetModuleFileName -#endif - #ifdef PADDLE_WITH_CUDA #include #include @@ -45,7 +38,6 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" -#include "paddle/fluid/platform/cuda_error.pb.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/port.h" @@ -228,6 +220,10 @@ inline std::string GetTraceBackString(StrType&& what, const char* file, inline bool is_error(bool stat) { return !stat; } +inline std::string build_ex_string(bool stat, const std::string& msg) { + return msg; +} + inline void throw_on_error(bool stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(msg); @@ -288,21 +284,23 @@ struct EnforceNotMet : public std::exception { } \ } while (0) #else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, \ + ::paddle::platform::build_ex_string( \ + __cond__, \ + ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ + } catch (...) 
{ \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } \ + } \ } while (0) #endif @@ -466,148 +464,30 @@ struct EOFException : public std::exception { } while (0) /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ + #ifdef PADDLE_WITH_CUDA -/***** CUDA ERROR *****/ inline bool is_error(cudaError_t e) { return e != cudaSuccess; } -inline std::string GetCudaErrorWebsite(int32_t cuda_version) { - std::ostringstream webstr; - webstr << "https://docs.nvidia.com/cuda/"; - if (cuda_version != -1) { - double version = cuda_version / 10; - webstr << "archive/" << std::fixed << std::setprecision(1) << version; - } - webstr << "/cuda-runtime-api/group__CUDART__TYPES.html" - "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"; - return webstr.str(); -} - -inline std::string build_nvidia_error_msg(cudaError_t e) { -#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000 - int32_t cuda_version = 100; -#elif CUDA_VERSION >= 9000 - int32_t cuda_version = 90; -#else - int32_t cuda_version = -1; -#endif - std::ostringstream sout; - sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << "."; - static platform::proto::cudaerrorDesc cudaerror; - static bool _initSucceed = false; - if (cudaerror.ByteSizeLong() == 0) { - std::string filePath; -#if !defined(_WIN32) - Dl_info info; - if (dladdr(reinterpret_cast(GetCudaErrorWebsite), &info)) { - std::string strModule(info.dli_fname); - const size_t last_slash_idx = strModule.find_last_of("/"); - std::string compare_path = strModule.substr(strModule.length() - 6); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.so") == 0) { - filePath = strModule + - "/../include/third_party/cudaerror/data/cudaErrorMessage.pb"; - } else { - filePath = - strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb"; - } - } -#else - char buf[100]; - MEMORY_BASIC_INFORMATION mbi; - HMODULE h_module = - (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0) - ? 
(HMODULE)mbi.AllocationBase - : NULL; - GetModuleFileName(h_module, buf, 100); - std::string strModule(buf); - const size_t last_slash_idx = strModule.find_last_of("\\"); - std::string compare_path = strModule.substr(strModule.length() - 7); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.pyd") == 0) { - filePath = - strModule + - "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; - } else { - filePath = - strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; - } -#endif - std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = cudaerror.ParseFromIstream(&fin); - } - if (_initSucceed) { - for (int i = 0; i < cudaerror.allmessages_size(); ++i) { - if (cuda_version == cudaerror.allmessages(i).version()) { - for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) { - if (e == cudaerror.allmessages(i).messages(j).errorcode()) { - sout << "\n [Advise: " - << cudaerror.allmessages(i).messages(j).errormessage() << "]"; - return sout.str(); - } - } - } - } - } - sout << "\n [Advise: Please search for the error code(" << e - << ") on website( " << GetCudaErrorWebsite(cuda_version) - << " ) to get Nvidia's official solution about CUDA Error.]"; - return sout.str(); +inline std::string build_ex_string(cudaError_t e, const std::string& msg) { + return msg; } inline void throw_on_error(cudaError_t e, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(msg); + throw thrust::system_error(e, thrust::cuda_category(), msg); #else LOG(FATAL) << msg; #endif } -/** curand ERROR **/ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; } -inline const char* curandGetErrorString(curandStatus_t stat) { - switch (stat) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - default: - return "Unknown curand status"; - } -} - -inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::string msg(" Curand error, "); - return msg + curandGetErrorString(stat) + " "; +inline std::string build_ex_string(curandStatus_t stat, + const std::string& msg) { + return msg; } inline void throw_on_error(curandStatus_t stat, const std::string& msg) { @@ -619,14 +499,13 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) { #endif } -/***** CUDNN ERROR *****/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } -inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - 
std::string msg(" Cudnn error, "); - return msg + platform::dynload::cudnnGetErrorString(stat) + " "; +inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) { + return msg + "\n [Hint: " + platform::dynload::cudnnGetErrorString(stat) + + "]"; } inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { @@ -637,39 +516,33 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { #endif } -/***** CUBLAS ERROR *****/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -inline const char* cublasGetErrorString(cublasStatus_t stat) { - switch (stat) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - default: - return "Unknown cublas status"; +inline std::string build_ex_string(cublasStatus_t stat, + const std::string& msg) { + std::string err; + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS_STATUS_NOT_INITIALIZED"; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS_STATUS_ALLOC_FAILED"; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS_STATUS_INVALID_VALUE"; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + err = "CUBLAS_STATUS_ARCH_MISMATCH"; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS_STATUS_MAPPING_ERROR"; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS_STATUS_EXECUTION_FAILED"; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS_STATUS_INTERNAL_ERROR"; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS_STATUS_NOT_SUPPORTED"; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + err = "CUBLAS_STATUS_LICENSE_ERROR"; } -} - -inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cublasGetErrorString(stat) + " "; + return msg + "\n [Hint: " + err + "]"; } inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { @@ -680,15 +553,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { #endif } -/****** NCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { return nccl_result != ncclSuccess; } -inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::string msg(" Nccl error, "); - return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; +inline std::string build_ex_string(ncclResult_t nccl_result, + const std::string& msg) { + return msg + "\n [" + platform::dynload::ncclGetErrorString(nccl_result) + + "]"; } inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { @@ -698,8 +571,11 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { LOG(FATAL) << msg; #endif } -#endif // not(__APPLE__) and PADDLE_WITH_NCCL +#endif // __APPLE__ and windows + +#endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_CUDA 
namespace details { template @@ -722,28 +598,30 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details +#endif // PADDLE_WITH_CUDA -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::errors::External( \ - ::paddle::platform::build_nvidia_error_msg(__cond__)) \ - .ToString()); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#ifdef PADDLE_WITH_CUDA +#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CudaStatusType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, \ + ::paddle::platform::build_ex_string( \ + __cond__, \ + ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ + } catch (...) { \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } \ + } \ } while (0) #undef DEFINE_CUDA_STATUS_TYPE diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index db77ba95856d9..0057c784528c2 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -261,14 +261,15 @@ TEST(EOF_EXCEPTION, THROW_EOF) { #ifdef PADDLE_WITH_CUDA template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); return true; } template -bool CheckCudaStatusFailure(T value, const std::string& msg) { +bool CheckCudaStatusFailure( + T value, const std::string& msg = "self-defined cuda status failed") { try { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); @@ -278,29 +279,24 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error")); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue)); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation)); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); - EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); - EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); + EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH)); + EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED)); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); - EXPECT_TRUE( - CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED)); + 
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED)); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); - EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); - EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED)); + EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE)); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); - EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError)); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError)); #endif } #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c07abba9e8ef9..40d6bc54ccf92 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -41,13 +42,18 @@ faster way to query device properties. You can see details in https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ */ +inline std::string CudaErrorWebsite() { + return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api" + "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824" + "6db0a94a430e0038"; +} + static int GetCUDADeviceCountImpl() { int driverVersion = 0; cudaError_t status = cudaDriverGetVersion(&driverVersion); if (!(status == cudaSuccess && driverVersion != 0)) { // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; return 0; } @@ -61,8 +67,14 @@ static int GetCUDADeviceCountImpl() { return 0; } } + int count; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count)); + auto error_code = cudaGetDeviceCount(&count); + PADDLE_ENFORCE( + error_code, + "cudaGetDeviceCount failed in " + "paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } @@ -72,63 +84,72 @@ int GetCUDADeviceCount() { } int GetCUDAComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); auto minor_error_code = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); - PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code); - PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code); + PADDLE_ENFORCE_EQ( + major_error_code, 0, + "cudaDevAttrComputeCapabilityMajor failed in " + "paddle::platform::GetCUDAComputeCapability, error code : %d, %s", + major_error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_EQ( + minor_error_code, 0, + "cudaDevAttrComputeCapabilityMinor failed in " + "paddle::platform::GetCUDAComputeCapability, error code : %d, %s", + minor_error_code, CudaErrorWebsite()); return major * 10 + minor; } dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); dim3 ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x); + PADDLE_ENFORCE_EQ(error_code_x, 0, + "cudaDevAttrMaxGridDimX failed in " + "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", + error_code_x, CudaErrorWebsite()); ret.x = size; auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y); + PADDLE_ENFORCE_EQ(error_code_y, 0, + "cudaDevAttrMaxGridDimY failed in " + "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", + error_code_y, CudaErrorWebsite()); ret.y = size; auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z); + PADDLE_ENFORCE_EQ(error_code_z, 0, + "cudaDevAttrMaxGridDimZ failed in " + "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", + error_code_z, CudaErrorWebsite()); ret.z = size; return ret; } int GetCUDARuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int runtime_version = 0; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); + auto error_code = cudaRuntimeGetVersion(&runtime_version); + PADDLE_ENFORCE(error_code, + "cudaRuntimeGetVersion failed in " + "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s", + error_code, CudaErrorWebsite()); return runtime_version; } int GetCUDADriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int driver_version = 0; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version)); + auto error_code = cudaDriverGetVersion(&driver_version); + PADDLE_ENFORCE(error_code, + "cudaDriverGetVersion failed in " + "paddle::platform::GetCUDADriverVersion, error code : %d, %s", + error_code, CudaErrorWebsite()); return driver_version; } @@ -143,44 +164,56 @@ bool TensorCoreAvailable() { } int GetCUDAMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); + auto error_code = + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id); + PADDLE_ENFORCE(error_code, + "cudaDeviceGetAttribute failed in " + "paddle::platform::GetCUDAMultiProcess, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } int GetCUDAMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); + auto error_code = cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id); + PADDLE_ENFORCE( + error_code, + "cudaDeviceGetAttribute failed in paddle::" + "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } int GetCUDAMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT( + id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must less than GPU count, but received id is:%d, " + "GPU count is: %d.", + id, GetCUDADeviceCount())); int count; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); + auto error_code = + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id); + PADDLE_ENFORCE_EQ( + error_code, 0, + platform::errors::InvalidArgument( + "cudaDeviceGetAttribute returned error code should be 0, " + "but received error code is: %d, %s", + error_code, CudaErrorWebsite())); return count; } int GetCurrentDeviceId() { int device_id; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); + auto error_code = cudaGetDevice(&device_id); + PADDLE_ENFORCE(error_code, + "cudaGetDevice failed in " + "paddle::platform::GetCurrentDeviceId, error code : %d, %s", + error_code, CudaErrorWebsite()); return device_id; } @@ -204,12 +237,12 @@ std::vector GetSelectedDevices() { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id)); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + auto error_code = cudaSetDevice(id); + PADDLE_ENFORCE(error_code, + "cudaSetDevice failed in " + "paddle::platform::SetDeviced, error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemoryUsage(size_t *available, size_t *total) { @@ -273,44 +306,74 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); + auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream); + PADDLE_ENFORCE(error_code, + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " + "(%p -> %p, length: %d) error code : %d, %s", + src, dst, static_cast(count), error_code, + CudaErrorWebsite()); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); + auto error_code = cudaMemcpy(dst, src, count, kind); + PADDLE_ENFORCE(error_code, + "cudaMemcpy failed in paddle::platform::GpuMemcpySync " + "(%p -> %p, length: %d) error code : %d, %s", + src, dst, static_cast(count), error_code, + CudaErrorWebsite()); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); + auto error_code = + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream); + PADDLE_ENFORCE( + error_code, + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeer(dst, dst_device, src, src_device, count)); + auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count); + PADDLE_ENFORCE(error_code, + "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); + auto error_code = cudaMemsetAsync(dst, value, count, stream); + PADDLE_ENFORCE(error_code, + "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuStreamSync(cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + auto error_code = cudaStreamSynchronize(stream); + PADDLE_ENFORCE_CUDA_SUCCESS( + error_code, + platform::errors::External( + "cudaStreamSynchronize failed in paddle::platform::GpuStreamSync " + "error code : %d, %s", + error_code, CudaErrorWebsite())); } static void RaiseNonOutOfMemoryError(cudaError_t *status) { if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } + PADDLE_ENFORCE_CUDA_SUCCESS(*status); *status = cudaGetLastError(); if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } + PADDLE_ENFORCE_CUDA_SUCCESS(*status); } @@ -387,7 +450,8 @@ class RecordedCudaMallocHelper { CUDADeviceGuard guard(dev_id_); auto err = cudaFree(ptr); if (err != cudaErrorCudartUnloading) { - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, 
platform::errors::External("cudaFree raises unexpected error")); if (NeedRecord()) { std::lock_guard guard(*mtx_); cur_size_ -= size; diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index af27564b99f79..41d5180ffaf5f 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -117,7 +117,10 @@ void SynchronizeAllDevice() { int count = GetCUDADeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaDeviceSynchronize(), + platform::errors::External( + "Device synchronize failed in cudaDeviceSynchronize()")); } #endif } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 7a090ff7e5166..739892eafd824 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -30,10 +30,13 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) { CUDADeviceGuard guard(boost::get(place_).device); if (priority == Priority::kHigh) { PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1)); + cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1), + platform::errors::Fatal("High priority cuda stream creation failed.")); } else if (priority == Priority::kNormal) { PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); + cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0), + platform::errors::Fatal( + "Normal priority cuda stream creation failed.")); } callback_manager_.reset(new StreamCallbackManager(stream_)); VLOG(3) << "CUDAStream Init stream: " << stream_ @@ -46,7 +49,9 @@ void CUDAStream::Destroy() { Wait(); WaitCallback(); if (stream_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamDestroy(stream_), + platform::errors::Fatal("Cuda stream destruction failed.")); } stream_ = nullptr; } @@ -62,7 +67,10 @@ void CUDAStream::Wait() const { } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); + PADDLE_ENFORCE_CUDA_SUCCESS( + e_sync, platform::errors::Fatal( + "cudaStreamSynchronize raises error: %s, errono: %d", + cudaGetErrorString(e_sync), static_cast(e_sync))); } } // namespace stream diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 57e763d527624..f7149f1e13098 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -53,15 +53,21 @@ class CUDAStream final { template void RecordEvent(cudaEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(ev, stream_), + platform::errors::Fatal("CUDA event recording failed.")); } void RecordEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(ev, stream_), + platform::errors::Fatal("CUDA event recording failed.")); } void WaitEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(stream_, ev, 0), + platform::errors::Fatal("Failed to wait event.")); } void Wait() const; diff --git a/python/setup.py.in b/python/setup.py.in index f6ec0fa26b984..906615e7efb42 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -215,14 +215,12 @@ if 
'${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: # the prefix is sys.prefix which should always be usr paddle_bins = '' - if not '${WIN32}': paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]} if '${HAS_NOAVX_CORE}' == 'ON': package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] - package_dir={ '': '${PADDLE_BINARY_DIR}/python', # The paddle.fluid.proto will be generated while compiling. @@ -333,7 +331,6 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) + - list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen @@ -405,9 +402,7 @@ class InstallHeaders(Command): return self.copy_file(header, install_dir) def run(self): - # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows if os.name == 'nt' or sys.platform == 'darwin': - self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb') return hdrs = self.distribution.headers if not hdrs: diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 3e079d0433f87..51330bea8ea62 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` -VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` +ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` +VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true` if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. 
If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n" diff --git a/tools/count_invalid_enforce.sh b/tools/count_invalid_enforce.sh index fe99674f6bec6..a2dbc22119d75 100644 --- a/tools/count_invalid_enforce.sh +++ b/tools/count_invalid_enforce.sh @@ -30,9 +30,9 @@ ALL_PADDLE_CHECK_CNT=0 VALID_PADDLE_CHECK_CNT=0 function enforce_scan(){ - paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` + paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` total_check_cnt=`echo "$paddle_check" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` - valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` + valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` eval $2=$total_check_cnt eval $3=$valid_check_cnt } diff --git a/tools/cudaError/README.md b/tools/cudaError/README.md deleted file mode 100644 index df7434c33a9fd..0000000000000 --- a/tools/cudaError/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Usage: - -Please run: -``` -bash start.sh -``` - -The error message of CUDA9.0 / CUDA10.0 / CUDA-latest-version will be crawled by default. - -If you want to crawl a specified version of CUDA, Please run: -``` -bash start.sh -``` -URL can be derived by default, so you don't have to enter a URL. - -for example: -``` -bash start.sh 11.0 -``` -will capture error message of CUDA11.0(in future). - -Every time when Nvidia upgrade the CUDA major version, you need to run `bash start.sh` in current directory, and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py deleted file mode 100644 index c2c3dc97f4222..0000000000000 --- a/tools/cudaError/spider.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ssl -import re -import urllib2 -import json -import collections -import sys, getopt -import cuda_error_pb2 - - -def parsing(cuda_errorDesc, version, url): - All_Messages = cuda_errorDesc.AllMessages.add() - All_Messages.version = int(version) - - ssl._create_default_https_context = ssl._create_unverified_context - html = urllib2.urlopen(url).read() - res_div = r'
<div class="section">.*?<h2 class="title sectiontitle">CUDA error types</h2>.*?<div class="enum-members">(.*?)</div>'
-    m_div = re.findall(res_div, html, re.S | re.M)
-
-    url_list = url.split('/')
-    url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
-
-    dic = collections.OrderedDict()
-    dic_message = collections.OrderedDict()
-    for line in m_div:
-        res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
-        m_dt = re.findall(res_dt, line, re.S | re.M)
-        for error in m_dt:
-            res_type = r'<span class="ph ph apiData">(.*?)</span>'
-            m_type = re.findall(res_type, error[0], re.S | re.M)[0]
-            m_message = error[1]
-            m_message = m_message.replace('\n', '')
-            res_a = r'(<a class=.*?</a>)'
-            res_shape = r'<a class=.*?>(.*?)</a>'
-            list_a = re.findall(res_a, m_message, re.S | re.M)
-            list_shape = re.findall(res_shape, m_message, re.S | re.M)
-            assert len(list_a) == len(list_shape)
-            for idx in range(len(list_a)):
-                m_message = m_message.replace(list_a[idx], list_shape[idx])
-
-            m_message = m_message.replace(
-                '<h6 class=\"deprecated_header\">Deprecated</h6>', '')
-
-            res_span = r'(<span class=.*?</span>)'
-            res_span_detail = r'<span class=.*?>(.*?)</span>'
-            list_span = re.findall(res_span, m_message, re.S | re.M)
-            list_span_detail = re.findall(res_span_detail, m_message, re.S |
-                                          re.M)
-            assert len(list_span) == len(list_span_detail)
-            for idx in range(len(list_span)):
-                m_message = m_message.replace(list_span[idx],
-                                              list_span_detail[idx])
-
-            res_p = r'(<p class="p">.*?</p>)'
-            res_p_detail = r'<p class="p">(.*?)</p>'
-            list_p = re.findall(res_p, m_message, re.S | re.M)
-            list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
-            assert len(list_p) == len(list_p_detail)
-            for idx in range(len(list_p)):
-                m_message = m_message.replace(list_p[idx], list_p_detail[idx])
-
-            m_message = m_message.replace(' ', '')
-            _Messages = All_Messages.Messages.add()
-            try:
-                _Messages.errorCode = int(m_type)
-            except ValueError:
-                if re.match('0x', m_type):
-                    _Messages.errorCode = int(m_type, 16)
-                else:
-                    raise ValueError
-            _Messages.errorMessage = m_message  # save for cudaErrorMessage.pb from python-protobuf interface
-
-
-def main(argv):
-    version = []
-    url = []
-    try:
-        opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
-    except getopt.GetoptError:
-        print 'python spider.py -v <version> -u <url>'
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt in ("-h", "--help"):
-            print 'python spider.py -v <version> -u <url>'
-            sys.exit()
-        elif opt in ("-v", "--version"):
-            version = arg
-        elif opt in ("-u", "--url"):
-            url = arg
-    version = version.split(',')
-    url = url.split(',')
-    assert len(version) == len(url)
-    cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
-    for idx in range(len(version)):
-        if version[idx] == "-1":
-            print("crawling errorMessage for CUDA%s from %s" %
-                  ("-latest-version", url[idx]))
-        else:
-            print("crawling errorMessage for CUDA%s from %s" %
-                  (version[idx], url[idx]))
-        parsing(cuda_errorDesc, version[idx], url[idx])
-
-    serializeToString = cuda_errorDesc.SerializeToString()
-    with open("cudaErrorMessage.pb", "wb") as f:
-        f.write(serializeToString
-                )  # save for cudaErrorMessage.pb from python-protobuf interface
-    print("crawling errorMessage for CUDA has been done!!!")
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/tools/cudaError/start.sh b/tools/cudaError/start.sh
deleted file mode 100644
index 3c0e57ffe7ec1..0000000000000
--- a/tools/cudaError/start.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-set -ex
-SYSTEM=`uname -s`
-rm -f protoc-3.11.3-linux-x86_64.*
-if [ "$SYSTEM" == "Linux" ]; then
-    wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip
-    unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip
-    rm protoc-3.11.3-linux-x86_64.*
-elif [ "$SYSTEM" == "Darwin" ]; then
-    wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip
-    unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip
-    rm protoc-3.11.3-osx-x86_64.*
-else
-    echo "please run on Mac/Linux"
-    exit 1
-fi
-protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
-
-version=90,100,-1  # -1 represent the latest cuda-version
-url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
-
-if [ "$1" != "" ]; then
-    version=$version,$(($1*10))
-    if [ "$2" != "" ]; then
-        url=$url,$2
-    else
-        url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
-    fi
-fi
-
-python spider.py --version=$version --url=$url
-tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb
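
Reviewer notes:

The core of this revert is in the gpu_info.cc hunks: the single-argument `PADDLE_ENFORCE_CUDA_SUCCESS(...)` checks are replaced by the older pattern of capturing the `cudaError_t` first and then passing it to `PADDLE_ENFORCE` together with a hand-written message carrying the failing call, the numeric code, and the `CudaErrorWebsite()` link. The sketch below contrasts the two styles outside of Paddle; `CHECK_CUDA_SUCCESS` is a stand-in macro defined locally (not the Paddle macro), and `std::abort` stands in for the exception that `PADDLE_ENFORCE` would raise.

```cpp
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Stand-in for the macro style: the message is derived from the call itself
// and the runtime's own error string, so call sites stay one line long.
#define CHECK_CUDA_SUCCESS(call)                              \
  do {                                                        \
    cudaError_t err = (call);                                 \
    if (err != cudaSuccess) {                                 \
      std::fprintf(stderr, "%s failed: %s\n", #call,          \
                   cudaGetErrorString(err));                  \
      std::abort();                                           \
    }                                                         \
  } while (0)

int main() {
  // Restored style: capture the code, then format the message by hand,
  // embedding the numeric code and a pointer to the CUDA docs.
  int count = 0;
  cudaError_t error_code = cudaGetDeviceCount(&count);
  if (error_code != cudaSuccess) {
    std::fprintf(stderr,
                 "cudaGetDeviceCount failed, error code : %d, see "
                 "https://docs.nvidia.com/cuda/cuda-runtime-api/"
                 "group__CUDART__TYPES.html\n",
                 static_cast<int>(error_code));
    return 1;
  }

  CHECK_CUDA_SUCCESS(cudaSetDevice(0));  // macro style, for contrast
  std::printf("found %d CUDA device(s)\n", count);
  return 0;
}
```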
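
Most of the reverted helpers in gpu_info.cc (GetCUDAComputeCapability, GetCUDAMaxThreadsPerBlock, and friends) are thin wrappers over `cudaDeviceGetAttribute`. A minimal sketch of that pattern, assuming device 0 exists (the real helpers first range-check the id against `GetCUDADeviceCount()`):

```cpp
#include <cstdio>

#include <cuda_runtime.h>

int main() {
  const int id = 0;  // device id, assumed valid here
  int major = 0, minor = 0, threads = 0;
  // cudaSuccess is 0, so any nonzero return value is an error.
  if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id) ||
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id) ||
      cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, id)) {
    std::fprintf(stderr, "cudaDeviceGetAttribute failed\n");
    return 1;
  }
  // GetCUDAComputeCapability packs the two attributes as major * 10 + minor.
  std::printf("compute capability: %d, max threads per block: %d\n",
              major * 10 + minor, threads);
  return 0;
}
```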
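
The `RaiseNonOutOfMemoryError` change restores a filter that treats `cudaErrorMemoryAllocation` as recoverable (the allocator retries or falls back) while still failing fast on anything else, and that drains the runtime's sticky per-thread error with `cudaGetLastError()` so it cannot poison later calls. A standalone sketch of that logic, with `std::abort` again standing in for `PADDLE_ENFORCE_CUDA_SUCCESS`:

```cpp
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

void RaiseNonOutOfMemoryError(cudaError_t* status) {
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;  // OOM is recoverable; the caller handles it
  }
  if (*status != cudaSuccess) std::abort();

  *status = cudaGetLastError();  // consume the sticky error state
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  if (*status != cudaSuccess) std::abort();
}

int main() {
  void* p = nullptr;
  // An oversized request that should fail with cudaErrorMemoryAllocation.
  cudaError_t status = cudaMalloc(&p, static_cast<size_t>(1) << 48);
  RaiseNonOutOfMemoryError(&status);  // swallows the OOM, aborts on others
  std::printf("status after filtering: %d\n", static_cast<int>(status));
  return 0;
}
```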
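
In the cuda_stream.cc hunk, `Priority::kHigh` maps to priority -1 and `Priority::kNormal` to 0, relying on the CUDA convention that numerically lower values are more urgent and 0 is the default. The valid range is device-dependent and can be queried rather than hardcoded; a sketch of that, assuming the patch's `kDefaultFlag` corresponds to `cudaStreamDefault`:

```cpp
#include <cstdio>

#include <cuda_runtime.h>

int main() {
  int least = 0, greatest = 0;
  // "greatest" is the most urgent priority and is numerically the smallest
  // value (for example -1); "least" is typically 0, the default priority.
  if (cudaDeviceGetStreamPriorityRange(&least, &greatest) != cudaSuccess) {
    std::fprintf(stderr, "failed to query stream priority range\n");
    return 1;
  }
  std::printf("stream priorities: least %d, greatest %d\n", least, greatest);

  cudaStream_t high = nullptr, normal = nullptr;
  cudaStreamCreateWithPriority(&high, cudaStreamDefault, greatest);
  cudaStreamCreateWithPriority(&normal, cudaStreamDefault, 0);

  cudaStreamDestroy(high);
  cudaStreamDestroy(normal);
  return 0;
}
```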