From 6f59331f52beb5cbd6df1b5dcc65a71b100441bf Mon Sep 17 00:00:00 2001 From: zhouwei25 Date: Wed, 22 Apr 2020 00:34:00 +0800 Subject: [PATCH] Revert 'Optimize the error messages of paddle CUDA API (#23816)',test=develop --- cmake/inference_lib.cmake | 15 +- cmake/third_party.cmake | 35 +-- .../framework/details/nan_inf_utils_detail.cu | 4 +- .../tensorrt/plugin/split_op_plugin.cu | 27 +- .../cuda_device_context_allocator.h | 15 +- paddle/fluid/operators/argsort_op.cu | 14 +- .../operators/fused/fused_bn_activation_op.cu | 90 ++++-- .../fusion_transpose_flatten_concat_op.cu.cc | 43 ++- .../operators/grid_sampler_cudnn_op.cu.cc | 15 +- paddle/fluid/operators/math/blas_impl.cu.h | 16 +- paddle/fluid/operators/mean_op.cu | 10 +- .../fluid/operators/reader/buffered_reader.cc | 18 +- .../fluid/operators/sync_batch_norm_op.cu.h | 18 +- paddle/fluid/platform/CMakeLists.txt | 4 +- paddle/fluid/platform/cuda_error.proto | 35 --- paddle/fluid/platform/cuda_helper.h | 9 +- paddle/fluid/platform/cuda_resource_pool.cc | 18 +- paddle/fluid/platform/device_context.h | 13 +- paddle/fluid/platform/enforce.h | 290 +++++------------- paddle/fluid/platform/enforce_test.cc | 32 +- paddle/fluid/platform/gpu_info.cc | 196 ++++++++---- paddle/fluid/platform/profiler_helper.h | 5 +- paddle/fluid/platform/stream/cuda_stream.cc | 16 +- paddle/fluid/platform/stream/cuda_stream.h | 12 +- python/setup.py.in | 5 - tools/check_api_approvals.sh | 4 +- tools/count_invalid_enforce.sh | 4 +- tools/cudaError/README.md | 22 -- tools/cudaError/spider.py | 124 -------- tools/cudaError/start.sh | 32 -- 30 files changed, 496 insertions(+), 645 deletions(-) delete mode 100644 paddle/fluid/platform/cuda_error.proto delete mode 100644 tools/cudaError/README.md delete mode 100644 tools/cudaError/spider.py delete mode 100644 tools/cudaError/start.sh diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a52d91741aff7..30e96b8e5f69e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -135,12 +135,6 @@ copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) -set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") -copy(inference_lib_dist - SRCS ${cudaerror_INCLUDE_DIR} - DSTS ${dst_dir}) - -# CMakeCache Info copy(inference_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${FLUID_INFERENCE_INSTALL_DIR}) @@ -190,7 +184,7 @@ copy(fluid_lib_dist ) set(module "framework") -set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto) +set(framework_lib_deps framework_proto) add_dependencies(fluid_lib_dist ${framework_lib_deps}) copy(fluid_lib_dist SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h @@ -210,11 +204,11 @@ copy(fluid_lib_dist ) set(module "platform") -set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto) +set(platform_lib_deps profiler_proto) add_dependencies(fluid_lib_dist ${platform_lib_deps}) copy(fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h 
${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "string") @@ -255,7 +249,6 @@ copy(inference_lib_dist SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) - # CMakeCache Info copy(fluid_lib_dist SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 9c8a9e0af1c12..3b3a43a69a9ef 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -include(ExternalProject) # Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -22,7 +21,6 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING "A path cache third party source code to avoid repeated download.") set(THIRD_PARTY_BUILD_TYPE Release) -set(third_party_deps) # cache funciton to avoid repeat download code of third_party. # This function has 4 parameters, URL / REPOSITOR / TAG / DIR: @@ -102,32 +100,6 @@ MACRO(UNSET_VAR VAR_NAME) UNSET(${VAR_NAME}) ENDMACRO() -# Funciton to Download the dependencies during compilation -# This function has 2 parameters, URL / DIRNAME: -# 1. URL: The download url of 3rd dependencies -# 2. NAME: The name of file, that determin the dirname -# -MACRO(file_download_and_uncompress URL NAME) - MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}") - SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}") - SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data) - ExternalProject_Add( - ${EXTERNAL_PROJECT_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/ - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) - list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME}) -ENDMACRO() - - # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) @@ -206,13 +178,10 @@ include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc +set(third_party_deps) list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) -# download file -set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) -file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") - if(WITH_AMD_GPU) include(external/rocprim) # download, build, install rocprim list(APPEND third_party_deps extern_rocprim) @@ -305,4 +274,4 @@ if (WITH_LITE) include(external/lite) endif (WITH_LITE) -add_custom_target(third_party ALL DEPENDS ${third_party_deps}) +add_custom_target(third_party DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index f9f91680e3401..44668e491eb29 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ 
b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -152,7 +152,9 @@ void TensorCheckerVisitor::apply( PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, - cudaMemcpyHostToDevice, dev_ctx->stream())); + cudaMemcpyHostToDevice, dev_ctx->stream()), + platform::errors::External( + "Async cudaMemcpy op_var info to gpu failed.")); } else { // get auto iter = op_var2gpu_str.find(op_var); PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 9eefb925d2061..7a032acef676b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -124,9 +124,12 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, float const* input_ptr = reinterpret_cast(inputs[0]); float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( - output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(output_ptrs, h_odatas, + d_output_ptrs_.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream), + platform::errors::External( + "CUDA Memcpy failed during split plugin run.")); int outer_rows = outer_rows_ * batchSize; @@ -241,9 +244,12 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( - output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(output_ptrs, h_odatas, + d_output_ptrs.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream), + platform::errors::External( + "CUDA Memcpy failed during split plugin run.")); split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, @@ -257,9 +263,12 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, half* const* h_odatas = reinterpret_cast(outputs); half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( - output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*), - cudaMemcpyHostToDevice, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpyAsync(output_ptrs, h_odatas, + d_output_ptrs.size() * sizeof(half*), + cudaMemcpyHostToDevice, stream), + platform::errors::External( + "CUDA Memcpy failed during split plugin run.")); split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 2163562a6080b..0997f575acc4e 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -80,13 +80,17 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreate(&event_, cudaEventDisableTiming)); + cudaEventCreate(&event_, cudaEventDisableTiming), + platform::errors::External( + "Create event failed in 
CUDADeviceContextAllocator")); } ~CUDADeviceContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventDestroy(event_), + "Destory event failed in CUDADeviceContextAllocator destroctor"); } } @@ -99,9 +103,12 @@ class CUDADeviceContextAllocator : public Allocator { auto allocation = new CUDADeviceContextAllocation(memory::Alloc(place_, size)); // Wait for the event on stream - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + cudaEventRecord(event_, default_stream_), + "Failed to record event in CUDADeviceContextAllocator"); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(default_stream_, event_, 0), + "Failed to wait event in CUDADeviceContextAllocator"); return allocation; } diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index cbd7e33bc6b72..006bf559195aa 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -141,7 +141,12 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, + "ArgSortOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate" + "temp_storage_bytes, status:%s.", + temp_storage_bytes, cudaGetErrorString(err)); Tensor temp_storage; temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); @@ -160,7 +165,12 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, + "ArgSortOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " + "temp_storage_bytes:%d status:%s.", + temp_storage_bytes, cudaGetErrorString(err)); } template diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 32eaf1180977a..2e308657936c0 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -108,21 +108,32 @@ class FusedBatchNormActKernel cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&bn_param_desc_).")); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data()), + platform::errors::External( + "The error has happened when calling cudnnSetTensorNdDescriptor.")); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + data_desc_, mode_), + platform::errors::External("The error has happened when calling " + "cudnnDeriveBNTensorDescriptor.")); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; @@ -155,7 +166,10 @@ class FusedBatchNormActKernel /*yDesc=*/data_desc_, /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); + /*sizeInBytes=*/&workspace_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize.")); // -------------- cudnn batchnorm reserve space -------------- PADDLE_ENFORCE_CUDA_SUCCESS( @@ -165,7 +179,10 @@ class FusedBatchNormActKernel /*bnOps=*/bnOps_, /*activationDesc=*/activation_desc_, /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); + /*sizeInBytes=*/&reserve_space_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize.")); reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), reserve_space_size); @@ -187,13 +204,22 @@ class FusedBatchNormActKernel saved_variance->template mutable_data>( ctx.GetPlace()), activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); + reserve_space_size), + platform::errors::External( + "The error has happened when calling " + "cudnnBatchNormalizationForwardTrainingEx.")); // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(bn_param_desc_).")); } }; @@ -272,9 +298,15 @@ class FusedBatchNormActGradKernel cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnCreateTensorDescriptor(&bn_param_desc_).")); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " @@ -282,12 +314,17 @@ class FusedBatchNormActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()), + platform::errors::External( + "The error has happened when calling cudnnSetTensorNdDescriptor.")); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + data_desc_, mode_), + platform::errors::External("The error has happened when calling " + "cudnnDeriveBNTensorDescriptor.")); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -317,7 +354,10 @@ class FusedBatchNormActGradKernel /*dxDesc=*/data_desc_, /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, /*activationDesc=*/activation_desc_, - /*sizeInBytes=*/&workspace_size)); + /*sizeInBytes=*/&workspace_size), + platform::errors::External( + "The error has happened when calling " + "cudnnGetBatchNormalizationBackwardExWorkspaceSize.")); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); @@ -355,13 +395,21 @@ class FusedBatchNormActGradKernel /*workspace=*/workspace_ptr, /*workSpaceSizeInBytes=*/workspace_size, /*reserveSpace=*/const_cast(reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); + /*reserveSpaceSizeInBytes=*/reserve_space_size), + platform::errors::External("The error has happened when calling " + "cudnnBatchNormalizationBackwardEx.")); // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(data_desc_).")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_), + platform::errors::External( + "The error has happened when calling " + "cudnnDestroyTensorDescriptor(bn_param_desc_).")); } }; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 17cb4556d45ef..b61ef8e566b77 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -46,9 +46,13 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); + platform::dynload::cudnnCreateTensorDescriptor(&in_desc), + platform::errors::External("Create cudnn tensor descriptor failed in " + "transpose_flatten_concat_fusion op.")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); + platform::dynload::cudnnCreateTensorDescriptor(&out_desc), + platform::errors::External("Create cudnn tensor descriptor failed in " + "transpose_flatten_concat_fusion op.")); cudnnDataType_t cudnn_dtype = CudnnDataType::type; auto& dev_ctx = ctx.template device_context(); @@ -87,15 +91,24 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { dims_y[i] = 1; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); - 
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); - - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( - handle, CudnnDataType::kOne(), in_desc, - static_cast(ins[k]->data()), - CudnnDataType::kZero(), out_desc, static_cast(odata))); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()), + platform::errors::External("Create cudnn tensorNd descriptor failed " + "in transpose_flatten_concat op.")); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetTensorNdDescriptor( + out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()), + platform::errors::External("Create cudnn tensorNd descriptor failed " + "in transpose_flatten_concat op.")); + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), in_desc, + static_cast(ins[k]->data()), + CudnnDataType::kZero(), out_desc, static_cast(odata)), + platform::errors::External("Create cudnn transform tensor failed in " + "transpose_flatten_concat op.")); if (concat_axis == 0) { odata += osize; } else { @@ -104,9 +117,13 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { } } PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); + platform::dynload::cudnnDestroyTensorDescriptor(in_desc), + platform::errors::External( + "Destory cudnn descriptor failed in transpose_flatten_concat op.")); PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); + platform::dynload::cudnnDestroyTensorDescriptor(out_desc), + platform::errors::External( + "Destory cudnn descriptor failed in transpose_flatten_concat op.")); } }; diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 3bf34fc685ee8..c266b0d32b14a 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -60,10 +60,13 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( - handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, - input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, - output_data)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSpatialTfSamplerForward( + handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, + input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, + output_data), + platform::errors::InvalidArgument( + "cudnnSpatialTfSamplerForward in Op(grid_sampler) failed")); } }; @@ -119,7 +122,9 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, output_grad_data, grid_data, CudnnDataType::kZero(), - grid_grad_data)); + grid_grad_data), + platform::errors::InvalidArgument( + "cudnnSpatialTfSamplerBackward in Op(grid_sampler) failed")); } }; diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 8e903a4eccc74..c0ab35b0e753c 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -41,12 +41,16 @@ struct 
CUBlas { template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasSscal(args...), + platform::errors::External("dynload cublasSscal lib failed")); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasScopy(args...), + platform::errors::External("dynload cublasScopy lib failed")); } template @@ -104,12 +108,16 @@ struct CUBlas { template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasDscal(args...), + platform::errors::External("dynload cublasDscal lib failed")); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasDcopy(args...), + platform::errors::External("dynload cublasDcopy lib failed")); } template diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 081c077ab73c2..d2b01fafb731f 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -59,14 +59,20 @@ class MeanCUDAKernel : public framework::OpKernel { auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, platform::errors::External( + "MeanOP failed to get reduce workspace size %s.", + cudaGetErrorString(err))); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), context.GetPlace()); err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, platform::errors::External( + "MeanOP failed to run CUDA reduce computation: %s.", + cudaGetErrorString(err))); } }; diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index e72820611d3a9..b237df130abcc 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -104,9 +104,13 @@ void BufferedReader::ReadAsync(size_t i) { // gpu memory immediately without waiting gpu kernel ends platform::SetDeviceId(boost::get(place_).device); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventRecord(events_[i].get(), compute_stream_)); + cudaEventRecord(events_[i].get(), compute_stream_), + platform::errors::Fatal( + "cudaEventRecord raises unexpected exception")); PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); + cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0), + platform::errors::Fatal( + "cudaStreamWaitEvent raises unexpected exception")); platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { @@ -134,11 +138,17 @@ void BufferedReader::ReadAsync(size_t i) { size); memory::Copy(boost::get(place_), gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, size, stream_.get()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamSynchronize(stream_.get()), + platform::errors::Fatal( + "cudaStreamSynchronize raises unexpected exception")); } gpu[i].set_lod(cpu[i].lod()); } - 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamSynchronize(stream_.get()), + platform::errors::Fatal( + "cudaStreamSynchronize raises unexpected exception")); } #endif return i; diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index cfb9e16942c25..083d22aa2a38a 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -191,9 +191,12 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, if (comm) { int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( - stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, - comm, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1, + static_cast(dtype), + ncclSum, comm, stream), + platform::errors::InvalidArgument( + "ncclAllReduce in Op(sync_batch_norm) failed")); } #endif @@ -465,9 +468,12 @@ void SyncBatchNormGradFunctor( if (comm) { int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( - stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, - comm, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclAllReduce(stats, stats, 2 * C + 1, + static_cast(dtype), + ncclSum, comm, stream), + platform::errors::InvalidArgument( + "ncclAllReduce in Op(sync_batch_norm) failed")); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d0d74f6ea87f1..ddf3035a92754 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,6 +1,6 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(error_codes_proto SRCS error_codes.proto) -proto_library(cuda_error_proto SRCS cuda_error.proto) + if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -28,7 +28,7 @@ cc_library(flags SRCS flags.cc DEPS gflags) cc_library(errors SRCS errors.cc DEPS error_codes_proto) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors cuda_error_proto) +cc_library(enforce INTERFACE SRCS enforce.cc DEPS flags errors) cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) set(CPU_INFO_DEPS gflags glog enforce) diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/cuda_error.proto deleted file mode 100644 index b55e0af81ee6f..0000000000000 --- a/paddle/fluid/platform/cuda_error.proto +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -syntax = "proto2"; -package paddle.platform.proto; - -message MessageDesc { - // Indicates the type of error - required int32 errorCode = 1; - // Indicates the message of error - required string errorMessage = 2; -} - -message AllMessageDesc { - // Version of cuda API - required int32 version = 1; - // Error messages of different errortype - repeated MessageDesc Messages = 2; -} - -message cudaerrorDesc { - // Error messages of different cuda versions(9.0/10.0/10.2) - repeated AllMessageDesc AllMessages = 2; -} \ No newline at end of file diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 74cf5545239f1..54f5e911e3d0f 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -29,7 +29,14 @@ namespace platform { class CublasHandleHolder { public: CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cublasCreate(&handle_), + platform::errors::External( + "The cuBLAS library was not initialized. This is usually caused by " + "an error in the CUDA Runtime API called by the cuBLAS routine, or " + "an error in the hardware setup.\n" + "To correct: check that the hardware, an appropriate version of " + "the driver, and the cuBLAS library are correctly installed.")); PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); #if CUDA_VERSION >= 9000 if (math_type == CUBLAS_TENSOR_OP_MATH) { diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc index 65c8b96028ace..1828f0760a79a 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/cuda_resource_pool.cc @@ -27,13 +27,18 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); cudaStream_t stream; PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), + platform::errors::Fatal( + "cudaStreamCreateWithFlags raises unexpected exception")); return stream; }; auto deleter = [dev_idx](cudaStream_t stream) { platform::SetDeviceId(dev_idx); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamDestroy(stream), + platform::errors::Fatal( + "cudaStreamDestroy raises unexpected exception")); }; pool_.emplace_back( @@ -67,13 +72,18 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); cudaEvent_t event; PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + cudaEventCreateWithFlags(&event, cudaEventDisableTiming), + platform::errors::Fatal( + "cudaEventCreateWithFlags raises unexpected exception")); return event; }; auto deleter = [dev_idx](cudaEvent_t event) { platform::SetDeviceId(dev_idx); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventDestroy(event), + platform::errors::Fatal( + "cudaEventDestroy raises unexpected exception")); }; pool_.emplace_back(ResourcePool::Create(creator, deleter)); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 529992b47e273..e32c8d4ea6a52 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -162,9 +162,14 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible CUDNN " "version."; } - 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetStream(cudnn_handle_, RawStream())); + dynload::cudnnCreate(&cudnn_handle_), + platform::errors::Fatal( + "Failed to create Cudnn handle in DeviceContext")); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnSetStream(cudnn_handle_, RawStream()), + platform::errors::Fatal( + "Failed to set stream for Cudnn handle in DeviceContext")); } else { cudnn_handle_ = nullptr; } @@ -172,7 +177,9 @@ class CUDAContext { void DestoryCuDNNContext() { if (cudnn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroy(cudnn_handle_), + platform::errors::Fatal("Failed to destory Cudnn handle")); } cudnn_handle_ = nullptr; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f2e0c52170b60..99f83d9732029 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,13 +18,6 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ -#if !defined(_WIN32) -#include // dladdr -#else // _WIN32 -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#include // GetModuleFileName -#endif - #ifdef PADDLE_WITH_CUDA #include #include @@ -45,7 +38,6 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" -#include "paddle/fluid/platform/cuda_error.pb.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/port.h" @@ -228,6 +220,10 @@ inline std::string GetTraceBackString(StrType&& what, const char* file, inline bool is_error(bool stat) { return !stat; } +inline std::string build_ex_string(bool stat, const std::string& msg) { + return msg; +} + inline void throw_on_error(bool stat, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(msg); @@ -288,21 +284,23 @@ struct EnforceNotMet : public std::exception { } \ } while (0) #else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, \ + ::paddle::platform::build_ex_string( \ + __cond__, \ + ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ + } catch (...) 
{ \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } \ + } \ } while (0) #endif @@ -466,148 +464,30 @@ struct EOFException : public std::exception { } while (0) /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ + #ifdef PADDLE_WITH_CUDA -/***** CUDA ERROR *****/ inline bool is_error(cudaError_t e) { return e != cudaSuccess; } -inline std::string GetCudaErrorWebsite(int32_t cuda_version) { - std::ostringstream webstr; - webstr << "https://docs.nvidia.com/cuda/"; - if (cuda_version != -1) { - double version = cuda_version / 10; - webstr << "archive/" << std::fixed << std::setprecision(1) << version; - } - webstr << "/cuda-runtime-api/group__CUDART__TYPES.html" - "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"; - return webstr.str(); -} - -inline std::string build_nvidia_error_msg(cudaError_t e) { -#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000 - int32_t cuda_version = 100; -#elif CUDA_VERSION >= 9000 - int32_t cuda_version = 90; -#else - int32_t cuda_version = -1; -#endif - std::ostringstream sout; - sout << " Cuda error(" << e << "), " << cudaGetErrorString(e) << "."; - static platform::proto::cudaerrorDesc cudaerror; - static bool _initSucceed = false; - if (cudaerror.ByteSizeLong() == 0) { - std::string filePath; -#if !defined(_WIN32) - Dl_info info; - if (dladdr(reinterpret_cast(GetCudaErrorWebsite), &info)) { - std::string strModule(info.dli_fname); - const size_t last_slash_idx = strModule.find_last_of("/"); - std::string compare_path = strModule.substr(strModule.length() - 6); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.so") == 0) { - filePath = strModule + - "/../include/third_party/cudaerror/data/cudaErrorMessage.pb"; - } else { - filePath = - strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb"; - } - } -#else - char buf[100]; - MEMORY_BASIC_INFORMATION mbi; - HMODULE h_module = - (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0) - ? 
(HMODULE)mbi.AllocationBase - : NULL; - GetModuleFileName(h_module, buf, 100); - std::string strModule(buf); - const size_t last_slash_idx = strModule.find_last_of("\\"); - std::string compare_path = strModule.substr(strModule.length() - 7); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.pyd") == 0) { - filePath = - strModule + - "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; - } else { - filePath = - strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; - } -#endif - std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = cudaerror.ParseFromIstream(&fin); - } - if (_initSucceed) { - for (int i = 0; i < cudaerror.allmessages_size(); ++i) { - if (cuda_version == cudaerror.allmessages(i).version()) { - for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) { - if (e == cudaerror.allmessages(i).messages(j).errorcode()) { - sout << "\n [Advise: " - << cudaerror.allmessages(i).messages(j).errormessage() << "]"; - return sout.str(); - } - } - } - } - } - sout << "\n [Advise: Please search for the error code(" << e - << ") on website( " << GetCudaErrorWebsite(cuda_version) - << " ) to get Nvidia's official solution about CUDA Error.]"; - return sout.str(); +inline std::string build_ex_string(cudaError_t e, const std::string& msg) { + return msg; } inline void throw_on_error(cudaError_t e, const std::string& msg) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(msg); + throw thrust::system_error(e, thrust::cuda_category(), msg); #else LOG(FATAL) << msg; #endif } -/** curand ERROR **/ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; } -inline const char* curandGetErrorString(curandStatus_t stat) { - switch (stat) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - default: - return "Unknown curand status"; - } -} - -inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::string msg(" Curand error, "); - return msg + curandGetErrorString(stat) + " "; +inline std::string build_ex_string(curandStatus_t stat, + const std::string& msg) { + return msg; } inline void throw_on_error(curandStatus_t stat, const std::string& msg) { @@ -619,14 +499,13 @@ inline void throw_on_error(curandStatus_t stat, const std::string& msg) { #endif } -/***** CUDNN ERROR *****/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } -inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - 
std::string msg(" Cudnn error, "); - return msg + platform::dynload::cudnnGetErrorString(stat) + " "; +inline std::string build_ex_string(cudnnStatus_t stat, const std::string& msg) { + return msg + "\n [Hint: " + platform::dynload::cudnnGetErrorString(stat) + + "]"; } inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { @@ -637,39 +516,33 @@ inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { #endif } -/***** CUBLAS ERROR *****/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -inline const char* cublasGetErrorString(cublasStatus_t stat) { - switch (stat) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - default: - return "Unknown cublas status"; +inline std::string build_ex_string(cublasStatus_t stat, + const std::string& msg) { + std::string err; + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + err = "CUBLAS_STATUS_NOT_INITIALIZED"; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + err = "CUBLAS_STATUS_ALLOC_FAILED"; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + err = "CUBLAS_STATUS_INVALID_VALUE"; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + err = "CUBLAS_STATUS_ARCH_MISMATCH"; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + err = "CUBLAS_STATUS_MAPPING_ERROR"; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + err = "CUBLAS_STATUS_EXECUTION_FAILED"; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + err = "CUBLAS_STATUS_INTERNAL_ERROR"; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + err = "CUBLAS_STATUS_NOT_SUPPORTED"; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + err = "CUBLAS_STATUS_LICENSE_ERROR"; } -} - -inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cublasGetErrorString(stat) + " "; + return msg + "\n [Hint: " + err + "]"; } inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { @@ -680,15 +553,15 @@ inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { #endif } -/****** NCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { return nccl_result != ncclSuccess; } -inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::string msg(" Nccl error, "); - return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; +inline std::string build_ex_string(ncclResult_t nccl_result, + const std::string& msg) { + return msg + "\n [" + platform::dynload::ncclGetErrorString(nccl_result) + + "]"; } inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { @@ -698,8 +571,11 @@ inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { LOG(FATAL) << msg; #endif } -#endif // not(__APPLE__) and PADDLE_WITH_NCCL +#endif // __APPLE__ and windows + +#endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_CUDA 
namespace details { template @@ -722,28 +598,30 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details +#endif // PADDLE_WITH_CUDA -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::errors::External( \ - ::paddle::platform::build_nvidia_error_msg(__cond__)) \ - .ToString()); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#ifdef PADDLE_WITH_CUDA +#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CudaStatusType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, \ + ::paddle::platform::build_ex_string( \ + __cond__, \ + ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString())); \ + } catch (...) { \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + END_HANDLE_THE_ERROR \ + } \ + } \ } while (0) #undef DEFINE_CUDA_STATUS_TYPE diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index db77ba95856d9..0057c784528c2 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -261,14 +261,15 @@ TEST(EOF_EXCEPTION, THROW_EOF) { #ifdef PADDLE_WITH_CUDA template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); return true; } template -bool CheckCudaStatusFailure(T value, const std::string& msg) { +bool CheckCudaStatusFailure( + T value, const std::string& msg = "self-defined cuda status failed") { try { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); @@ -278,29 +279,24 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error")); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue)); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation)); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); - EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); - EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); + EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH)); + EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED)); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); - EXPECT_TRUE( - CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED)); + 
EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED)); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); - EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); - EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED)); + EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE)); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); - EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError)); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError)); #endif } #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c07abba9e8ef9..40d6bc54ccf92 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -41,13 +42,18 @@ faster way to query device properties. You can see details in https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ */ +inline std::string CudaErrorWebsite() { + return "Please see detail in https://docs.nvidia.com/cuda/cuda-runtime-api" + "/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c217824" + "6db0a94a430e0038"; +} + static int GetCUDADeviceCountImpl() { int driverVersion = 0; cudaError_t status = cudaDriverGetVersion(&driverVersion); if (!(status == cudaSuccess && driverVersion != 0)) { // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; return 0; } @@ -61,8 +67,14 @@ static int GetCUDADeviceCountImpl() { return 0; } } + int count; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count)); + auto error_code = cudaGetDeviceCount(&count); + PADDLE_ENFORCE( + error_code, + "cudaGetDeviceCount failed in " + "paddle::platform::GetCUDADeviceCountImpl, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } @@ -72,63 +84,72 @@ int GetCUDADeviceCount() { } int GetCUDAComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); auto minor_error_code = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); - PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code); - PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code); + PADDLE_ENFORCE_EQ( + major_error_code, 0, + "cudaDevAttrComputeCapabilityMajor failed in " + "paddle::platform::GetCUDAComputeCapability, error code : %d, %s", + major_error_code, CudaErrorWebsite()); + PADDLE_ENFORCE_EQ( + minor_error_code, 0, + "cudaDevAttrComputeCapabilityMinor failed in " + "paddle::platform::GetCUDAComputeCapability, error code : %d, %s", + minor_error_code, CudaErrorWebsite()); return major * 10 + minor; } dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); dim3 ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x); + PADDLE_ENFORCE_EQ(error_code_x, 0, + "cudaDevAttrMaxGridDimX failed in " + "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", + error_code_x, CudaErrorWebsite()); ret.x = size; auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y); + PADDLE_ENFORCE_EQ(error_code_y, 0, + "cudaDevAttrMaxGridDimY failed in " + "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", + error_code_y, CudaErrorWebsite()); ret.y = size; auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z); + PADDLE_ENFORCE_EQ(error_code_z, 0, + "cudaDevAttrMaxGridDimZ failed in " + "paddle::platform::GpuMaxGridDimSize, error code : %d, %s", + error_code_z, CudaErrorWebsite()); ret.z = size; return ret; } int GetCUDARuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int runtime_version = 0; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); + auto error_code = cudaRuntimeGetVersion(&runtime_version); + PADDLE_ENFORCE(error_code, + "cudaRuntimeGetVersion failed in " + "paddle::platform::GetCUDARuntimeVersion, error code : %d, %s", + error_code, CudaErrorWebsite()); return runtime_version; } int GetCUDADriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int driver_version = 0; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version)); + auto error_code = cudaDriverGetVersion(&driver_version); + PADDLE_ENFORCE(error_code, + "cudaDriverGetVersion failed in " + "paddle::platform::GetCUDADriverVersion, error code : %d, %s", + error_code, CudaErrorWebsite()); return driver_version; } @@ -143,44 +164,56 @@ bool TensorCoreAvailable() { } int GetCUDAMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); + auto error_code = + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id); + PADDLE_ENFORCE(error_code, + "cudaDeviceGetAttribute failed in " + "paddle::platform::GetCUDAMultiProcess, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } int GetCUDAMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); + auto error_code = cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id); + PADDLE_ENFORCE( + error_code, + "cudaDeviceGetAttribute failed in paddle::" + "platform::GetCUDAMaxThreadsPerMultiProcessor, error code : %d, %s", + error_code, CudaErrorWebsite()); return count; } int GetCUDAMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); + PADDLE_ENFORCE_LT( + id, GetCUDADeviceCount(), + platform::errors::InvalidArgument( + "Device id must less than GPU count, but received id is:%d, " + "GPU count is: %d.", + id, GetCUDADeviceCount())); int count; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); + auto error_code = + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id); + PADDLE_ENFORCE_EQ( + error_code, 0, + platform::errors::InvalidArgument( + "cudaDeviceGetAttribute returned error code should be 0, " + "but received error code is: %d, %s", + error_code, CudaErrorWebsite())); return count; } int GetCurrentDeviceId() { int device_id; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); + auto error_code = cudaGetDevice(&device_id); + PADDLE_ENFORCE(error_code, + "cudaGetDevice failed in " + "paddle::platform::GetCurrentDeviceId, error code : %d, %s", + error_code, CudaErrorWebsite()); return device_id; } @@ -204,12 +237,12 @@ std::vector GetSelectedDevices() { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id)); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + auto error_code = cudaSetDevice(id); + PADDLE_ENFORCE(error_code, + "cudaSetDevice failed in " + "paddle::platform::SetDeviced, error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemoryUsage(size_t *available, size_t *total) { @@ -273,44 +306,74 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); + auto error_code = cudaMemcpyAsync(dst, src, count, kind, stream); + PADDLE_ENFORCE(error_code, + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " + "(%p -> %p, length: %d) error code : %d, %s", + src, dst, static_cast(count), error_code, + CudaErrorWebsite()); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); + auto error_code = cudaMemcpy(dst, src, count, kind); + PADDLE_ENFORCE(error_code, + "cudaMemcpy failed in paddle::platform::GpuMemcpySync " + "(%p -> %p, length: %d) error code : %d, %s", + src, dst, static_cast(count), error_code, + CudaErrorWebsite()); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); + auto error_code = + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream); + PADDLE_ENFORCE( + error_code, + "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeer(dst, dst_device, src, src_device, count)); + auto error_code = cudaMemcpyPeer(dst, dst_device, src, src_device, count); + PADDLE_ENFORCE(error_code, + "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); + auto error_code = cudaMemsetAsync(dst, value, count, stream); + PADDLE_ENFORCE(error_code, + "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync " + "error code : %d, %s", + error_code, CudaErrorWebsite()); } void GpuStreamSync(cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + auto error_code = cudaStreamSynchronize(stream); + PADDLE_ENFORCE_CUDA_SUCCESS( + error_code, + platform::errors::External( + "cudaStreamSynchronize failed in paddle::platform::GpuStreamSync " + "error code : %d, %s", + error_code, CudaErrorWebsite())); } static void RaiseNonOutOfMemoryError(cudaError_t *status) { if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } + PADDLE_ENFORCE_CUDA_SUCCESS(*status); *status = cudaGetLastError(); if (*status == cudaErrorMemoryAllocation) { *status = cudaSuccess; } + PADDLE_ENFORCE_CUDA_SUCCESS(*status); } @@ -387,7 +450,8 @@ class RecordedCudaMallocHelper { CUDADeviceGuard guard(dev_id_); auto err = cudaFree(ptr); if (err != cudaErrorCudartUnloading) { - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS( + err, 
platform::errors::External("cudaFree raises unexpected error")); if (NeedRecord()) { std::lock_guard guard(*mtx_); cur_size_ -= size; diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index af27564b99f79..41d5180ffaf5f 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -117,7 +117,10 @@ void SynchronizeAllDevice() { int count = GetCUDADeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaDeviceSynchronize(), + platform::errors::External( + "Device synchronize failed in cudaDeviceSynchronize()")); } #endif } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 7a090ff7e5166..739892eafd824 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -30,10 +30,13 @@ bool CUDAStream::Init(const Place& place, const enum Priority& priority) { CUDADeviceGuard guard(boost::get(place_).device); if (priority == Priority::kHigh) { PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1)); + cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1), + platform::errors::Fatal("High priority cuda stream creation failed.")); } else if (priority == Priority::kNormal) { PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); + cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0), + platform::errors::Fatal( + "Normal priority cuda stream creation failed.")); } callback_manager_.reset(new StreamCallbackManager(stream_)); VLOG(3) << "CUDAStream Init stream: " << stream_ @@ -46,7 +49,9 @@ void CUDAStream::Destroy() { Wait(); WaitCallback(); if (stream_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamDestroy(stream_), + platform::errors::Fatal("Cuda stream destruction failed.")); } stream_ = nullptr; } @@ -62,7 +67,10 @@ void CUDAStream::Wait() const { } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); + PADDLE_ENFORCE_CUDA_SUCCESS( + e_sync, platform::errors::Fatal( + "cudaStreamSynchronize raises error: %s, errono: %d", + cudaGetErrorString(e_sync), static_cast(e_sync))); } } // namespace stream diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 57e763d527624..f7149f1e13098 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -53,15 +53,21 @@ class CUDAStream final { template void RecordEvent(cudaEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(ev, stream_), + platform::errors::Fatal("CUDA event recording failed.")); } void RecordEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(ev, stream_), + platform::errors::Fatal("CUDA event recording failed.")); } void WaitEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(stream_, ev, 0), + platform::errors::Fatal("Failed to wait event.")); } void Wait() const; diff --git a/python/setup.py.in b/python/setup.py.in index f6ec0fa26b984..906615e7efb42 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -215,14 +215,12 @@ if 
'${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: # the prefix is sys.prefix which should always be usr paddle_bins = '' - if not '${WIN32}': paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]} if '${HAS_NOAVX_CORE}' == 'ON': package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] - package_dir={ '': '${PADDLE_BINARY_DIR}/python', # The paddle.fluid.proto will be generated while compiling. @@ -333,7 +331,6 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) + list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) + - list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen @@ -405,9 +402,7 @@ class InstallHeaders(Command): return self.copy_file(header, install_dir) def run(self): - # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows if os.name == 'nt' or sys.platform == 'darwin': - self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb') return hdrs = self.distribution.headers if not hdrs: diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 3e079d0433f87..51330bea8ea62 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -172,8 +172,8 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` -VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` +ALL_PADDLE_CHECK=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true` +VALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' || true` INVALID_PADDLE_CHECK=`echo "$ALL_PADDLE_CHECK" |grep -vxF "$VALID_PADDLE_CHECK" || true` if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="The error message you wrote in PADDLE_ENFORCE{_**} or PADDLE_THROW does not meet our error message writing specification. Possible errors include 1. the error message is empty / 2. the error message is too short / 3. the error type is not specified. Please read the specification [ https://github.com/PaddlePaddle/Paddle/wiki/Paddle-Error-Message-Writing-Specification ], then refine the error message. 
If it is a mismatch, please specify chenwhql (Recommend), luotao1 or lanxianghit review and approve.\nThe PADDLE_ENFORCE{_**} or PADDLE_THROW entries that do not meet the specification are as follows:\n${INVALID_PADDLE_CHECK}\n" diff --git a/tools/count_invalid_enforce.sh b/tools/count_invalid_enforce.sh index fe99674f6bec6..a2dbc22119d75 100644 --- a/tools/count_invalid_enforce.sh +++ b/tools/count_invalid_enforce.sh @@ -30,9 +30,9 @@ ALL_PADDLE_CHECK_CNT=0 VALID_PADDLE_CHECK_CNT=0 function enforce_scan(){ - paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` + paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` total_check_cnt=`echo "$paddle_check" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` - valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` + valid_check_cnt=`echo "$paddle_check" | grep -zoE '(PADDLE_ENFORCE[A-Z_]*|PADDLE_THROW)\((.[^,;]+,)*.[^";]*(errors::).[^"]*".[^";]{20,}.[^;]*\);\s' | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` eval $2=$total_check_cnt eval $3=$valid_check_cnt } diff --git a/tools/cudaError/README.md b/tools/cudaError/README.md deleted file mode 100644 index df7434c33a9fd..0000000000000 --- a/tools/cudaError/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Usage: - -Please run: -``` -bash start.sh -``` - -The error message of CUDA9.0 / CUDA10.0 / CUDA-latest-version will be crawled by default. - -If you want to crawl a specified version of CUDA, Please run: -``` -bash start.sh -``` -URL can be derived by default, so you don't have to enter a URL. - -for example: -``` -bash start.sh 11.0 -``` -will capture error message of CUDA11.0(in future). - -Every time when Nvidia upgrade the CUDA major version, you need to run `bash start.sh` in current directory, and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py deleted file mode 100644 index c2c3dc97f4222..0000000000000 --- a/tools/cudaError/spider.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ssl -import re -import urllib2 -import json -import collections -import sys, getopt -import cuda_error_pb2 - - -def parsing(cuda_errorDesc, version, url): - All_Messages = cuda_errorDesc.AllMessages.add() - All_Messages.version = int(version) - - ssl._create_default_https_context = ssl._create_unverified_context - html = urllib2.urlopen(url).read() - res_div = r'
<div class="section">.*?<h2 class="title sectiontitle">CUDA error types</h2>.*?<div class="enum-members">(.*?)</div>'
-    m_div = re.findall(res_div, html, re.S | re.M)
-
-    url_list = url.split('/')
-    url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
-
-    dic = collections.OrderedDict()
-    dic_message = collections.OrderedDict()
-    for line in m_div:
-        res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
-        m_dt = re.findall(res_dt, line, re.S | re.M)
-        for error in m_dt:
-            res_type = r'<span class="ph ph apiData">(.*?)</span>'
-            m_type = re.findall(res_type, error[0], re.S | re.M)[0]
-            m_message = error[1]
-            m_message = m_message.replace('\n', '')
-            res_a = r'(<a class=.*?</a>)'
-            res_shape = r'<a class=.*?>(.*?)</a>'
-            list_a = re.findall(res_a, m_message, re.S | re.M)
-            list_shape = re.findall(res_shape, m_message, re.S | re.M)
-            assert len(list_a) == len(list_shape)
-            for idx in range(len(list_a)):
-                m_message = m_message.replace(list_a[idx], list_shape[idx])
-
-            m_message = m_message.replace(
-                '<h6 class=\"deprecated_header\">Deprecated</h6>', '')
-
-            res_span = r'(<span class=.*?</span>)'
-            res_span_detail = r'<span class=.*?>(.*?)</span>'
-            list_span = re.findall(res_span, m_message, re.S | re.M)
-            list_span_detail = re.findall(res_span_detail, m_message, re.S |
-                                          re.M)
-            assert len(list_span) == len(list_span_detail)
-            for idx in range(len(list_span)):
-                m_message = m_message.replace(list_span[idx],
-                                              list_span_detail[idx])
-
-            res_p = r'(<p class="p">.*?</p>)'
-            res_p_detail = r'<p class="p">(.*?)</p>'
-            list_p = re.findall(res_p, m_message, re.S | re.M)
-            list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
-            assert len(list_p) == len(list_p_detail)
-            for idx in range(len(list_p)):
-                m_message = m_message.replace(list_p[idx], list_p_detail[idx])
-
-            m_message = m_message.replace(' ', '')
-            _Messages = All_Messages.Messages.add()
-            try:
-                _Messages.errorCode = int(m_type)
-            except ValueError:
-                if re.match('0x', m_type):
-                    _Messages.errorCode = int(m_type, 16)
-                else:
-                    raise ValueError
-            _Messages.errorMessage = m_message  # save for cudaErrorMessage.pb from python-protobuf interface
-
-
-def main(argv):
-    version = []
-    url = []
-    try:
-        opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
-    except getopt.GetoptError:
-        print 'python spider.py -v <version> -u <url>'
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt in ("-h", "--help"):
-            print 'python spider.py -v <version> -u <url>'
-            sys.exit()
-        elif opt in ("-v", "--version"):
-            version = arg
-        elif opt in ("-u", "--url"):
-            url = arg
-    version = version.split(',')
-    url = url.split(',')
-    assert len(version) == len(url)
-    cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
-    for idx in range(len(version)):
-        if version[idx] == "-1":
-            print("crawling errorMessage for CUDA%s from %s" %
-                  ("-latest-version", url[idx]))
-        else:
-            print("crawling errorMessage for CUDA%s from %s" %
-                  (version[idx], url[idx]))
-        parsing(cuda_errorDesc, version[idx], url[idx])
-
-    serializeToString = cuda_errorDesc.SerializeToString()
-    with open("cudaErrorMessage.pb", "wb") as f:
-        f.write(serializeToString
-                )  # save for cudaErrorMessage.pb from python-protobuf interface
-    print("crawling errorMessage for CUDA has been done!!!")
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/tools/cudaError/start.sh b/tools/cudaError/start.sh
deleted file mode 100644
index 3c0e57ffe7ec1..0000000000000
--- a/tools/cudaError/start.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-set -ex
-SYSTEM=`uname -s`
-rm -f protoc-3.11.3-linux-x86_64.*
-if [ "$SYSTEM" == "Linux" ]; then
-    wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-linux-x86_64.zip
-    unzip -d protobuf -o protoc-3.11.3-linux-x86_64.zip
-    rm protoc-3.11.3-linux-x86_64.*
-elif [ "$SYSTEM" == "Darwin" ]; then
-    wget --no-check-certificate https://github.com/protocolbuffers/protobuf/releases/download/v3.11.3/protoc-3.11.3-osx-x86_64.zip
-    unzip -d protobuf -o protoc-3.11.3-osx-x86_64.zip
-    rm protoc-3.11.3-osx-x86_64.*
-else
-    echo "please run on Mac/Linux"
-    exit 1
-fi
-protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
-
-version=90,100,-1  # -1 represent the latest cuda-version
-url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
-
-if [ "$1" != "" ]; then
-    version=$version,$(($1*10))
-    if [ "$2" != "" ]; then
-        url=$url,$2
-    else
-        url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
-    fi
-fi
-
-python spider.py --version=$version --url=$url
-tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb
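
Reviewer notes:

The core of this revert is in the gpu_info.cc hunks: the single-argument `PADDLE_ENFORCE_CUDA_SUCCESS(...)` checks are replaced by the older pattern of capturing the `cudaError_t` first and then passing it to `PADDLE_ENFORCE` together with a hand-written message carrying the failing call, the numeric code, and the `CudaErrorWebsite()` link. The sketch below contrasts the two styles outside of Paddle; `CHECK_CUDA_SUCCESS` is a stand-in macro defined locally (not the Paddle macro), and `std::abort` stands in for the exception that `PADDLE_ENFORCE` would raise.

```cpp
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Stand-in for the macro style: the message is derived from the call itself
// and the runtime's own error string, so call sites stay one line long.
#define CHECK_CUDA_SUCCESS(call)                              \
  do {                                                        \
    cudaError_t err = (call);                                 \
    if (err != cudaSuccess) {                                 \
      std::fprintf(stderr, "%s failed: %s\n", #call,          \
                   cudaGetErrorString(err));                  \
      std::abort();                                           \
    }                                                         \
  } while (0)

int main() {
  // Restored style: capture the code, then format the message by hand,
  // embedding the numeric code and a pointer to the CUDA docs.
  int count = 0;
  cudaError_t error_code = cudaGetDeviceCount(&count);
  if (error_code != cudaSuccess) {
    std::fprintf(stderr,
                 "cudaGetDeviceCount failed, error code : %d, see "
                 "https://docs.nvidia.com/cuda/cuda-runtime-api/"
                 "group__CUDART__TYPES.html\n",
                 static_cast<int>(error_code));
    return 1;
  }

  CHECK_CUDA_SUCCESS(cudaSetDevice(0));  // macro style, for contrast
  std::printf("found %d CUDA device(s)\n", count);
  return 0;
}
```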
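
Most of the reverted helpers in gpu_info.cc (GetCUDAComputeCapability, GetCUDAMaxThreadsPerBlock, and friends) are thin wrappers over `cudaDeviceGetAttribute`. A minimal sketch of that pattern, assuming device 0 exists (the real helpers first range-check the id against `GetCUDADeviceCount()`):

```cpp
#include <cstdio>

#include <cuda_runtime.h>

int main() {
  const int id = 0;  // device id, assumed valid here
  int major = 0, minor = 0, threads = 0;
  // cudaSuccess is 0, so any nonzero return value is an error.
  if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id) ||
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id) ||
      cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, id)) {
    std::fprintf(stderr, "cudaDeviceGetAttribute failed\n");
    return 1;
  }
  // GetCUDAComputeCapability packs the two attributes as major * 10 + minor.
  std::printf("compute capability: %d, max threads per block: %d\n",
              major * 10 + minor, threads);
  return 0;
}
```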
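
The `RaiseNonOutOfMemoryError` change restores a filter that treats `cudaErrorMemoryAllocation` as recoverable (the allocator retries or falls back) while still failing fast on anything else, and that drains the runtime's sticky per-thread error with `cudaGetLastError()` so it cannot poison later calls. A standalone sketch of that logic, with `std::abort` again standing in for `PADDLE_ENFORCE_CUDA_SUCCESS`:

```cpp
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

void RaiseNonOutOfMemoryError(cudaError_t* status) {
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;  // OOM is recoverable; the caller handles it
  }
  if (*status != cudaSuccess) std::abort();

  *status = cudaGetLastError();  // consume the sticky error state
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
  if (*status != cudaSuccess) std::abort();
}

int main() {
  void* p = nullptr;
  // An oversized request that should fail with cudaErrorMemoryAllocation.
  cudaError_t status = cudaMalloc(&p, static_cast<size_t>(1) << 48);
  RaiseNonOutOfMemoryError(&status);  // swallows the OOM, aborts on others
  std::printf("status after filtering: %d\n", static_cast<int>(status));
  return 0;
}
```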
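
In the cuda_stream.cc hunk, `Priority::kHigh` maps to priority -1 and `Priority::kNormal` to 0, relying on the CUDA convention that numerically lower values are more urgent and 0 is the default. The valid range is device-dependent and can be queried rather than hardcoded; a sketch of that, assuming the patch's `kDefaultFlag` corresponds to `cudaStreamDefault`:

```cpp
#include <cstdio>

#include <cuda_runtime.h>

int main() {
  int least = 0, greatest = 0;
  // "greatest" is the most urgent priority and is numerically the smallest
  // value (for example -1); "least" is typically 0, the default priority.
  if (cudaDeviceGetStreamPriorityRange(&least, &greatest) != cudaSuccess) {
    std::fprintf(stderr, "failed to query stream priority range\n");
    return 1;
  }
  std::printf("stream priorities: least %d, greatest %d\n", least, greatest);

  cudaStream_t high = nullptr, normal = nullptr;
  cudaStreamCreateWithPriority(&high, cudaStreamDefault, greatest);
  cudaStreamCreateWithPriority(&normal, cudaStreamDefault, 0);

  cudaStreamDestroy(high);
  cudaStreamDestroy(normal);
  return 0;
}
```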