Skip to content

Commit

Permalink
Revert 'Optimize the error messages of paddle CUDA API (PaddlePaddle#…
Browse files Browse the repository at this point in the history
…23816)',test=develop
  • Loading branch information
zhwesky2010 committed Apr 21, 2020
1 parent 89cf4f9 commit 6f59331
Show file tree
Hide file tree
Showing 30 changed files with 496 additions and 645 deletions.
15 changes: 4 additions & 11 deletions cmake/inference_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,6 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})

set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR}
DSTS ${dst_dir})

# CMakeCache Info
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
Expand Down Expand Up @@ -190,7 +184,7 @@ copy(fluid_lib_dist
)

set(module "framework")
set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
set(framework_lib_deps framework_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
Expand All @@ -210,11 +204,11 @@ copy(fluid_lib_dist
)

set(module "platform")
set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
set(platform_lib_deps profiler_proto)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
)

set(module "string")
Expand Down Expand Up @@ -255,7 +249,6 @@ copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)


# CMakeCache Info
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
Expand Down
35 changes: 2 additions & 33 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

include(ExternalProject)
# Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac)

set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
Expand All @@ -22,7 +21,6 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING
"A path cache third party source code to avoid repeated download.")

set(THIRD_PARTY_BUILD_TYPE Release)
set(third_party_deps)

# cache funciton to avoid repeat download code of third_party.
# This function has 4 parameters, URL / REPOSITOR / TAG / DIR:
Expand Down Expand Up @@ -102,32 +100,6 @@ MACRO(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME})
ENDMACRO()

# Funciton to Download the dependencies during compilation
# This function has 2 parameters, URL / DIRNAME:
# 1. URL: The download url of 3rd dependencies
# 2. NAME: The name of file, that determin the dirname
#
MACRO(file_download_and_uncompress URL NAME)
MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${THIRD_PARTY_PATH}/${NAME}
URL ${URL}
DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
ENDMACRO()


# Correction of flags on different Platform(WIN/MAC) and Print Warning Message
if (APPLE)
if(WITH_MKL)
Expand Down Expand Up @@ -206,13 +178,10 @@ include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc

set(third_party_deps)
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)

# download file
set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")

if(WITH_AMD_GPU)
include(external/rocprim) # download, build, install rocprim
list(APPEND third_party_deps extern_rocprim)
Expand Down Expand Up @@ -305,4 +274,4 @@ if (WITH_LITE)
include(external/lite)
endif (WITH_LITE)

add_custom_target(third_party ALL DEPENDS ${third_party_deps})
add_custom_target(third_party DEPENDS ${third_party_deps})
4 changes: 3 additions & 1 deletion paddle/fluid/framework/details/nan_inf_utils_detail.cu
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,9 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(

PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
cudaMemcpyHostToDevice, dev_ctx->stream()));
cudaMemcpyHostToDevice, dev_ctx->stream()),
platform::errors::External(
"Async cudaMemcpy op_var info to gpu failed."));
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
Expand Down
27 changes: 18 additions & 9 deletions paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,12 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs_.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));

int outer_rows = outer_rows_ * batchSize;

Expand Down Expand Up @@ -241,9 +244,12 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(float*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));

split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
Expand All @@ -257,9 +263,12 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(output_ptrs, h_odatas,
d_output_ptrs.size() * sizeof(half*),
cudaMemcpyHostToDevice, stream),
platform::errors::External(
"CUDA Memcpy failed during split plugin run."));

split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
Expand Down
15 changes: 11 additions & 4 deletions paddle/fluid/memory/allocation/cuda_device_context_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,17 @@ class CUDADeviceContextAllocator : public Allocator {
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming));
cudaEventCreate(&event_, cudaEventDisableTiming),
platform::errors::External(
"Create event failed in CUDADeviceContextAllocator"));
}

~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event_),
"Destory event failed in CUDADeviceContextAllocator destroctor");
}
}

Expand All @@ -99,9 +103,12 @@ class CUDADeviceContextAllocator : public Allocator {
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0));
cudaEventRecord(event_, default_stream_),
"Failed to record event in CUDADeviceContextAllocator");
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0),
"Failed to wait event in CUDADeviceContextAllocator");
return allocation;
}

Expand Down
14 changes: 12 additions & 2 deletions paddle/fluid/operators/argsort_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,12 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
}
PADDLE_ENFORCE_CUDA_SUCCESS(err);
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
"temp_storage_bytes, status:%s.",
temp_storage_bytes, cudaGetErrorString(err));

Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
Expand All @@ -160,7 +165,12 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
cu_stream);
}

PADDLE_ENFORCE_CUDA_SUCCESS(err);
PADDLE_ENFORCE_CUDA_SUCCESS(
err,
"ArgSortOP failed as could not launch "
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
"temp_storage_bytes:%d status:%s.",
temp_storage_bytes, cudaGetErrorString(err));
}

template <typename T, typename IndType>
Expand Down
Loading

0 comments on commit 6f59331

Please sign in to comment.