Optimize the error messages of paddle CUDA API #23816

Merged: 7 commits, Apr 20, 2020
cmake/inference_lib.cmake (15 changes: 11 additions & 4 deletions)
@@ -135,6 +135,12 @@ copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})

+set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+copy(inference_lib_dist
+        SRCS ${cudaerror_INCLUDE_DIR}
+        DSTS ${dst_dir})

# CMakeCache Info
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
@@ -184,7 +190,7 @@ copy(fluid_lib_dist
)

set(module "framework")
-set(framework_lib_deps framework_proto)
+set(framework_lib_deps framework_proto data_feed_proto trainer_desc_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/trainer_desc.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
@@ -204,11 +210,11 @@ copy(fluid_lib_dist
)

set(module "platform")
-set(platform_lib_deps profiler_proto)
+set(platform_lib_deps profiler_proto error_codes_proto cuda_error_proto)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
copy(fluid_lib_dist
-    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/error_codes.pb.h
-    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}
+    SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
+    DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
)

set(module "string")
@@ -249,6 +255,7 @@ copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)


# CMakeCache Info
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
cmake/third_party.cmake (35 changes: 33 additions & 2 deletions)
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+include(ExternalProject)
# Create a target named "third_party", which can compile external dependencies on all platforms (windows/linux/mac)

set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -21,6 +22,7 @@ set(THIRD_PARTY_CACHE_PATH "${CMAKE_SOURCE_DIR}" CACHE STRING
"A path cache third party source code to avoid repeated download.")

set(THIRD_PARTY_BUILD_TYPE Release)
+set(third_party_deps)

# cache function to avoid repeated downloads of third_party code.
# This function has 4 parameters, URL / REPOSITORY / TAG / DIR:
@@ -100,6 +102,32 @@ MACRO(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME})
ENDMACRO()

+# Function to download dependencies during compilation.
+# This macro has 2 parameters, URL / NAME:
+# 1. URL: the download url of the 3rd-party dependency
+# 2. NAME: the name of the file, which determines the directory name
+#
+MACRO(file_download_and_uncompress URL NAME)
+  MESSAGE(STATUS "Download dependency [${NAME}] from ${URL}")
+  SET(EXTERNAL_PROJECT_NAME "extern_download_${NAME}")
+  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data)
+  ExternalProject_Add(
+    ${EXTERNAL_PROJECT_NAME}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX            ${THIRD_PARTY_PATH}/${NAME}
+    URL               ${URL}
+    DOWNLOAD_DIR      ${THIRD_PARTY_PATH}/${NAME}/data/
+    SOURCE_DIR        ${THIRD_PARTY_PATH}/${NAME}/data/
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    UPDATE_COMMAND    ""
+    INSTALL_COMMAND   ""
+  )
+  list(APPEND third_party_deps ${EXTERNAL_PROJECT_NAME})
+ENDMACRO()
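For illustration, a hypothetical use of this macro and how its two outputs are consumed might look like the following (the URL and "somedep" name are placeholders, not part of this PR):

    # Hypothetical example: download an archive and stage its contents.
    set(SOMEDEP_URL "https://example.com/somedep.tar.gz" CACHE STRING "" FORCE)
    file_download_and_uncompress(${SOMEDEP_URL} "somedep")
    # The macro defines ${somedep_INCLUDE_DIR} -> ${THIRD_PARTY_PATH}/somedep/data
    # and appends extern_download_somedep to ${third_party_deps}, so a later
    # copy() rule (as in inference_lib.cmake above) can stage the unpacked data:
    copy(inference_lib_dist
            SRCS ${somedep_INCLUDE_DIR}
            DSTS ${dst_dir})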


# Correction of flags on different Platform(WIN/MAC) and Print Warning Message
if (APPLE)
if(WITH_MKL)
@@ -178,10 +206,13 @@ include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc

-set(third_party_deps)
list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)

+# download file
+set(CUDAERROR_URL "https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
+file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")

if(WITH_AMD_GPU)
include(external/rocprim) # download, build, install rocprim
list(APPEND third_party_deps extern_rocprim)
@@ -274,4 +305,4 @@ if (WITH_LITE)
include(external/lite)
endif (WITH_LITE)

-add_custom_target(third_party DEPENDS ${third_party_deps})
+add_custom_target(third_party ALL DEPENDS ${third_party_deps})
paddle/fluid/framework/details/nan_inf_utils_detail.cu (4 changes: 1 addition & 3 deletions)
@@ -152,9 +152,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(

PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
-                    cudaMemcpyHostToDevice, dev_ctx->stream()),
-    platform::errors::External(
-        "Async cudaMemcpy op_var info to gpu failed."));
+                    cudaMemcpyHostToDevice, dev_ctx->stream()));
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
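The simplified call sites in this PR work because PADDLE_ENFORCE_CUDA_SUCCESS now derives its message from the cudaError_t itself, so per-call message arguments become redundant. A minimal sketch of that idea (an illustration with an assumed macro name, not Paddle's actual implementation):

    // Minimal sketch: build the error message from the CUDA status code
    // instead of requiring one at every call site. Illustrative only.
    #include <cuda_runtime.h>
    #include <sstream>
    #include <stdexcept>

    #define ENFORCE_CUDA_SUCCESS(expr)                                  \
      do {                                                              \
        cudaError_t cuda_status = (expr);                               \
        if (cuda_status != cudaSuccess) {                               \
          std::ostringstream oss;                                       \
          oss << "CUDA error " << static_cast<int>(cuda_status) << " (" \
              << cudaGetErrorString(cuda_status) << ") at "             \
              << __FILE__ << ":" << __LINE__;                           \
          throw std::runtime_error(oss.str());                          \
        }                                                               \
      } while (0)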
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu (27 changes: 9 additions & 18 deletions)
@@ -124,12 +124,9 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaMemcpyAsync(output_ptrs, h_odatas,
-                      d_output_ptrs_.size() * sizeof(float*),
-                      cudaMemcpyHostToDevice, stream),
-      platform::errors::External(
-          "CUDA Memcpy failed during split plugin run."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
+      cudaMemcpyHostToDevice, stream));

int outer_rows = outer_rows_ * batchSize;

@@ -244,12 +241,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemcpyAsync(output_ptrs, h_odatas,
-                        d_output_ptrs.size() * sizeof(float*),
-                        cudaMemcpyHostToDevice, stream),
-        platform::errors::External(
-            "CUDA Memcpy failed during split plugin run."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+        output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
+        cudaMemcpyHostToDevice, stream));

split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
@@ -263,12 +257,9 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
half* const* h_odatas = reinterpret_cast<half* const*>(outputs);
half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);

-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaMemcpyAsync(output_ptrs, h_odatas,
-                        d_output_ptrs.size() * sizeof(half*),
-                        cudaMemcpyHostToDevice, stream),
-        platform::errors::External(
-            "CUDA Memcpy failed during split plugin run."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
+        output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*),
+        cudaMemcpyHostToDevice, stream));

split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
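For context, the cudaMemcpyAsync calls guarded above stage a host-side array of per-output device pointers on the GPU so a single split kernel can scatter rows into all outputs. A standalone sketch of that pattern (the helper and its names are assumptions for illustration, not code from this PR):

    #include <thrust/device_vector.h>
    #include <cuda_runtime.h>

    // Sketch: stage a host array of per-output device pointers on the GPU
    // so one kernel launched on `stream` can write to several buffers.
    void stage_output_ptrs(float** h_output_ptrs, int num_outputs,
                           thrust::device_vector<float*>* d_output_ptrs,
                           cudaStream_t stream) {
      d_output_ptrs->resize(num_outputs);
      float** d_ptrs = thrust::raw_pointer_cast(d_output_ptrs->data());
      // Async copy: kernels enqueued later on `stream` see the pointers.
      cudaMemcpyAsync(d_ptrs, h_output_ptrs, num_outputs * sizeof(float*),
                      cudaMemcpyHostToDevice, stream);
    }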
paddle/fluid/memory/allocation/cuda_device_context_allocator.h (15 changes: 4 additions & 11 deletions)
@@ -80,17 +80,13 @@ class CUDADeviceContextAllocator : public Allocator {
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventCreate(&event_, cudaEventDisableTiming),
-        platform::errors::External(
-            "Create event failed in CUDADeviceContextAllocator"));
+        cudaEventCreate(&event_, cudaEventDisableTiming));
}

~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          cudaEventDestroy(event_),
-          "Destory event failed in CUDADeviceContextAllocator destroctor");
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
}

@@ -103,12 +99,9 @@ class CUDADeviceContextAllocator : public Allocator {
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaEventRecord(event_, default_stream_),
-        "Failed to record event in CUDADeviceContextAllocator");
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        cudaStreamWaitEvent(default_stream_, event_, 0),
-        "Failed to wait event in CUDADeviceContextAllocator");
+        cudaStreamWaitEvent(default_stream_, event_, 0));
return allocation;
}

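The Allocate path above orders work with an event rather than a device-wide synchronization. A standalone sketch of the record/wait pattern it relies on (function and stream names are placeholders for illustration):

    #include <cuda_runtime.h>

    // Sketch: order stream_b after work already queued on stream_a
    // using an event, without blocking the host thread.
    void order_after(cudaStream_t stream_a, cudaStream_t stream_b) {
      cudaEvent_t done;
      cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
      cudaEventRecord(done, stream_a);         // completion marker on stream_a
      cudaStreamWaitEvent(stream_b, done, 0);  // stream_b enqueues a wait
      cudaEventDestroy(done);  // safe: destruction is deferred until the wait clears
    }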
paddle/fluid/operators/argsort_op.cu (14 changes: 2 additions & 12 deletions)
@@ -141,12 +141,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8,
cu_stream);
}
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate"
-      "temp_storage_bytes, status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);

Tensor temp_storage;
temp_storage.mutable_data<uint8_t>(ctx.GetPlace(), temp_storage_bytes);
@@ -165,12 +160,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input,
cu_stream);
}

-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      err,
-      "ArgSortOP failed as could not launch "
-      "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
-      "temp_storage_bytes:%d status:%s.",
-      temp_storage_bytes, cudaGetErrorString(err));
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
}

template <typename T, typename IndType>
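Both enforce sites above guard the same two-phase cub idiom: the first SortPairsDescending call runs with a null workspace only to compute temp_storage_bytes, and the second performs the sort. A simplified sketch of that idiom (a hypothetical helper with placeholder buffers, not the operator's full code):

    #include <cub/cub.cuh>

    // Sketch of cub's size-query-then-sort pattern. In real code the
    // workspace should be freed only after the async sort completes.
    template <typename KeyT, typename ValueT>
    cudaError_t segmented_sort_desc(const KeyT* keys_in, KeyT* keys_out,
                                    const ValueT* vals_in, ValueT* vals_out,
                                    int num_items, int num_segments,
                                    const int* offsets, cudaStream_t stream) {
      size_t temp_bytes = 0;
      // Phase 1: null workspace -> cub only reports the bytes it needs.
      cudaError_t err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
          nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out,
          num_items, num_segments, offsets, offsets + 1,
          0, sizeof(KeyT) * 8, stream);
      if (err != cudaSuccess) return err;
      void* d_temp = nullptr;
      cudaMalloc(&d_temp, temp_bytes);
      // Phase 2: the same call with a real workspace performs the sort.
      err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
          d_temp, temp_bytes, keys_in, keys_out, vals_in, vals_out,
          num_items, num_segments, offsets, offsets + 1,
          0, sizeof(KeyT) * 8, stream);
      cudaFree(d_temp);
      return err;
    }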