Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#63 from paddlebox-xpu/xpu_check_nan_inf
Browse files Browse the repository at this point in the history
Add XPU support for the check_nan_inf debugging flag (FLAGS_check_nan_inf), using the device-side xpu::check_nan_or_inf kernel instead of copying tensors to the CPU for scanning.
  • Loading branch information
xymyeah authored Apr 16, 2024
2 parents b6b2e43 + dfe6e10 commit e0f1c18
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 30 deletions.
4 changes: 2 additions & 2 deletions paddle/fluid/framework/boxps_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ void BoxPSWorker::TrainFiles() {
SyncParam();
}
}
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
if (FLAGS_check_nan_inf) {
// check nan result
if (framework::details::CheckBatchNanOrInfRet(place_)) {
Expand Down Expand Up @@ -892,7 +892,7 @@ void BoxPSWorker::TrainFilesWithProfiler() {
TRACE_SCOPE_END("ops run",);
#endif
cal_timer.Pause();
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
if (FLAGS_check_nan_inf) {
// check nan result
if (framework::details::CheckBatchNanOrInfRet(place_)) {
Expand Down
117 changes: 90 additions & 27 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
#endif
#include "paddle/fluid/framework/convert_utils.h"

#ifdef PADDLE_WITH_XPU
#include "xpu/refactor/math.h"
#endif

namespace paddle {
namespace framework {
namespace details {
Expand Down Expand Up @@ -391,22 +395,48 @@ void CheckVarHasNanOrInf(const std::string& op_type,
return;
}

float* cpu_data = new float[tensor->numel()];
// float* cpu_data = new float[tensor->numel()];
// memory::Copy(platform::CPUPlace(),
// static_cast<void*>(cpu_data),
// tensor->place(),
// static_cast<const void*>(tensor->data<float>()),
// tensor->numel() * sizeof(float));
// bool flag = false;
// for (int i = 0; i < tensor->numel(); i++) {
// if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
// flag = true;
// break;
// }
// }
// delete[] cpu_data;

using XPUType = typename XPUTypeTrait<float>::Type;
platform::XPUDeviceContext* dev_ctx = dynamic_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(tensor->place()));
const XPUType* x = reinterpret_cast<const XPUType*>(tensor->data<float>());

Tensor y_tensor;
bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
y_ptr,
tensor->numel());
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
"The check_nan_or_inf XPU OP return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
dev_ctx->Wait();

bool check_res = false;
bool* res_ptr = &check_res;
memory::Copy(platform::CPUPlace(),
static_cast<void*>(cpu_data),
tensor->place(),
static_cast<const void*>(tensor->data<float>()),
tensor->numel() * sizeof(float));
bool flag = false;
for (int i = 0; i < tensor->numel(); i++) {
if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
flag = true;
break;
}
}
delete[] cpu_data;
static_cast<void*>(res_ptr),
y_tensor.place(),
static_cast<const void*>(y_tensor.data<bool>()),
y_tensor.numel() * sizeof(bool));
VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
PADDLE_ENFORCE_NE(
flag,
check_res,
true,
platform::errors::Fatal(
"Operator %s output Tensor %s contains Inf.", op_type, var_name));
Expand Down Expand Up @@ -719,30 +749,62 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
#ifdef PADDLE_WITH_XPU
if (framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) {
LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
return;
}

float* cpu_data = new float[tensor->numel()];
// float* cpu_data = new float[tensor->numel()];
// memory::Copy(platform::CPUPlace(),
// static_cast<void*>(cpu_data),
// tensor->place(),
// static_cast<const void*>(tensor->data<float>()),
// tensor->numel() * sizeof(float));
// // bool flag = false;
// for (int64_t i = 0; i < tensor->numel(); i++) {
// if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
// get_cpu_nan_inf_num() ++;
// break;
// }
// }
// delete[] cpu_data;

using XPUType = typename XPUTypeTrait<float>::Type;
platform::XPUDeviceContext* dev_ctx = dynamic_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(tensor->place()));
const XPUType* x = reinterpret_cast<const XPUType*>(tensor->data<float>());

Tensor y_tensor;
bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
y_ptr,
tensor->numel());
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
"The check_nan_or_inf XPU OP return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
dev_ctx->Wait();

bool check_res = false;
bool* res_ptr = &check_res;
memory::Copy(platform::CPUPlace(),
static_cast<void*>(cpu_data),
tensor->place(),
static_cast<const void*>(tensor->data<float>()),
tensor->numel() * sizeof(float));
// bool flag = false;
for (int64_t i = 0; i < tensor->numel(); i++) {
if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
get_cpu_nan_inf_num() ++;
break;
}
static_cast<void*>(res_ptr),
y_tensor.place(),
static_cast<const void*>(y_tensor.data<bool>()),
y_tensor.numel() * sizeof(bool));
VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
if (check_res) {
get_cpu_nan_inf_num() ++;
}
delete[] cpu_data;
return;
#endif
}
#if defined(PADDLE_WITH_CUDA)
unsigned int* dnum = get_device_num_ptr(place);
CudaTensorCheckNanInf(*tensor, dnum);
#endif
}

bool CheckBatchNanOrInfRet(const platform::Place& place) {
if (!platform::is_gpu_place(place)) {
return (get_cpu_nan_inf_num() > 0);
Expand Down Expand Up @@ -829,9 +891,10 @@ void DumpTensorToFile(const std::string& path, const std::string& prefix,
out.write(s.c_str(), s.length());
out.close();
}

void DumpAllScope(const Scope& exec_scope, const platform::Place& place) {
int device_id = 0;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)) && !defined(_WIN32)
device_id = place.GetDeviceId();
#endif
VLOG(0) << "begin dump scope all tensor data, device id=" << device_id;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1760,7 +1760,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
}

if (FLAGS_check_nan_inf) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
if (framework::details::CheckOpHasNanOrInfRet(*this, exec_scope, place)) {
framework::details::DumpAllScope(exec_scope, place);
// dump current op data
Expand Down
6 changes: 6 additions & 0 deletions paddle/fluid/platform/device/xpu/xpu2_op_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,12 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"fused_concat_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"load",
XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace()),
pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::INT8, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
};
return s_xpu2_kernels;
}
Expand Down

0 comments on commit e0f1c18

Please sign in to comment.