Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#55 from tiancaitzp/paddlebox
Browse files Browse the repository at this point in the history
Optimize host-side time of the fused_seqpool_cvm ops.
  • Loading branch information
tiancaitzp authored Feb 28, 2024
2 parents c92e181 + d906074 commit 555d9c8
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 23 deletions.
24 changes: 16 additions & 8 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,11 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel<T> {
}
for (int i = 0; i < slot_num; i++) {
out[i]->Resize({static_cast<int64_t>(bs), y_dims[1]});
out[i]->set_lod(y_lod);
}
//TODO:r480 l3 have some thing wrong
static bool use_l3_tensor = std::getenv("XPU_PADDLE_L3_TENSOR")!=NULL ?
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
auto place = ctx.GetPlace();
phi::Place l3_place = ctx.template device_context<DeviceContext>().GetL3Place();
int w = ins[0]->numel() / x0_dims[0];
Expand All @@ -115,11 +114,16 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel<T> {
"The output of dims[1] should be dividable of (w-2)"));
}

std::vector<const T*> cpu_x_addr_vec(slot_num, 0);
std::vector<T*> cpu_y_addr_vec(slot_num, 0);
std::vector<const T*> cpu_x_addr_vec;
cpu_x_addr_vec.reserve(slot_num);
std::vector<T*> cpu_y_addr_vec;
cpu_y_addr_vec.reserve(slot_num);

unsigned int sum_lod_size = slot_num * (bs + 1);
std::vector<int> cpu_lodx(sum_lod_size);
std::vector<int> cpu_lodx;
cpu_lodx.reserve(sum_lod_size);
unsigned int lod_index = 0;

for (int i = 0; i < slot_num; i++) {
cpu_x_addr_vec[i] = reinterpret_cast<const T*>(ins[i]->data<T>());
if(use_l3_tensor) {
Expand All @@ -128,10 +132,14 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel<T> {
cpu_y_addr_vec[i] = reinterpret_cast<T*>(out[i]->mutable_data<T>(place));
}
auto x_lod = ins[i]->lod()[0];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t j = 0; j < x_lod.size(); j++) {
cpu_lodx[lod_index + j] = x_lod[j];
}
lod_index += x_lod.size();

lod_index += x_lod.size();
}
#ifdef TRACE_PROFILE
TRACE_SCOPE_START("xpu::sequence_sum_pool_cvm", xpu_wait(xpu_context->xpu_stream););
Expand Down Expand Up @@ -239,7 +247,7 @@ class FusedSeqpoolCVMGradOpXPUKernel : public framework::OpKernel<T> {
item_size,
batch_size,
slot_num,
embed_thres_size);
embed_thres_size);

PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
Expand Down
22 changes: 15 additions & 7 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op_xpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,11 @@ class FusedSeqpoolCVMWithConvOpXPUKernel : public framework::OpKernel<T> {
}
for (int i = 0; i < slot_num; i++) {
out[i]->Resize({static_cast<int64_t>(bs), y_dims[1]});
out[i]->set_lod(y_lod);
}
//TODO:r480 l3 have some thing wrong
static bool use_l3_tensor = std::getenv("XPU_PADDLE_L3_TENSOR")!=NULL ?
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
auto place = ctx.GetPlace();
phi::Place l3_place = ctx.template device_context<DeviceContext>().GetL3Place();
int w = ins[0]->numel() / x0_dims[0];
Expand All @@ -87,11 +86,16 @@ class FusedSeqpoolCVMWithConvOpXPUKernel : public framework::OpKernel<T> {
"The output of dims[1] should be dividable of (w-2)"));
}

std::vector<const T*> cpu_x_addr_vec(slot_num, 0);
std::vector<T*> cpu_y_addr_vec(slot_num, 0);
std::vector<const T*> cpu_x_addr_vec;
cpu_x_addr_vec.reserve(slot_num);
std::vector<T*> cpu_y_addr_vec;
cpu_y_addr_vec.reserve(slot_num);

unsigned int sum_lod_size = slot_num * (bs + 1);
std::vector<int> cpu_lodx(sum_lod_size);
std::vector<int> cpu_lodx;
cpu_lodx.reserve(sum_lod_size);
unsigned int lod_index = 0;

for (int i = 0; i < slot_num; i++) {
cpu_x_addr_vec[i] = reinterpret_cast<const T*>(ins[i]->data<T>());
if(use_l3_tensor) {
Expand All @@ -100,10 +104,14 @@ class FusedSeqpoolCVMWithConvOpXPUKernel : public framework::OpKernel<T> {
cpu_y_addr_vec[i] = reinterpret_cast<T*>(out[i]->mutable_data<T>(place));
}
auto x_lod = ins[i]->lod()[0];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t j = 0; j < x_lod.size(); j++) {
cpu_lodx[lod_index + j] = x_lod[j];
}
lod_index += x_lod.size();

lod_index += x_lod.size();
}

#ifdef TRACE_PROFILE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
auto threshold_vec_param = ctx.Attr<std::vector<float>>("threshold_vec");
std::vector<float> threshold_vec(threshold_vec_param.begin(), threshold_vec_param.end());


// VLOG(0) << "threshold_vec.size=" << threshold_vec.size();
// for(int i=0; i<threshold_vec.size(); ++i) {
// VLOG(0) << "i=" << i << ", threshold=" << threshold_vec[i];
Expand All @@ -74,14 +75,19 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
for (size_t i = 0; i <= bs; ++i) {
y_lod[0][i] = i;
}

for (int i = 0; i < slot_num; i++) {
out[i]->Resize({static_cast<int64_t>(bs), y_dims[1]});
out[i]->set_lod(y_lod);
}
//TODO:r480 l3 have some thing wrong

// struct timeval af_set_var;
// gettimeofday(&af_set_var, NULL);

// TODO:r480 l3 have some thing wrong
static bool use_l3_tensor = std::getenv("XPU_PADDLE_L3_TENSOR")!=NULL ?
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;

auto place = ctx.GetPlace();
phi::Place l3_place = ctx.template device_context<DeviceContext>().GetL3Place();
int w = ins[0]->numel() / x0_dims[0];
Expand All @@ -97,11 +103,16 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
"The output of dims[1] should be dividable of (w-2)"));
}

std::vector<const T*> cpu_x_addr_vec(slot_num, 0);
std::vector<T*> cpu_y_addr_vec(slot_num, 0);
std::vector<const T*> cpu_x_addr_vec;
cpu_x_addr_vec.reserve(slot_num);
std::vector<T*> cpu_y_addr_vec;
cpu_y_addr_vec.reserve(slot_num);

unsigned int sum_lod_size = slot_num * (bs + 1);
std::vector<int> cpu_lodx(sum_lod_size);
std::vector<int> cpu_lodx;
cpu_lodx.reserve(sum_lod_size);
unsigned int lod_index = 0;

for (int i = 0; i < slot_num; i++) {
cpu_x_addr_vec[i] = reinterpret_cast<const T*>(ins[i]->data<T>());
if(use_l3_tensor) {
Expand All @@ -110,10 +121,14 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
cpu_y_addr_vec[i] = reinterpret_cast<T*>(out[i]->mutable_data<T>(place));
}
auto x_lod = ins[i]->lod()[0];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t j = 0; j < x_lod.size(); j++) {
cpu_lodx[lod_index + j] = x_lod[j];
}
lod_index += x_lod.size();

lod_index += x_lod.size();
}

#ifdef TRACE_PROFILE
Expand Down

0 comments on commit 555d9c8

Please sign in to comment.