Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#55 from tiancaitzp/paddlebox
Browse files Browse the repository at this point in the history
Optimize host-side time of the fused_seqpool_cvm ops.
  • Loading branch information
tiancaitzp authored Feb 28, 2024
2 parents c92e181 + d906074 commit 555d9c8
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 23 deletions.
24 changes: 16 additions & 8 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_op_xpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,11 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel<T> {
}
for (int i = 0; i < slot_num; i++) {
out[i]->Resize({static_cast<int64_t>(bs), y_dims[1]});
out[i]->set_lod(y_lod);
}
//TODO:r480 l3 have some thing wrong
static bool use_l3_tensor = std::getenv("XPU_PADDLE_L3_TENSOR")!=NULL ?
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
auto place = ctx.GetPlace();
phi::Place l3_place = ctx.template device_context<DeviceContext>().GetL3Place();
int w = ins[0]->numel() / x0_dims[0];
Expand All @@ -115,11 +114,16 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel<T> {
"The output of dims[1] should be dividable of (w-2)"));
}

std::vector<const T*> cpu_x_addr_vec(slot_num, 0);
std::vector<T*> cpu_y_addr_vec(slot_num, 0);
std::vector<const T*> cpu_x_addr_vec;
cpu_x_addr_vec.reserve(slot_num);
std::vector<T*> cpu_y_addr_vec;
cpu_y_addr_vec.reserve(slot_num);

unsigned int sum_lod_size = slot_num * (bs + 1);
std::vector<int> cpu_lodx(sum_lod_size);
std::vector<int> cpu_lodx;
cpu_lodx.reserve(sum_lod_size);
unsigned int lod_index = 0;

for (int i = 0; i < slot_num; i++) {
cpu_x_addr_vec[i] = reinterpret_cast<const T*>(ins[i]->data<T>());
if(use_l3_tensor) {
Expand All @@ -128,10 +132,14 @@ class FusedSeqpoolCVMOpXPUKernel : public framework::OpKernel<T> {
cpu_y_addr_vec[i] = reinterpret_cast<T*>(out[i]->mutable_data<T>(place));
}
auto x_lod = ins[i]->lod()[0];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t j = 0; j < x_lod.size(); j++) {
cpu_lodx[lod_index + j] = x_lod[j];
}
lod_index += x_lod.size();

lod_index += x_lod.size();
}
#ifdef TRACE_PROFILE
TRACE_SCOPE_START("xpu::sequence_sum_pool_cvm", xpu_wait(xpu_context->xpu_stream););
Expand Down Expand Up @@ -239,7 +247,7 @@ class FusedSeqpoolCVMGradOpXPUKernel : public framework::OpKernel<T> {
item_size,
batch_size,
slot_num,
embed_thres_size);
embed_thres_size);

PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
Expand Down
22 changes: 15 additions & 7 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op_xpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,11 @@ class FusedSeqpoolCVMWithConvOpXPUKernel : public framework::OpKernel<T> {
}
for (int i = 0; i < slot_num; i++) {
out[i]->Resize({static_cast<int64_t>(bs), y_dims[1]});
out[i]->set_lod(y_lod);
}
//TODO:r480 l3 have some thing wrong
static bool use_l3_tensor = std::getenv("XPU_PADDLE_L3_TENSOR")!=NULL ?
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
auto place = ctx.GetPlace();
phi::Place l3_place = ctx.template device_context<DeviceContext>().GetL3Place();
int w = ins[0]->numel() / x0_dims[0];
Expand All @@ -87,11 +86,16 @@ class FusedSeqpoolCVMWithConvOpXPUKernel : public framework::OpKernel<T> {
"The output of dims[1] should be dividable of (w-2)"));
}

std::vector<const T*> cpu_x_addr_vec(slot_num, 0);
std::vector<T*> cpu_y_addr_vec(slot_num, 0);
std::vector<const T*> cpu_x_addr_vec;
cpu_x_addr_vec.reserve(slot_num);
std::vector<T*> cpu_y_addr_vec;
cpu_y_addr_vec.reserve(slot_num);

unsigned int sum_lod_size = slot_num * (bs + 1);
std::vector<int> cpu_lodx(sum_lod_size);
std::vector<int> cpu_lodx;
cpu_lodx.reserve(sum_lod_size);
unsigned int lod_index = 0;

for (int i = 0; i < slot_num; i++) {
cpu_x_addr_vec[i] = reinterpret_cast<const T*>(ins[i]->data<T>());
if(use_l3_tensor) {
Expand All @@ -100,10 +104,14 @@ class FusedSeqpoolCVMWithConvOpXPUKernel : public framework::OpKernel<T> {
cpu_y_addr_vec[i] = reinterpret_cast<T*>(out[i]->mutable_data<T>(place));
}
auto x_lod = ins[i]->lod()[0];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t j = 0; j < x_lod.size(); j++) {
cpu_lodx[lod_index + j] = x_lod[j];
}
lod_index += x_lod.size();

lod_index += x_lod.size();
}

#ifdef TRACE_PROFILE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
auto threshold_vec_param = ctx.Attr<std::vector<float>>("threshold_vec");
std::vector<float> threshold_vec(threshold_vec_param.begin(), threshold_vec_param.end());


// VLOG(0) << "threshold_vec.size=" << threshold_vec.size();
// for(int i=0; i<threshold_vec.size(); ++i) {
// VLOG(0) << "i=" << i << ", threshold=" << threshold_vec[i];
Expand All @@ -74,14 +75,19 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
for (size_t i = 0; i <= bs; ++i) {
y_lod[0][i] = i;
}

for (int i = 0; i < slot_num; i++) {
out[i]->Resize({static_cast<int64_t>(bs), y_dims[1]});
out[i]->set_lod(y_lod);
}
//TODO:r480 l3 have some thing wrong

// struct timeval af_set_var;
// gettimeofday(&af_set_var, NULL);

// TODO:r480 l3 have some thing wrong
static bool use_l3_tensor = std::getenv("XPU_PADDLE_L3_TENSOR")!=NULL ?
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;
(std::strcmp(std::getenv("XPU_PADDLE_L3_TENSOR"), "1") == 0 ? true:false) :
false;

auto place = ctx.GetPlace();
phi::Place l3_place = ctx.template device_context<DeviceContext>().GetL3Place();
int w = ins[0]->numel() / x0_dims[0];
Expand All @@ -97,11 +103,16 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
"The output of dims[1] should be dividable of (w-2)"));
}

std::vector<const T*> cpu_x_addr_vec(slot_num, 0);
std::vector<T*> cpu_y_addr_vec(slot_num, 0);
std::vector<const T*> cpu_x_addr_vec;
cpu_x_addr_vec.reserve(slot_num);
std::vector<T*> cpu_y_addr_vec;
cpu_y_addr_vec.reserve(slot_num);

unsigned int sum_lod_size = slot_num * (bs + 1);
std::vector<int> cpu_lodx(sum_lod_size);
std::vector<int> cpu_lodx;
cpu_lodx.reserve(sum_lod_size);
unsigned int lod_index = 0;

for (int i = 0; i < slot_num; i++) {
cpu_x_addr_vec[i] = reinterpret_cast<const T*>(ins[i]->data<T>());
if(use_l3_tensor) {
Expand All @@ -110,10 +121,14 @@ class FusedSeqpoolCVMWithDiffThresOpXPUKernel : public framework::OpKernel<T> {
cpu_y_addr_vec[i] = reinterpret_cast<T*>(out[i]->mutable_data<T>(place));
}
auto x_lod = ins[i]->lod()[0];
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t j = 0; j < x_lod.size(); j++) {
cpu_lodx[lod_index + j] = x_lod[j];
}
lod_index += x_lod.size();

lod_index += x_lod.size();
}

#ifdef TRACE_PROFILE
Expand Down

0 comments on commit 555d9c8

Please sign in to comment.