synchronize at the beginning of all CUDA functions (#2661)
Fix #2660.

TensorFlow made its streams non-blocking in
tensorflow/tensorflow@9d12620.
Our own CUDA functions use the default stream, which is different from
TensorFlow's, so we need to synchronize at the beginning of every
function.

In the future, it might be worth using the same stream as TensorFlow's
to improve performance.
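
For context, a minimal, self-contained sketch of the pattern being added follows. It is not the library code itself: ERRCHECK, scale_kernel, scale_gpu, and scale_gpu_on_stream are hypothetical stand-ins for DPErrcheck and the actual operators.

```cuda
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Simplified stand-in for the library's DPErrcheck macro: abort on any CUDA error.
#define ERRCHECK(call)                                              \
  do {                                                              \
    cudaError_t err_ = (call);                                      \
    if (err_ != cudaSuccess) {                                      \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
              cudaGetErrorString(err_), __FILE__, __LINE__);        \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

__global__ void scale_kernel(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= 2.0f;
  }
}

// The pattern this commit adds: on entry, surface any pending error from
// earlier asynchronous launches, then wait for all outstanding work on the
// device (including kernels TensorFlow queued on its own non-blocking
// stream) before launching our own kernel on the default stream.
void scale_gpu(float *x, const int n) {
  ERRCHECK(cudaGetLastError());
  ERRCHECK(cudaDeviceSynchronize());
  const int TPB = 256;
  const int nblock = (n + TPB - 1) / TPB;
  scale_kernel<<<nblock, TPB>>>(x, n);
  ERRCHECK(cudaGetLastError());
  ERRCHECK(cudaDeviceSynchronize());
}

// Possible future direction mentioned above: instead of a device-wide
// synchronization, launch on the stream the caller (e.g. TensorFlow)
// already uses, so ordering is enforced within that stream without
// blocking the whole device.
void scale_gpu_on_stream(float *x, const int n, cudaStream_t stream) {
  const int TPB = 256;
  const int nblock = (n + TPB - 1) / TPB;
  scale_kernel<<<nblock, TPB, 0, stream>>>(x, n);
  ERRCHECK(cudaGetLastError());
}
```

The device-wide synchronization is the simple, safe fix, at the cost of serializing GPU work; sharing the caller's stream would avoid that, which is why the commit message flags it as a future improvement.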

Signed-off-by: Jinzhe Zeng <[email protected]>
njzjz authored Jul 9, 2023
1 parent 37fd8d1 commit d3d3c18
Showing 10 changed files with 64 additions and 0 deletions.
4 changes: 4 additions & 0 deletions source/lib/src/cuda/coord.cu
@@ -335,6 +335,8 @@ template <typename FPTYPE>
void normalize_coord_gpu(FPTYPE *coord,
const int natom,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const FPTYPE *boxt = region.boxt;
const FPTYPE *rec_boxt = region.rec_boxt;
const int nblock = (natom + TPB - 1) / TPB;
@@ -360,6 +362,8 @@ int copy_coord_gpu(FPTYPE *out_c,
const int &total_cellnum,
const int *cell_info,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
total_cellnum);
int *int_data_cpu = new int
6 changes: 6 additions & 0 deletions source/lib/src/cuda/gelu.cu
@@ -67,6 +67,8 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -83,6 +85,8 @@ void gelu_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -100,6 +104,8 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

8 changes: 8 additions & 0 deletions source/lib/src/cuda/neighbor_list.cu
@@ -187,6 +187,8 @@ int build_nlist_gpu(InputNlist &nlist,
if (mem_size < nall) {
return 1;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nblock = (nall + TPB - 1) / TPB;
int *ilist = nlist.ilist;
int *numneigh = nlist.numneigh;
@@ -229,6 +231,8 @@ void use_nlist_map(int *nlist,
const int *nlist_map,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -246,6 +250,8 @@ void use_nei_info_gpu(int *nlist,
const int nnei,
const int ntypes,
const bool b_nlist_map) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -291,6 +297,8 @@ __global__ void map_filter_ftype(int *ftype_out,
void filter_ftype_gpu_cuda(int *ftype_out,
const int *ftype_in,
const int nloc) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nloc + TPB - 1) / TPB;
map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
DPErrcheck(cudaGetLastError());
6 changes: 6 additions & 0 deletions source/lib/src/cuda/prod_env_mat.cu
@@ -569,6 +569,8 @@ void format_nbor_list_gpu_cuda(int* nlist,
const int nall,
const float rcut,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int LEN = 256;
const int nnei = sec.back();
const int nblock = (nloc + LEN - 1) / LEN;
@@ -629,6 +631,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
if (f_type == NULL) {
f_type = type;
}
@@ -669,6 +673,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em,
const float rcut,
const float rcut_smth,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nnei = sec.back();
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force.cu
@@ -110,6 +110,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

@@ -137,6 +139,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force_grad.cu
@@ -88,6 +88,8 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
@@ -117,6 +119,8 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial.cu
@@ -113,6 +113,8 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

@@ -141,6 +143,8 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial_grad.cu
@@ -92,6 +92,8 @@ void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
@@ -112,6 +114,8 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
6 changes: 6 additions & 0 deletions source/lib/src/cuda/region.cu
@@ -27,6 +27,8 @@ template <typename FPTYPE>
void convert_to_inter_gpu(FPTYPE *ri,
const Region<FPTYPE> &region,
const FPTYPE *rp) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
@@ -36,13 +38,17 @@ template <typename FPTYPE>
void convert_to_phys_gpu(FPTYPE *rp,
const Region<FPTYPE> &region,
const FPTYPE *ri) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}

template <typename FPTYPE>
void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_compute_volume<<<1, 1>>>(volume, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
18 changes: 18 additions & 0 deletions source/lib/src/cuda/tabulate.cu
@@ -630,6 +630,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, two_embed, table_info[0], table_info[1],
@@ -653,6 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));

@@ -679,6 +683,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
@@ -702,6 +708,8 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, table_info[0], table_info[1], table_info[2],
@@ -725,6 +733,8 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));

@@ -752,6 +762,8 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));

tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -774,6 +786,8 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(out, table, em, table_info[0], table_info[1],
table_info[2], table_info[3], table_info[4],
@@ -794,6 +808,8 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));

tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -816,6 +832,8 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(
cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
