synchronize at the beginning of all CUDA functions (#2661)
Fix #2660.

TensorFlow made its streams non-blocking in
tensorflow/tensorflow@9d12620.
Our own CUDA functions use the default stream, which is different from
TensorFlow's, so we need to synchronize at the beginning of every
function.

In the future, it might be worth using the same stream as TensorFlow's
to improve performance.
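
For context, a minimal, self-contained sketch of the pattern being added follows. It is not the library code itself: ERRCHECK, scale_kernel, scale_gpu, and scale_gpu_on_stream are hypothetical stand-ins for DPErrcheck and the actual operators.

```cuda
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// Simplified stand-in for the library's DPErrcheck macro: abort on any CUDA error.
#define ERRCHECK(call)                                              \
  do {                                                              \
    cudaError_t err_ = (call);                                      \
    if (err_ != cudaSuccess) {                                      \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
              cudaGetErrorString(err_), __FILE__, __LINE__);        \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

__global__ void scale_kernel(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= 2.0f;
  }
}

// The pattern this commit adds: on entry, surface any pending error from
// earlier asynchronous launches, then wait for all outstanding work on the
// device (including kernels TensorFlow queued on its own non-blocking
// stream) before launching our own kernel on the default stream.
void scale_gpu(float *x, const int n) {
  ERRCHECK(cudaGetLastError());
  ERRCHECK(cudaDeviceSynchronize());
  const int TPB = 256;
  const int nblock = (n + TPB - 1) / TPB;
  scale_kernel<<<nblock, TPB>>>(x, n);
  ERRCHECK(cudaGetLastError());
  ERRCHECK(cudaDeviceSynchronize());
}

// Possible future direction mentioned above: instead of a device-wide
// synchronization, launch on the stream the caller (e.g. TensorFlow)
// already uses, so ordering is enforced within that stream without
// blocking the whole device.
void scale_gpu_on_stream(float *x, const int n, cudaStream_t stream) {
  const int TPB = 256;
  const int nblock = (n + TPB - 1) / TPB;
  scale_kernel<<<nblock, TPB, 0, stream>>>(x, n);
  ERRCHECK(cudaGetLastError());
}
```

The device-wide synchronization is the simple, safe fix, at the cost of serializing GPU work; sharing the caller's stream would avoid that, which is why the commit message flags it as a future improvement.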

Signed-off-by: Jinzhe Zeng <[email protected]>
njzjz authored Jul 9, 2023
1 parent 37fd8d1 commit d3d3c18
Showing 10 changed files with 64 additions and 0 deletions.
4 changes: 4 additions & 0 deletions source/lib/src/cuda/coord.cu
@@ -335,6 +335,8 @@ template <typename FPTYPE>
void normalize_coord_gpu(FPTYPE *coord,
const int natom,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const FPTYPE *boxt = region.boxt;
const FPTYPE *rec_boxt = region.rec_boxt;
const int nblock = (natom + TPB - 1) / TPB;
@@ -360,6 +362,8 @@ int copy_coord_gpu(FPTYPE *out_c,
const int &total_cellnum,
const int *cell_info,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
total_cellnum);
int *int_data_cpu = new int
6 changes: 6 additions & 0 deletions source/lib/src/cuda/gelu.cu
@@ -67,6 +67,8 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -83,6 +85,8 @@ void gelu_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -100,6 +104,8 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

8 changes: 8 additions & 0 deletions source/lib/src/cuda/neighbor_list.cu
@@ -187,6 +187,8 @@ int build_nlist_gpu(InputNlist &nlist,
if (mem_size < nall) {
return 1;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nblock = (nall + TPB - 1) / TPB;
int *ilist = nlist.ilist;
int *numneigh = nlist.numneigh;
@@ -229,6 +231,8 @@ void use_nlist_map(int *nlist,
const int *nlist_map,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -246,6 +250,8 @@ void use_nei_info_gpu(int *nlist,
const int nnei,
const int ntypes,
const bool b_nlist_map) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -291,6 +297,8 @@ __global__ void map_filter_ftype(int *ftype_out,
void filter_ftype_gpu_cuda(int *ftype_out,
const int *ftype_in,
const int nloc) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nloc + TPB - 1) / TPB;
map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
DPErrcheck(cudaGetLastError());
6 changes: 6 additions & 0 deletions source/lib/src/cuda/prod_env_mat.cu
@@ -569,6 +569,8 @@ void format_nbor_list_gpu_cuda(int* nlist,
const int nall,
const float rcut,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int LEN = 256;
const int nnei = sec.back();
const int nblock = (nloc + LEN - 1) / LEN;
@@ -629,6 +631,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
if (f_type == NULL) {
f_type = type;
}
@@ -669,6 +673,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em,
const float rcut,
const float rcut_smth,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nnei = sec.back();
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force.cu
@@ -110,6 +110,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

@@ -137,6 +139,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force_grad.cu
@@ -88,6 +88,8 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
@@ -117,6 +119,8 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial.cu
@@ -113,6 +113,8 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

@@ -141,6 +143,8 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial_grad.cu
@@ -92,6 +92,8 @@ void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
@@ -112,6 +114,8 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
6 changes: 6 additions & 0 deletions source/lib/src/cuda/region.cu
@@ -27,6 +27,8 @@ template <typename FPTYPE>
void convert_to_inter_gpu(FPTYPE *ri,
const Region<FPTYPE> &region,
const FPTYPE *rp) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
@@ -36,13 +38,17 @@ template <typename FPTYPE>
void convert_to_phys_gpu(FPTYPE *rp,
const Region<FPTYPE> &region,
const FPTYPE *ri) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}

template <typename FPTYPE>
void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_compute_volume<<<1, 1>>>(volume, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
18 changes: 18 additions & 0 deletions source/lib/src/cuda/tabulate.cu
@@ -630,6 +630,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, two_embed, table_info[0], table_info[1],
@@ -653,6 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));

@@ -679,6 +683,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
@@ -702,6 +708,8 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, table_info[0], table_info[1], table_info[2],
@@ -725,6 +733,8 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));

@@ -752,6 +762,8 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));

tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -774,6 +786,8 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(out, table, em, table_info[0], table_info[1],
table_info[2], table_info[3], table_info[4],
@@ -794,6 +808,8 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));

tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -816,6 +832,8 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(
cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
