deepmodeling · amcadmus · Jun 30, 2021 · May 13, 2021 · Jun 15, 2021 · Jun 18, 2021
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
@@ -70,13 +70,13 @@ run_model (ENERGYTYPE &			dener,
 
   std::vector<Tensor> output_tensors;
   check_status (session->Run(input_tensors, 
-			    {"o_energy", "o_force", "o_atom_virial"}, 
+			    {"o_energy", "o_force", "o_atom_energy", "o_atom_virial"}, 
 			    {}, 
 			    &output_tensors));
 
   Tensor output_e = output_tensors[0];
   Tensor output_f = output_tensors[1];
-  Tensor output_av = output_tensors[2];
+  Tensor output_av = output_tensors[3];
 
   auto oe = output_e.flat <ENERGYTYPE> ();
   auto of = output_f.flat <VALUETYPE> ();

diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
@@ -50,22 +50,23 @@ __global__ void force_deriv_wrt_neighbors_a(
     const int nnei)
 {  
     // idy -> nnei
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    const unsigned int idy = blockIdx.y;
+    const unsigned int idx = blockIdx.x;
+    const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
-    const unsigned int idw = threadIdx.z;
     const int ndescrpt = nnei * 4;
-    if (idx >= nloc) {
+    if (idy >= nnei) {
         return;
     }
     // deriv wrt neighbors
     int j_idx = nlist[idx * nnei + idy];
     if (j_idx < 0) {
         return;
     }
-    atomicAdd(
-        force + j_idx * 3 + idz, 
-        net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]);
+    FPTYPE force_tmp = 0.f;
+    for (int idw = 0; idw < 4; ++idw) {
+        force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
+    }
+    atomicAdd(force + j_idx * 3 + idz, force_tmp);
 }
 
 template<typename FPTYPE>
@@ -78,11 +79,11 @@ __global__ void force_deriv_wrt_neighbors_r(
 		const int nnei)
 {  
     // idy -> nnei
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    const unsigned int idy = blockIdx.y;
+    const unsigned int idx = blockIdx.x;
+    const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 1;
-    if (idx >= nloc) {
+    if (idy >= nnei) {
         return;
     }
     // deriv wrt neighbors
@@ -116,9 +117,9 @@ void prod_force_a_gpu_cuda(
       net_deriv, in_deriv, ndescrpt);
 
   const int LEN = 64;
-  const int nblock = (nloc + LEN -1) / LEN;
-  dim3 block_grid(nblock, nnei);
-  dim3 thread_grid(LEN, 3, 4);
+  const int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       force, 
       net_deriv, in_deriv, nlist, nloc, nnei);
@@ -144,8 +145,8 @@ void prod_force_r_gpu_cuda(
       net_deriv, in_deriv, ndescrpt);
 
   const int LEN = 64;
-  const int nblock = (nloc + LEN -1) / LEN;
-  dim3 block_grid(nblock, nnei);
+  const int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       force, 

diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu
@@ -45,12 +45,11 @@ __global__ void virial_deriv_wrt_neighbors_a(
   // idz = dd0 * 3 + dd1
   // dd0 = idz / 3
   // dd1 = idz % 3
-  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  const unsigned int idy = blockIdx.y;
+  const unsigned int idx = blockIdx.x;
+  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
   const unsigned int idz = threadIdx.y;
-  const unsigned int idw = threadIdx.z;
   const int ndescrpt = nnei * 4;
-  if (idx >= nloc) {
+  if (idy >= nnei) {
       return;
   }
   int j_idx = nlist[idx * nnei + idy];
@@ -60,9 +59,11 @@ __global__ void virial_deriv_wrt_neighbors_a(
   // atomicAdd(
   //    virial + idz, 
   //    net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]);
-  atomicAdd(
-      atom_virial + j_idx * 9 + idz, 
-      net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]);
+  FPTYPE virial_tmp = 0.f;
+  for (int idw = 0; idw < 4; ++idw) {
+      virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
+  }
+  atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp);
 }
 
 template<typename FPTYPE>
@@ -81,12 +82,12 @@ __global__ void virial_deriv_wrt_neighbors_r(
     // idz = dd0 * 3 + dd1
     // dd0 = idz / 3
     // dd1 = idz % 3
-    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    const unsigned int idy = blockIdx.y;
+    const unsigned int idx = blockIdx.x;
+    const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
     const unsigned int idz = threadIdx.y;
     const int ndescrpt = nnei * 1;
 
-    if (idx >= nloc) {
+    if (idy >= nnei) {
         return;
     }
     int j_idx = nlist[idx * nnei + idy];
@@ -122,9 +123,9 @@ void prod_virial_a_gpu_cuda(
       0.0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
-  int nblock = (nloc + LEN -1) / LEN;
-  dim3 block_grid(nblock, nnei);
-  dim3 thread_grid(LEN, 9, 4);
+  int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
+  dim3 thread_grid(LEN, 9);
   // compute virial of a frame
   virial_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       virial, atom_virial, 
@@ -155,8 +156,8 @@ void prod_virial_r_gpu_cuda(
       0.0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
-  int nblock = (nloc + LEN -1) / LEN;
-  dim3 block_grid(nblock, nnei);
+  int nblock = (nnei + LEN - 1) / LEN;
+  dim3 block_grid(nloc, nblock);
   dim3 thread_grid(LEN, 9);
   // compute virial of a frame
   virial_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(