deepmodeling · amcadmus · Aug 27, 2021 · Aug 27, 2021 · Aug 27, 2021
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
@@ -135,8 +135,8 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
   bool unloop = false;
   FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy
   for (int ii = 0; ii < MTILE; ii++) {
-    if (thread_idx < last_layer_size) {
-      iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx];
+    for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) {
+      iteratorA[ii * last_layer_size + jj] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj];
     }
   }
   __syncthreads();

diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
@@ -6,7 +6,6 @@
 #define TPB 256
 #define WARP_SIZE 64
 #define FULL_MASK 0xffffffff
-#include "gpu_rocm.h"
 
 template <typename FPTYPE>
 __forceinline__ __device__
@@ -140,8 +139,8 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
   bool unloop = false;
   FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy
   for (int ii = 0; ii < MTILE; ii++) {
-    if (thread_idx < last_layer_size) {
-      iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx];
+    for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) {
+      iteratorA[ii * last_layer_size + jj] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj];
     }
   }
   __syncthreads();

diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc
@@ -222,6 +222,7 @@ class TabulateFusionGradGradOp : public OpKernel {
           dz_dy,
           table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size);
       #endif // TENSORFLOW_USE_ROCM
+      OP_REQUIRES (context, (last_layer_size <= 1024),      errors::InvalidArgument ("In the process of model compression, the size of the last layer of embedding net must not be greater than 1024!"));
     }
     else if (device == "CPU") {
       deepmd::tabulate_fusion_grad_grad_cpu(