diff --git a/doc/train/gpu-limitations.md b/doc/train/gpu-limitations.md index a28fe7d400..269ccb8e57 100644 --- a/doc/train/gpu-limitations.md +++ b/doc/train/gpu-limitations.md @@ -3,4 +3,5 @@ If you use deepmd-kit in a GPU environment, the acceptable value range of some v 1. The number of atom type of a given system must be less than 128. 2. The maximum distance between an atom and it's neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. 3. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is actually limited by the GPU memory size currently, usually within 1000,000 atoms even at the model compression mode. -4. The total sel value of training parameters(in model/descriptor section) must be less than 4096. \ No newline at end of file +4. The total sel value of training parameters(in model/descriptor section) must be less than 4096. +5. The size of the last layer of the embedding net must not be greater than 1024 during the model compression process. 
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index a45fc2b7e1..265a2baffe 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -135,8 +135,8 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial( bool unloop = false; FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy for (int ii = 0; ii < MTILE; ii++) { - if (thread_idx < last_layer_size) { - iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) { + iteratorA[ii * last_layer_size + jj] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj]; } } __syncthreads(); diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index 8321f1354b..050bf1658a 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -6,7 +6,6 @@ #define TPB 256 #define WARP_SIZE 64 #define FULL_MASK 0xffffffff -#include "gpu_rocm.h" template __forceinline__ __device__ @@ -140,8 +139,8 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial( bool unloop = false; FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy for (int ii = 0; ii < MTILE; ii++) { - if (thread_idx < last_layer_size) { - iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) { + iteratorA[ii * last_layer_size + jj] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj]; } } __syncthreads(); diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index d0c16fd122..3d3019b188 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -222,6 +222,7 @@ class TabulateFusionGradGradOp : public OpKernel { dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, 
last_layer_size); #endif // TENSORFLOW_USE_ROCM + OP_REQUIRES (context, (last_layer_size <= 1024), errors::InvalidArgument ("In the process of model compression, the size of the last layer of the embedding net must not be greater than 1024!")); } else if (device == "CPU") { deepmd::tabulate_fusion_grad_grad_cpu(