From 4877203994cef2be433c8e36a68ade3319b1d560 Mon Sep 17 00:00:00 2001 From: HLA Date: Tue, 24 Aug 2021 00:00:58 +0800 Subject: [PATCH 1/6] commit-message: decouple the activation function's type from the model compression process in SE_A; both tanh and gelu are now available. --- deepmd/descriptor/se_a.py | 4 +- deepmd/utils/tabulate.py | 45 ++++++++---- source/op/unaggregated_grad.cc | 122 ++++++++++++++++++++++++++------- source/tests/test_tabulate.py | 49 +++++++++++++ 4 files changed, 182 insertions(+), 38 deletions(-) create mode 100644 source/tests/test_tabulate.py diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index fab872e721..fbc9a77b56 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -126,6 +126,7 @@ def __init__ (self, self.uniform_seed = uniform_seed self.seed_shift = embedding_net_rand_seed_shift(self.filter_neuron) self.trainable = trainable + self.compress_activation_fn = get_activation_func(activation_function) self.filter_activation_fn = get_activation_func(activation_function) self.filter_precision = get_precision(precision) self.filter_np_precision = get_np_precision(precision) @@ -316,7 +317,8 @@ def enable_compression(self, The overflow check frequency """ self.compress = True - self.table = DPTabulate(model_file, self.type_one_side, self.exclude_types) + self.table = DPTabulate( + model_file, self.type_one_side, self.exclude_types, self.compress_activation_fn) self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] self.lower, self.upper \ = self.table.build(min_nbor_dist, diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 719697dc87..44b4380e70 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -34,7 +34,8 @@ class DPTabulate(): def __init__(self, model_file : str, type_one_side : bool = False, - exclude_types : List[List[int]] = []) -> None: + exclude_types : List[List[int]] = [], + activation_fn=tf.nn.tanh) -> None: """ Constructor """ @@ -44,6 +45,15 @@ def __init__(self, self.exclude_types = exclude_types if self.type_one_side and len(self.exclude_types) != 0: raise RuntimeError('"type_one_side" is not compatible with "exclude_types"') + + # functype + if activation_fn.__name__ == 'tf.nn.tanh' or activation_fn.__name__ == 'tanh': + self.functype = 1 + elif activation_fn.__name__ == 'gelu': + self.functype = 2 + else: + raise RuntimeError("Unknown activation function type!") + self.activation_fn = activation_fn self.graph, self.graph_def = load_graph_def(self.model_file) self.sess = tf.Session(graph = self.graph) @@ -199,26 +209,37 @@ def _make_data(self, xx, idx): xx = tf.reshape(xx, [xx.size, -1]) for layer in range(self.layer_size): if layer == 0: - yy = self._layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) - dy = op_module.unaggregated_dy_dx_s(yy, self.matrix["layer_" + str(layer + 1)][idx]) - dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, self.matrix["layer_" + str(layer + 1)][idx]) + xbar = tf.matmul( + xx, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] + yy = self._layer_0( + xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dy = op_module.unaggregated_dy_dx_s( + yy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + dy2 = op_module.unaggregated_dy2_dx_s( + yy, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) else: - tt, yy = self._layer_1(yy,
self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) - dz = op_module.unaggregated_dy_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dy) - dy2 = op_module.unaggregated_dy2_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dz, dy, dy2) + ybar = tf.matmul( + yy, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] + tt, zz = self._layer_1( + yy, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dz = op_module.unaggregated_dy_dx( + zz - tt, self.matrix["layer_" + str(layer + 1)][idx], dy, ybar, tf.constant(self.functype)) + dy2 = op_module.unaggregated_dy2_dx( + zz - tt, self.matrix["layer_" + str(layer + 1)][idx], dy, dy2, ybar, tf.constant(self.functype)) dy = dz - - vv = yy.eval() + yy = zz + + vv = zz.eval() dd = dy.eval() d2 = dy2.eval() return vv, dd, d2 def _layer_0(self, x, w, b): - return tf.nn.tanh(tf.matmul(x, w) + b) + return self.activation_fn(tf.matmul(x, w) + b) def _layer_1(self, x, w, b): - t = tf.concat([x, x], axis = 1) - return t, tf.nn.tanh(tf.matmul(x, w) + b) + t + t = tf.concat([x, x], axis=1) + return t, self.activation_fn(tf.matmul(x, w) + b) + t def _save_data(self): for ii in range(self.ntypes * self.ntypes): diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 343a339a92..f43dcf17a1 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -2,42 +2,90 @@ #include "ComputeDescriptor.h" #include "neighbor_list.h" + +#define SQRT2_PI 0.7978845608028654 +#define GGELU 0.044715 + REGISTER_OP("UnaggregatedDyDxS") .Attr("T: {float, double} = DT_DOUBLE") .Input("y: T") - .Input("w: T") + .Input("w: T") + .Input("xbar: T") + .Input("functype: int32") .Output("dy_dx: T"); REGISTER_OP("UnaggregatedDyDx") .Attr("T: {float, double} = DT_DOUBLE") .Input("z: T") .Input("w: T") - .Input("dy_dx: T") + .Input("dy_dx: T") + .Input("ybar: T") + .Input("functype: int32") .Output("dz_dx: T"); REGISTER_OP("UnaggregatedDy2DxS") .Attr("T: {float, double} = DT_DOUBLE") .Input("y: T") .Input("dy: T") - .Input("w: T") + .Input("w: T") + .Input("xbar: T") + .Input("functype: int32") .Output("dy2_dx: T"); REGISTER_OP("UnaggregatedDy2Dx") .Attr("T: {float, double} = DT_DOUBLE") .Input("z: T") - .Input("w: T") - .Input("dz_dx: T") + .Input("w: T") .Input("dy_dx: T") .Input("dy2_dx: T") + .Input("ybar: T") + .Input("functype: int32") .Output("dz2_dx: T"); +template +FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. 
+{ + switch (functype) + { + case 1: + return (1 - y * y); + case 2: + { + const FPTYPE var = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + } + default: + return -1; + } + +} + +template +FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) +{ + switch (functype) + { + case 1: + return -2 * y * (1 - y * y); + case 2: + { + const FPTYPE var1 = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT2_PI * xbar * xbar * (1 - var1 * var1) - SQRT2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + } + default: + return -1; + } +} + + template struct UnaggregatedDyDxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy_dx, const int functype) { #pragma omp parallel for for (int ii = 0; ii < length; ii++) { for (int jj = 0; jj < width; jj++) { - dy_dx[ii * width + jj] = (1 - y[ii * width + jj] * y[ii * width + jj]) * w[jj]; + dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj],functype)*w[jj]; } } } @@ -53,12 +101,13 @@ struct UnaggregatedDyDxSFunctor { // calculate the gradient for all variables! template struct UnaggregatedDyDxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz_dx, const int functype) { + //width=2*size #pragma omp parallel for for (int kk = 0; kk < length; kk++) { for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; @@ -80,11 +129,11 @@ struct UnaggregatedDyDxFunctor { template struct UnaggregatedDy2DxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const int length, const int width, FPTYPE * dy2_dx) { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy2_dx, const int functype) { #pragma omp parallel for for (int ii = 0; ii < length; ii++) { for (int jj = 0; jj < width; jj++) { - dy2_dx[ii * width + jj] = -2 * w[jj] * y[ii * width + jj] * dy[ii * width + jj]; + dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj],y[ii * width + jj],functype)*w[jj]*w[jj]; } } } @@ -100,12 +149,12 @@ struct UnaggregatedDy2DxSFunctor { // calculate the gradient for all variables! 
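// Derivation sketch for the functype == 2 branches of grad() and grad_grad() above,
// assuming deepmd's gelu is the usual tanh approximation
//   gelu(x) = 0.5 * x * (1 + tanh(SQRT2_PI * (x + GGELU * x^3))), with SQRT2_PI = sqrt(2/pi).
// Write t = tanh(SQRT2_PI * (x + GGELU * x^3)) and t' = SQRT2_PI * (1 - t * t) * (1 + 3 * GGELU * x * x).
// The product and chain rules then give
//   gelu'(x)  = 0.5 * (1 + t) + 0.5 * x * t'
//   gelu''(x) = t' - SQRT2_PI * x * t' * (3 * GGELU * x * x + 1) * t + 3 * GGELU * SQRT2_PI * x * x * (1 - t * t),
// which matches the returned expressions term by term (var == t in grad(); var1 == t, var2 == t' in grad_grad()).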
template struct UnaggregatedDy2DxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz2_dx, const int functype) { #pragma omp parallel for for (int kk = 0; kk < length; kk++) { for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; @@ -115,7 +164,7 @@ struct UnaggregatedDy2DxFunctor { for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } - dz_drou -= 2 * z[kk * width + ii] * (dz_dx[kk * width + ii] - dy_dx[kk * size + ii % size]) * accumulator; + dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii],functype) * accumulator * accumulator; dz_drou += dy2_dx[kk * size + ii % size]; dz2_dx[kk * width + ii] = dz_drou; } @@ -141,13 +190,18 @@ class UnaggregatedDyDxSOp : public OpKernel { void _Compute(OpKernelContext* context) { // Grab the input tensor + //xbar=xw+b int context_input_index = 0; const Tensor& y = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); + const Tensor& xbar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + //check functype int context_output_index = 0; Tensor* dy_dx = NULL; @@ -159,11 +213,14 @@ class UnaggregatedDyDxSOp : public OpKernel { context->eigen_device(), // define actually graph execution device y.flat().data(), w.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), - dy_dx->flat().data() + dy_dx->flat().data(), + functype.flat()(0) ); } + private: }; @@ -182,11 +239,14 @@ class UnaggregatedDy2DxSOp : public OpKernel { const Tensor& y = context->input(context_input_index++); const Tensor& dy = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); + const Tensor& xbar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); int context_output_index = 0; Tensor* dy2_dx = NULL; @@ -199,11 +259,14 @@ class 
UnaggregatedDy2DxSOp : public OpKernel { y.flat().data(), dy.flat().data(), w.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), - dy2_dx->flat().data() + dy2_dx->flat().data(), + functype.flat()(0) ); } + private: }; @@ -222,11 +285,14 @@ class UnaggregatedDyDxOp : public OpKernel { const Tensor& z = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); const Tensor& dy_dx = context->input(context_input_index++); + const Tensor& ybar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); int context_output_index = 0; Tensor* dz_dx = NULL; @@ -239,12 +305,15 @@ class UnaggregatedDyDxOp : public OpKernel { z.flat().data(), w.flat().data(), dy_dx.flat().data(), + ybar.flat().data(), z.shape().dim_size(0), - z.shape().dim_size(1), - w.shape().dim_size(0), - dz_dx->flat().data() + z.shape().dim_size(1), //N1 + w.shape().dim_size(0), //N0 , N1=2N0 + dz_dx->flat().data(), + functype.flat()(0) ); } + private: }; @@ -262,16 +331,17 @@ class UnaggregatedDy2DxOp : public OpKernel { int context_input_index = 0; const Tensor& z = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); - const Tensor& dz_dx = context->input(context_input_index++); const Tensor& dy_dx = context->input(context_input_index++); const Tensor& dy2_dx = context->input(context_input_index++); + const Tensor& ybar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dz_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); int context_output_index = 0; Tensor* dz2_dx = NULL; @@ -283,15 +353,17 @@ class UnaggregatedDy2DxOp : public OpKernel { context->eigen_device(), // define actually graph execution device z.flat().data(), w.flat().data(), - dz_dx.flat().data(), dy_dx.flat().data(), dy2_dx.flat().data(), + ybar.flat().data(), z.shape().dim_size(0), z.shape().dim_size(1), w.shape().dim_size(0), - dz2_dx->flat().data() + dz2_dx->flat().data(), + functype.flat()(0) ); } + private: }; diff --git a/source/tests/test_tabulate.py b/source/tests/test_tabulate.py new file mode 100644 index 0000000000..f082ac3dcc --- /dev/null +++ b/source/tests/test_tabulate.py @@ -0,0 +1,49 @@ +import unittest +import numpy as np +from deepmd.utils.tabulate import DPTabulate +from deepmd.env import op_module 
+from deepmd.env import tf +from deepmd.common import gelu + +tf.enable_eager_execution( + config=None, + device_policy=None, + execution_mode=None +) +# Now just test some OPs utilized by DPTabulate sourced in /opt/deepmd-kit/source/op/unaggregated_grad.cc + +class TestDPTabulate(unittest.TestCase): + def test_op_tanh(self): + w=tf.constant([[0.1,0.2,0.3,0.4],[0.5,0.6,0.7,0.8],[0.9,1,1.1,1.2]],dtype='double') + x=tf.constant([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9],[1.0,1.1,1.2]],dtype='double') + b=tf.constant([[0.1],[0.2],[0.3],[0.4]],dtype='double') + xbar = tf.matmul(x, w) + b + y=tf.nn.tanh(xbar) + dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(1)) + answer = np.array([[8.008666403121351973e-02, 1.513925729426658651e-01, 2.134733287761668430e-01, 2.661983049806041501e-01], + [4.010658815015743367e-02, 6.306476628799793926e-02, 7.332167904608145881e-02, 7.494218676568849269e-02], + [1.561705624394135218e-02, 1.994112926507514427e-02, 1.887519955881525671e-02, 1.576442161040989692e-02], + [5.492686739421748753e-03, 5.754985286040992763e-03, 4.493113544969218158e-03, 3.107638130764600777e-03]]) + + self.assertTrue((dy.numpy()==answer).all()) + + def test_op_gelu(self): + w = tf.constant([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [ + 0.9, 1, 1.1, 1.2]], dtype='double') + x = tf.constant([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [ + 0.7, 0.8, 0.9], [1.0, 1.1, 1.2]], dtype='double') + b = tf.constant([[0.1], [0.2], [0.3], [0.4]], dtype='double') + xbar = tf.matmul(x, w) + b + y = gelu(xbar) + dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(2)) + answer = np.array([[8.549286163555620821e-02, 1.782905778685600906e-01, 2.776474599997448833e-01, 3.827650237273348965e-01], + [1.089906023807040714e-01, 2.230820937721638697e-01, 3.381867859682909927e-01, 4.513008399758057232e-01], + [1.124254240556722684e-01, 2.209918074710395253e-01, 3.238894323148118759e-01, 4.220357318198978414e-01], + [1.072173273655498138e-01, 2.082159073100979807e-01, 3.059816075270163083e-01, 4.032981557798429595e-01]]) + + self.assertTrue((dy.numpy() == answer).all()) + + + +if __name__ == '__main__': + unittest.main() From 95c1ea550dfb10583cc89dc8078cb17532664374 Mon Sep 17 00:00:00 2001 From: HLA Date: Wed, 25 Aug 2021 11:58:39 +0800 Subject: [PATCH 2/6] commit-message: modified code and passed unittest --- deepmd/utils/tabulate.py | 7 +++++-- source/op/unaggregated_grad.cc | 13 ++++++------- source/tests/test_tabulate.py | 21 ++++++++++++--------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 44b4380e70..93a75c6f24 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -5,6 +5,7 @@ from typing import Tuple, List from deepmd.env import tf from deepmd.env import op_module +from deepmd.common import ACTIVATION_FN_DICT from deepmd.utils.sess import run_sess from deepmd.utils.graph import get_tensor_by_name_from_graph, load_graph_def from deepmd.utils.graph import get_embedding_net_nodes_from_graph_def @@ -30,6 +31,8 @@ class DPTabulate(): exclude_types : List[List[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. + activation_function + The activation function in the embedding net. Supported options are {"tanh","gelu"} in common.ACTIVATION_FN_DICT. 
""" def __init__(self, model_file : str, @@ -47,9 +50,9 @@ def __init__(self, raise RunTimeError('"type_one_side" is not compatible with "exclude_types"') # functype - if activation_fn.__name__ == 'tf.nn.tanh' or activation_fn.__name__ == 'tanh': + if activation_fn == ACTIVATION_FN_DICT["tanh"]: self.functype = 1 - elif activation_fn.__name__ == 'gelu': + elif activation_fn == ACTIVATION_FN_DICT["gelu"]: self.functype = 2 else: raise RunTimeError("Unknown actication function type!") diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index f43dcf17a1..542268d3e7 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -1,9 +1,8 @@ #include "custom_op.h" #include "ComputeDescriptor.h" #include "neighbor_list.h" +#include "device.h" - -#define SQRT2_PI 0.7978845608028654 #define GGELU 0.044715 REGISTER_OP("UnaggregatedDyDxS") @@ -50,8 +49,8 @@ FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=t return (1 - y * y); case 2: { - const FPTYPE var = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); - return 0.5 * SQRT2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; } default: return -1; @@ -68,9 +67,9 @@ FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) return -2 * y * (1 - y * y); case 2: { - const FPTYPE var1 = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); - const FPTYPE var2 = SQRT2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); - return 3 * GGELU * SQRT2_PI * xbar * xbar * (1 - var1 * var1) - SQRT2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; } default: return -1; diff --git a/source/tests/test_tabulate.py b/source/tests/test_tabulate.py index f082ac3dcc..ce26c4e3e6 100644 --- a/source/tests/test_tabulate.py +++ b/source/tests/test_tabulate.py @@ -5,11 +5,6 @@ from deepmd.env import tf from deepmd.common import gelu -tf.enable_eager_execution( - config=None, - device_policy=None, - execution_mode=None -) # Now just test some OPs utilized by DPTabulate sourced in /opt/deepmd-kit/source/op/unaggregated_grad.cc class TestDPTabulate(unittest.TestCase): @@ -20,13 +15,17 @@ def test_op_tanh(self): xbar = tf.matmul(x, w) + b y=tf.nn.tanh(xbar) dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(1)) + dy_array = tf.Session().run(dy) answer = np.array([[8.008666403121351973e-02, 1.513925729426658651e-01, 2.134733287761668430e-01, 2.661983049806041501e-01], - [4.010658815015743367e-02, 6.306476628799793926e-02, 7.332167904608145881e-02, 7.494218676568849269e-02], + [4.010658815015744061e-02, 6.306476628799793926e-02, 7.332167904608145881e-02, 7.494218676568849269e-02], [1.561705624394135218e-02, 1.994112926507514427e-02, 1.887519955881525671e-02, 1.576442161040989692e-02], [5.492686739421748753e-03, 5.754985286040992763e-03, 4.493113544969218158e-03, 3.107638130764600777e-03]]) - self.assertTrue((dy.numpy()==answer).all()) - + places = 18 + for ii in range(dy_array.shape[0]): + for jj in range(dy_array.shape[1]): + self.assertAlmostEqual(dy_array[ii,jj], 
answer[ii,jj], places=places) + def test_op_gelu(self): w = tf.constant([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [ 0.9, 1, 1.1, 1.2]], dtype='double') @@ -36,12 +35,16 @@ def test_op_gelu(self): xbar = tf.matmul(x, w) + b y = gelu(xbar) dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(2)) + dy_array = tf.Session().run(dy) answer = np.array([[8.549286163555620821e-02, 1.782905778685600906e-01, 2.776474599997448833e-01, 3.827650237273348965e-01], [1.089906023807040714e-01, 2.230820937721638697e-01, 3.381867859682909927e-01, 4.513008399758057232e-01], [1.124254240556722684e-01, 2.209918074710395253e-01, 3.238894323148118759e-01, 4.220357318198978414e-01], [1.072173273655498138e-01, 2.082159073100979807e-01, 3.059816075270163083e-01, 4.032981557798429595e-01]]) - self.assertTrue((dy.numpy() == answer).all()) + places = 18 + for ii in range(dy_array.shape[0]): + for jj in range(dy_array.shape[1]): + self.assertAlmostEqual(dy_array[ii, jj], answer[ii, jj], places=places) From 2d7e537219c4ca686b18fc40dc59da6008cff6fa Mon Sep 17 00:00:00 2001 From: HLA Date: Wed, 25 Aug 2021 14:20:49 +0800 Subject: [PATCH 3/6] commit-message: Format Document --- deepmd/utils/tabulate.py | 3 +- source/op/unaggregated_grad.cc | 418 ++++++++++++++++++--------------- 2 files changed, 227 insertions(+), 194 deletions(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 93a75c6f24..f1057b38f2 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -2,6 +2,7 @@ import math import logging import numpy as np +from typing import Callable from typing import Tuple, List from deepmd.env import tf from deepmd.env import op_module @@ -38,7 +39,7 @@ def __init__(self, model_file : str, type_one_side : bool = False, exclude_types : List[List[int]] = [], - activation_fn=tf.nn.tanh) -> None: + activation_fn : Callable[[tf.Tensor], tf.Tensor] = tf.nn.tanh) -> None: """ Constructor """ diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 542268d3e7..9810e1a36a 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -6,56 +6,55 @@ #define GGELU 0.044715 REGISTER_OP("UnaggregatedDyDxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("w: T") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy_dx: T"); REGISTER_OP("UnaggregatedDyDx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz_dx: T"); REGISTER_OP("UnaggregatedDy2DxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("dy: T") - .Input("w: T") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("dy: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy2_dx: T"); REGISTER_OP("UnaggregatedDy2Dx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") - .Input("dy2_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") + .Input("dy2_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz2_dx: T"); template -FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. +FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. 
{ switch (functype) { - case 1: - return (1 - y * y); - case 2: - { - const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; - } - default: - return -1; - } - + case 1: + return (1 - y * y); + case 2: + { + const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + } + default: + return -1; + } } template @@ -63,52 +62,60 @@ FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) { switch (functype) { - case 1: - return -2 * y * (1 - y * y); - case 2: - { - const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); - return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; - } - default: - return -1; - } + case 1: + return -2 * y * (1 - y * y); + case 2: + { + const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + } + default: + return -1; + } } - - template -struct UnaggregatedDyDxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy_dx, const int functype) { - #pragma omp parallel for - for (int ii = 0; ii < length; ii++) { - for (int jj = 0; jj < width; jj++) { - dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj],functype)*w[jj]; +struct UnaggregatedDyDxSFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy_dx, const int functype) + { +#pragma omp parallel for + for (int ii = 0; ii < length; ii++) + { + for (int jj = 0; jj < width; jj++) + { + dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj]; } } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) + { + //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
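// Reading sketch for the single-layer (*S) functors above, assuming the first
// embedding-net layer sees one scalar x per row (DPTabulate._make_data reshapes
// xx to shape [N, 1]): with xbar_j = x * w_j + b_j and y_j = act(xbar_j),
//   dy_j/dx   = act'(xbar_j)  * w_j         (UnaggregatedDyDxSFunctor)
//   d2y_j/dx2 = act''(xbar_j) * w_j * w_j   (UnaggregatedDy2DxSFunctor)
// where act' and act'' are evaluated elementwise by grad() and grad_grad().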
template -struct UnaggregatedDyDxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz_dx, const int functype) { - //width=2*size - #pragma omp parallel for - for (int kk = 0; kk < length; kk++) { - for (int ii = 0; ii < width; ii++) { +struct UnaggregatedDyDxFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz_dx, const int functype) + { +//width=2*size +#pragma omp parallel for + for (int kk = 0; kk < length; kk++) + { + for (int ii = 0; ii < width; ii++) + { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); + FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) { + for (int jj = 0; jj < size; jj++) + { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } dz_drou *= accumulator; @@ -118,238 +125,264 @@ struct UnaggregatedDyDxFunctor { } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const int length, const int width, const int size, FPTYPE *dz_dx) + { + //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template -struct UnaggregatedDy2DxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy2_dx, const int functype) { - #pragma omp parallel for - for (int ii = 0; ii < length; ii++) { - for (int jj = 0; jj < width; jj++) { - dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj],y[ii * width + jj],functype)*w[jj]*w[jj]; +struct UnaggregatedDy2DxSFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *dy, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy2_dx, const int functype) + { +#pragma omp parallel for + for (int ii = 0; ii < length; ii++) + { + for (int jj = 0; jj < width; jj++) + { + dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj] * w[jj]; } } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) + { + //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
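// Sketch of the recursion implemented by UnaggregatedDyDxFunctor above and
// UnaggregatedDy2DxFunctor below, assuming each later layer computes
// z = act(y * W + b) + [y, y] (the concat skip doubles the width, so width == 2 * size)
// and that the caller feeds the op z with the skip already subtracted
// (zz - tt in DPTabulate._make_data), so ybar = y * W + b pairs with the bare activation.
// For output column ii, with s = sum_j W[j][ii] * dy_dx[j]:
//   dz_ii/dx   = act'(ybar_ii) * s + dy_dx[ii % size]
//   d2z_ii/dx2 = act'(ybar_ii) * sum_j W[j][ii] * dy2_dx[j] + act''(ybar_ii) * s * s + dy2_dx[ii % size]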
template -struct UnaggregatedDy2DxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz2_dx, const int functype) { - #pragma omp parallel for - for (int kk = 0; kk < length; kk++) { - for (int ii = 0; ii < width; ii++) { +struct UnaggregatedDy2DxFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz2_dx, const int functype) + { +#pragma omp parallel for + for (int kk = 0; kk < length; kk++) + { + for (int ii = 0; ii < width; ii++) + { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); + FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) { + for (int jj = 0; jj < size; jj++) + { accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; } dz_drou *= accumulator; accumulator = 0.0; - for (int jj = 0; jj < size; jj++) { + for (int jj = 0; jj < size; jj++) + { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } - dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii],functype) * accumulator * accumulator; + dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii], functype) * accumulator * accumulator; dz_drou += dy2_dx[kk * size + ii % size]; dz2_dx[kk * width + ii] = dz_drou; } } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dz_dx, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const int length, const int width, const int size, FPTYPE *dz2_dx) + { + //Currently, Do nothing at all! 
return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; -template -class UnaggregatedDyDxSOp : public OpKernel { - public: - explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDyDxSOp : public OpKernel +{ +public: + explicit UnaggregatedDyDxSOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor - //xbar=xw+b + //xbar=xw+b int context_input_index = 0; - const Tensor& y = context->input(context_input_index++); - const Tensor& w = context->input(context_input_index++); - const Tensor& xbar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &y = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &xbar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); - //check functype + OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + //check functype int context_output_index = 0; - Tensor* dy_dx = NULL; + Tensor *dy_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - y.shape(), - &dy_dx)); + y.shape(), + &dy_dx)); UnaggregatedDyDxSFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device y.flat().data(), w.flat().data(), - xbar.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), dy_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; -template -class UnaggregatedDy2DxSOp : public OpKernel { - public: - explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDy2DxSOp : public OpKernel +{ +public: + explicit UnaggregatedDy2DxSOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor int context_input_index = 0; - const Tensor& y = context->input(context_input_index++); - const Tensor& dy = context->input(context_input_index++); - const Tensor& w = 
context->input(context_input_index++); - const Tensor& xbar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &y = context->input(context_input_index++); + const Tensor &dy = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &xbar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - + OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (dy.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + int context_output_index = 0; - Tensor* dy2_dx = NULL; + Tensor *dy2_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - y.shape(), - &dy2_dx)); + y.shape(), + &dy2_dx)); UnaggregatedDy2DxSFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device y.flat().data(), dy.flat().data(), w.flat().data(), - xbar.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), dy2_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; -template -class UnaggregatedDyDxOp : public OpKernel { - public: - explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDyDxOp : public OpKernel +{ +public: + explicit UnaggregatedDyDxOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor int context_input_index = 0; - const Tensor& z = context->input(context_input_index++); - const Tensor& w = context->input(context_input_index++); - const Tensor& dy_dx = context->input(context_input_index++); - const Tensor& ybar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &z = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &dy_dx = context->input(context_input_index++); + const Tensor &ybar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, 
(dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); int context_output_index = 0; - Tensor* dz_dx = NULL; + Tensor *dz_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - z.shape(), - &dz_dx)); + z.shape(), + &dz_dx)); UnaggregatedDyDxFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device z.flat().data(), w.flat().data(), dy_dx.flat().data(), ybar.flat().data(), z.shape().dim_size(0), - z.shape().dim_size(1), //N1 - w.shape().dim_size(0), //N0 , N1=2N0 + z.shape().dim_size(1), //N1 + w.shape().dim_size(0), //N0 , N1=2N0 dz_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; -template -class UnaggregatedDy2DxOp : public OpKernel { - public: - explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDy2DxOp : public OpKernel +{ +public: + explicit UnaggregatedDy2DxOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor int context_input_index = 0; - const Tensor& z = context->input(context_input_index++); - const Tensor& w = context->input(context_input_index++); - const Tensor& dy_dx = context->input(context_input_index++); - const Tensor& dy2_dx = context->input(context_input_index++); - const Tensor& ybar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &z = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &dy_dx = context->input(context_input_index++); + const Tensor &dy2_dx = context->input(context_input_index++); + const Tensor &ybar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + 
OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); int context_output_index = 0; - Tensor* dz2_dx = NULL; + Tensor *dz2_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - z.shape(), - &dz2_dx)); + z.shape(), + &dz2_dx)); UnaggregatedDy2DxFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device z.flat().data(), w.flat().data(), dy_dx.flat().data(), @@ -359,27 +392,26 @@ class UnaggregatedDy2DxOp : public OpKernel { z.shape().dim_size(1), w.shape().dim_size(0), dz2_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; // Register the CPU kernels. -#define REGISTER_CPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDyDxSOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDyDxOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDy2DxSOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDy2DxOp); +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxSOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxSOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxOp); REGISTER_CPU(float); REGISTER_CPU(double); // Not required in the current situation @@ -391,7 +423,7 @@ REGISTER_CPU(double); // UnaggregatedDyDxSOp); \ // REGISTER_KERNEL_BUILDER( \ // Name("UnaggregatedDyDx").Device(DEVICE_GPU).TypeConstraint("T"), \ -// UnaggregatedDyDxOp); +// UnaggregatedDyDxOp); // REGISTER_GPU(float); // REGISTER_GPU(double); // #endif // GOOGLE_CUDA From de03176b5a70772d5edaac1c4f2b74faaab4ed0a Mon Sep 17 00:00:00 2001 From: HLA Date: Wed, 25 Aug 2021 22:27:34 +0800 Subject: [PATCH 4/6] commit-message :Format revert --- source/op/unaggregated_grad.cc | 406 +++++++++++++++------------------ 1 file changed, 187 insertions(+), 219 deletions(-) diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 9810e1a36a..0a56f5ab90 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -6,55 +6,56 @@ #define GGELU 0.044715 REGISTER_OP("UnaggregatedDyDxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("w: T") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy_dx: T"); REGISTER_OP("UnaggregatedDyDx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz_dx: T"); REGISTER_OP("UnaggregatedDy2DxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("dy: T") - .Input("w: T") + 
.Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("dy: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy2_dx: T"); REGISTER_OP("UnaggregatedDy2Dx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") - .Input("dy2_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") + .Input("dy2_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz2_dx: T"); template -FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. +FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. { switch (functype) { - case 1: - return (1 - y * y); - case 2: - { - const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; - } - default: - return -1; + case 1: + return (1 - y * y); + case 2: + { + const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + } + default: + return -1; } + } template @@ -62,60 +63,52 @@ FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) { switch (functype) { - case 1: - return -2 * y * (1 - y * y); - case 2: - { - const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); - return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; - } - default: - return -1; + case 1: + return -2 * y * (1 - y * y); + case 2: + { + const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + } + default: + return -1; } } + + template -struct UnaggregatedDyDxSFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy_dx, const int functype) - { -#pragma omp parallel for - for (int ii = 0; ii < length; ii++) - { - for (int jj = 0; jj < width; jj++) - { - dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj]; +struct UnaggregatedDyDxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy_dx, const int functype) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj],functype)*w[jj]; } } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! return; } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
template -struct UnaggregatedDyDxFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz_dx, const int functype) - { -//width=2*size -#pragma omp parallel for - for (int kk = 0; kk < length; kk++) - { - for (int ii = 0; ii < width; ii++) - { +struct UnaggregatedDyDxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz_dx, const int functype) { + //width=2*size + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) - { + for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } dz_drou *= accumulator; @@ -125,163 +118,143 @@ struct UnaggregatedDyDxFunctor } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const int length, const int width, const int size, FPTYPE *dz_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + //Currently, Do nothing at all! return; } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template -struct UnaggregatedDy2DxSFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *dy, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy2_dx, const int functype) - { -#pragma omp parallel for - for (int ii = 0; ii < length; ii++) - { - for (int jj = 0; jj < width; jj++) - { - dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj] * w[jj]; +struct UnaggregatedDy2DxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy2_dx, const int functype) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj],y[ii * width + jj],functype)*w[jj]*w[jj]; } } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! return; } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
template -struct UnaggregatedDy2DxFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz2_dx, const int functype) - { -#pragma omp parallel for - for (int kk = 0; kk < length; kk++) - { - for (int ii = 0; ii < width; ii++) - { +struct UnaggregatedDy2DxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz2_dx, const int functype) { + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) - { + for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; } dz_drou *= accumulator; accumulator = 0.0; - for (int jj = 0; jj < size; jj++) - { + for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } - dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii], functype) * accumulator * accumulator; + dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii],functype) * accumulator * accumulator; dz_drou += dy2_dx[kk * size + ii % size]; dz2_dx[kk * width + ii] = dz_drou; } } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dz_dx, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const int length, const int width, const int size, FPTYPE *dz2_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + //Currently, Do nothing at all! 
     return;
   }
-#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 };
 
-template <typename Device, typename FPTYPE>
-class UnaggregatedDyDxSOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDyDxSOp(OpKernelConstruction *context) : OpKernel(context) {}
+template <typename Device, typename FPTYPE>
+class UnaggregatedDyDxSOp : public OpKernel {
+  public:
+    explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       // xbar = xw + b
       int context_input_index = 0;
-        const Tensor &y = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &xbar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& y = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& xbar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
+      OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
 
       // check functype
       int context_output_index = 0;
-        Tensor *dy_dx = NULL;
+      Tensor* dy_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         y.shape(),
-                                                         &dy_dx));
+          y.shape(),
+          &dy_dx));
 
       UnaggregatedDyDxSFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         y.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
         xbar.flat<FPTYPE>().data(),
         y.shape().dim_size(0),
         y.shape().dim_size(1),
         dy_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
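Note: the "S" ops handle the first embedding layer, whose input is a single scalar per sample, which is why dy_dx is allocated with y.shape() above. Assuming the S-variant applies the same per-output pattern visible in UnaggregatedDy2DxSFunctor (grad_grad(...) * w[jj] * w[jj], hence presumably grad(...) * w[jj] for the first derivative), it is easy to cross-check against a central finite difference. A hedged sketch, reusing the grad helper from the note above; names and shapes are illustrative:

    def unaggregated_dy_dx_s_ref(y, w, xbar, functype):
        # y, xbar: [nsample, width]; w: [1, width]; one dy/dx entry per output
        return grad(xbar, y, functype) * w[0]

    rng = np.random.default_rng(0)
    x = rng.standard_normal((4, 1))
    w = rng.standard_normal((1, 8))
    b = rng.standard_normal(8)
    xbar = x @ w + b
    y = np.tanh(xbar)

    eps = 1e-6
    numeric = (np.tanh((x + eps) @ w + b) - np.tanh((x - eps) @ w + b)) / (2 * eps)
    assert np.allclose(unaggregated_dy_dx_s_ref(y, w, xbar, 1), numeric, atol=1e-6)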
 template <typename Device, typename FPTYPE>
-class UnaggregatedDy2DxSOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDy2DxSOp(OpKernelConstruction *context) : OpKernel(context) {}
+class UnaggregatedDy2DxSOp : public OpKernel {
+  public:
+    explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       int context_input_index = 0;
-        const Tensor &y = context->input(context_input_index++);
-        const Tensor &dy = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &xbar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& y = context->input(context_input_index++);
+      const Tensor& dy = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& xbar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-
+      OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+
       int context_output_index = 0;
-        Tensor *dy2_dx = NULL;
+      Tensor* dy2_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         y.shape(),
-                                                         &dy2_dx));
+          y.shape(),
+          &dy2_dx));
 
       UnaggregatedDy2DxSFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         y.flat<FPTYPE>().data(),
         dy.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
@@ -289,100 +262,94 @@ class UnaggregatedDy2DxSOp : public OpKernel
         y.shape().dim_size(0),
         y.shape().dim_size(1),
         dy2_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
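Note: for the deeper layers, the structure being differentiated is the skip-connected recursion z = concat(y, y) + act(yW + b), which is where both the width = 2 * size convention and the trailing dy2_dx[kk * size + ii % size] term come from. Differentiating once gives dz/dx = tile(dy) + act'(ybar) * (dy W); differentiating again adds the act''(ybar) * (dy W)^2 term seen in UnaggregatedDy2DxFunctor. A NumPy restatement of that arithmetic follows; it is hedged in that the first-derivative skip term falls inside the lines elided by the @@ header in UnaggregatedDyDxFunctor and is assumed to mirror the visible second-derivative one. The Python caller feeds the activation part of the layer output as z, so grad(ybar, z, ...) sees the plain activation value.

    def unaggregated_dy_dx_ref(z, w, dy_dx, ybar, functype):
        # z, ybar: [nsample, 2 * size]; w: [size, 2 * size]; dy_dx: [nsample, size]
        dz = grad(ybar, z, functype) * (dy_dx @ w)  # chain rule through the dense layer
        dz += np.tile(dy_dx, 2)                     # skip connection: z = concat(y, y) + act(ybar)
        return dz

    def unaggregated_dy2_dx_ref(z, w, dy_dx, dy2_dx, ybar, functype):
        lin1 = dy_dx @ w                            # sum_j w[j, i] * dy_dx[k, j]
        lin2 = dy2_dx @ w
        dz2 = grad(ybar, z, functype) * lin2 + grad_grad(ybar, z, functype) * lin1 * lin1
        dz2 += np.tile(dy2_dx, 2)                   # second derivative of the skip term
        return dz2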
 template <typename Device, typename FPTYPE>
-class UnaggregatedDyDxOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDyDxOp(OpKernelConstruction *context) : OpKernel(context) {}
+class UnaggregatedDyDxOp : public OpKernel {
+  public:
+    explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       int context_input_index = 0;
-        const Tensor &z = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &dy_dx = context->input(context_input_index++);
-        const Tensor &ybar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& z = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& dy_dx = context->input(context_input_index++);
+      const Tensor& ybar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
+      OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
 
       int context_output_index = 0;
-        Tensor *dz_dx = NULL;
+      Tensor* dz_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         z.shape(),
-                                                         &dz_dx));
+          z.shape(),
+          &dz_dx));
 
       UnaggregatedDyDxFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         z.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
         dy_dx.flat<FPTYPE>().data(),
         ybar.flat<FPTYPE>().data(),
         z.shape().dim_size(0),
-            z.shape().dim_size(1), //N1
-            w.shape().dim_size(0), //N0 , N1=2N0
+        z.shape().dim_size(1), // N1
+        w.shape().dim_size(0), // N0, with N1 = 2 * N0
         dz_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
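Note: the // N1 and // N0 comments in the call above encode the one shape invariant the recursive kernels rely on: N1 = 2 * N0, i.e. the layer output is twice as wide as the weight matrix's input dimension, again because of concat(y, y). A tiny illustrative guard (a hypothetical helper, not part of the patch):

    def check_recursion_shapes(z, w, dy_dx):
        nsample, n1 = z.shape   # N1 == z.shape().dim_size(1) in the kernel
        n0, n1_w = w.shape      # N0 == w.shape().dim_size(0)
        assert n1 == 2 * n0 == n1_w, "expected z = concat(y, y) + act(...), so N1 = 2 * N0"
        assert dy_dx.shape == (nsample, n0)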
 template <typename Device, typename FPTYPE>
-class UnaggregatedDy2DxOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDy2DxOp(OpKernelConstruction *context) : OpKernel(context) {}
+class UnaggregatedDy2DxOp : public OpKernel {
+  public:
+    explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       int context_input_index = 0;
-        const Tensor &z = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &dy_dx = context->input(context_input_index++);
-        const Tensor &dy2_dx = context->input(context_input_index++);
-        const Tensor &ybar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& z = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& dy_dx = context->input(context_input_index++);
+      const Tensor& dy2_dx = context->input(context_input_index++);
+      const Tensor& ybar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
+      OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
 
       int context_output_index = 0;
-        Tensor *dz2_dx = NULL;
+      Tensor* dz2_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         z.shape(),
-                                                         &dz2_dx));
+          z.shape(),
+          &dz2_dx));
 
       UnaggregatedDy2DxFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         z.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
         dy_dx.flat<FPTYPE>().data(),
@@ -392,26 +359,27 @@ class UnaggregatedDy2DxOp : public OpKernel
         z.shape().dim_size(1),
         w.shape().dim_size(0),
         dz2_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
 
 // Register the CPU kernels.
-#define REGISTER_CPU(T)                                                          \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint<T>("T"),     \
-        UnaggregatedDyDxSOp<CPUDevice, T>);                                      \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint<T>("T"),      \
-        UnaggregatedDyDxOp<CPUDevice, T>);                                       \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint<T>("T"),    \
-        UnaggregatedDy2DxSOp<CPUDevice, T>);                                     \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint<T>("T"),     \
-        UnaggregatedDy2DxOp<CPUDevice, T>);
+#define REGISTER_CPU(T) \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDyDxSOp<CPUDevice, T>); \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDyDxOp<CPUDevice, T>); \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDy2DxSOp<CPUDevice, T>); \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDy2DxOp<CPUDevice, T>);
 REGISTER_CPU(float);
 REGISTER_CPU(double);
 
 // Not required in the current situation
@@ -423,7 +391,7 @@ REGISTER_CPU(double);
 //     UnaggregatedDyDxSOp<GPUDevice, T>); \
 // REGISTER_KERNEL_BUILDER( \
 //     Name("UnaggregatedDyDx").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-//     UnaggregatedDyDxOp<GPUDevice, T>);
+//     UnaggregatedDyDxOp<GPUDevice, T>);
 // REGISTER_GPU(float);
 // REGISTER_GPU(double);
 // #endif // GOOGLE_CUDA
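Note: one design choice worth flagging between these patches is that functype reaches the kernels as a runtime int32 input (a tf.constant on the Python side) rather than as an op Attr, so one registered kernel serves both activations and the registration macros stay unchanged; an Attr would arguably be more idiomatic since the value is fixed at graph-construction time. Under the tanh = 1, gelu = 2 convention used on the Python side, the four ops are driven roughly like this (a sketch, assuming a compiled op_module is importable; variable names are illustrative):

    functype = tf.constant(2)  # 1 = tanh, 2 = gelu
    dy  = op_module.unaggregated_dy_dx_s(yy, matrix, xbar, functype)
    dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, matrix, xbar, functype)
    dz  = op_module.unaggregated_dy_dx(zz - tt, matrix, dy, ybar, functype)
    dz2 = op_module.unaggregated_dy2_dx(zz - tt, matrix, dy, dy2, ybar, functype)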
From 7a68bd716130a3bbaa72ecf6b560508626e8fc52 Mon Sep 17 00:00:00 2001
From: HLA
Date: Thu, 26 Aug 2021 09:42:29 +0800
Subject: [PATCH 5/6] commit-message: format change

---
 source/op/unaggregated_grad.cc | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc
index 0a56f5ab90..c132a09b3a 100644
--- a/source/op/unaggregated_grad.cc
+++ b/source/op/unaggregated_grad.cc
@@ -205,8 +205,8 @@ class UnaggregatedDyDxSOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dy_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     y.shape(),
-                                                     &dy_dx));
+        y.shape(),
+        &dy_dx));
 
     UnaggregatedDyDxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -219,7 +219,6 @@ class UnaggregatedDyDxSOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };
 
@@ -248,10 +247,10 @@ class UnaggregatedDy2DxSOp : public OpKernel {
     OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
 
     int context_output_index = 0;
-    Tensor* dy2_dx = NULL;
+    Tensor* dy2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     y.shape(),
-                                                     &dy2_dx));
+        y.shape(),
+        &dy2_dx));
 
     UnaggregatedDy2DxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -265,7 +264,6 @@ class UnaggregatedDy2DxSOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };
 
@@ -296,8 +294,8 @@ class UnaggregatedDyDxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     z.shape(),
-                                                     &dz_dx));
+        z.shape(),
+        &dz_dx));
 
     UnaggregatedDyDxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -312,7 +310,6 @@ class UnaggregatedDyDxOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };
 
@@ -345,8 +342,8 @@ class UnaggregatedDy2DxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     z.shape(),
-                                                     &dz2_dx));
+        z.shape(),
+        &dz2_dx));
 
     UnaggregatedDy2DxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -362,7 +359,6 @@ class UnaggregatedDy2DxOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };

From 1c71fee03d2acb1dec458935768c904aaaa73e7b Mon Sep 17 00:00:00 2001
From: HLA
Date: Thu, 26 Aug 2021 15:14:38 +0800
Subject: [PATCH 6/6] commit-message: Format change

---
 source/op/unaggregated_grad.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc
index c132a09b3a..89c14a84fb 100644
--- a/source/op/unaggregated_grad.cc
+++ b/source/op/unaggregated_grad.cc
@@ -205,8 +205,8 @@ class UnaggregatedDyDxSOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dy_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        y.shape(),
-        &dy_dx));
+                            y.shape(),
+                            &dy_dx));
 
     UnaggregatedDyDxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -249,8 +249,8 @@ class UnaggregatedDy2DxSOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dy2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        y.shape(),
-        &dy2_dx));
+                            y.shape(),
+                            &dy2_dx));
 
     UnaggregatedDy2DxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -294,8 +294,8 @@ class UnaggregatedDyDxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        z.shape(),
-        &dz_dx));
+                            z.shape(),
+                            &dz_dx));
 
     UnaggregatedDyDxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -342,8 +342,8 @@ class UnaggregatedDy2DxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        z.shape(),
-        &dz2_dx));
+                            z.shape(),
+                            &dz2_dx));
 
     UnaggregatedDy2DxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device