From 4877203994cef2be433c8e36a68ade3319b1d560 Mon Sep 17 00:00:00 2001 From: HLA Date: Tue, 24 Aug 2021 00:00:58 +0800 Subject: [PATCH 1/6] commit-message: decouple the activation function's type from the model compression process in SE_A; both tanh and gelu are now available. --- deepmd/descriptor/se_a.py | 4 +- deepmd/utils/tabulate.py | 45 ++++++++---- source/op/unaggregated_grad.cc | 122 ++++++++++++++++++++++++++------- source/tests/test_tabulate.py | 49 +++++++++++++ 4 files changed, 182 insertions(+), 38 deletions(-) create mode 100644 source/tests/test_tabulate.py diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index fab872e721..fbc9a77b56 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -126,6 +126,7 @@ def __init__ (self, self.uniform_seed = uniform_seed self.seed_shift = embedding_net_rand_seed_shift(self.filter_neuron) self.trainable = trainable + self.compress_activation_fn = get_activation_func(activation_function) self.filter_activation_fn = get_activation_func(activation_function) self.filter_precision = get_precision(precision) self.filter_np_precision = get_np_precision(precision) @@ -316,7 +317,8 @@ def enable_compression(self, The overflow check frequency """ self.compress = True - self.table = DPTabulate(model_file, self.type_one_side, self.exclude_types) + self.table = DPTabulate( + model_file, self.type_one_side, self.exclude_types, self.compress_activation_fn) self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] self.lower, self.upper \ = self.table.build(min_nbor_dist, diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 719697dc87..44b4380e70 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -34,7 +34,8 @@ class DPTabulate(): def __init__(self, model_file : str, type_one_side : bool = False, - exclude_types : List[List[int]] = []) -> None: + exclude_types : List[List[int]] = [], + activation_fn=tf.nn.tanh) -> None: """ Constructor """ @@ -44,6 +45,15 @@ def __init__(self, self.exclude_types = exclude_types if self.type_one_side and len(self.exclude_types) != 0: raise RuntimeError('"type_one_side" is not compatible with "exclude_types"') + + # functype + if activation_fn.__name__ == 'tf.nn.tanh' or activation_fn.__name__ == 'tanh': + self.functype = 1 + elif activation_fn.__name__ == 'gelu': + self.functype = 2 + else: + raise RuntimeError("Unknown activation function type!") + self.activation_fn = activation_fn self.graph, self.graph_def = load_graph_def(self.model_file) self.sess = tf.Session(graph = self.graph) @@ -199,26 +209,37 @@ def _make_data(self, xx, idx): xx = tf.reshape(xx, [xx.size, -1]) for layer in range(self.layer_size): if layer == 0: - yy = self._layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) - dy = op_module.unaggregated_dy_dx_s(yy, self.matrix["layer_" + str(layer + 1)][idx]) - dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, self.matrix["layer_" + str(layer + 1)][idx]) + xbar = tf.matmul( + xx, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] + yy = self._layer_0( + xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dy = op_module.unaggregated_dy_dx_s( + yy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + dy2 = op_module.unaggregated_dy2_dx_s( + yy, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) else: - tt, yy = self._layer_1(yy,
self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) - dz = op_module.unaggregated_dy_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dy) - dy2 = op_module.unaggregated_dy2_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dz, dy, dy2) + ybar = tf.matmul( + yy, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] + tt, zz = self._layer_1( + yy, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dz = op_module.unaggregated_dy_dx( + zz - tt, self.matrix["layer_" + str(layer + 1)][idx], dy, ybar, tf.constant(self.functype)) + dy2 = op_module.unaggregated_dy2_dx( + zz - tt, self.matrix["layer_" + str(layer + 1)][idx], dy, dy2, ybar, tf.constant(self.functype)) dy = dz - - vv = yy.eval() + yy = zz + + vv = zz.eval() dd = dy.eval() d2 = dy2.eval() return vv, dd, d2 def _layer_0(self, x, w, b): - return tf.nn.tanh(tf.matmul(x, w) + b) + return self.activation_fn(tf.matmul(x, w) + b) def _layer_1(self, x, w, b): - t = tf.concat([x, x], axis = 1) - return t, tf.nn.tanh(tf.matmul(x, w) + b) + t + t = tf.concat([x, x], axis=1) + return t, self.activation_fn(tf.matmul(x, w) + b) + t def _save_data(self): for ii in range(self.ntypes * self.ntypes): diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 343a339a92..f43dcf17a1 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -2,42 +2,90 @@ #include "ComputeDescriptor.h" #include "neighbor_list.h" + +#define SQRT2_PI 0.7978845608028654 +#define GGELU 0.044715 + REGISTER_OP("UnaggregatedDyDxS") .Attr("T: {float, double} = DT_DOUBLE") .Input("y: T") - .Input("w: T") + .Input("w: T") + .Input("xbar: T") + .Input("functype: int32") .Output("dy_dx: T"); REGISTER_OP("UnaggregatedDyDx") .Attr("T: {float, double} = DT_DOUBLE") .Input("z: T") .Input("w: T") - .Input("dy_dx: T") + .Input("dy_dx: T") + .Input("ybar: T") + .Input("functype: int32") .Output("dz_dx: T"); REGISTER_OP("UnaggregatedDy2DxS") .Attr("T: {float, double} = DT_DOUBLE") .Input("y: T") .Input("dy: T") - .Input("w: T") + .Input("w: T") + .Input("xbar: T") + .Input("functype: int32") .Output("dy2_dx: T"); REGISTER_OP("UnaggregatedDy2Dx") .Attr("T: {float, double} = DT_DOUBLE") .Input("z: T") - .Input("w: T") - .Input("dz_dx: T") + .Input("w: T") .Input("dy_dx: T") .Input("dy2_dx: T") + .Input("ybar: T") + .Input("functype: int32") .Output("dz2_dx: T"); +template +FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. 
+{ + switch (functype) + { + case 1: + return (1 - y * y); + case 2: + { + const FPTYPE var = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + } + default: + return -1; + } + +} + +template +FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) +{ + switch (functype) + { + case 1: + return -2 * y * (1 - y * y); + case 2: + { + const FPTYPE var1 = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT2_PI * xbar * xbar * (1 - var1 * var1) - SQRT2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + } + default: + return -1; + } +} + + template struct UnaggregatedDyDxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy_dx, const int functype) { #pragma omp parallel for for (int ii = 0; ii < length; ii++) { for (int jj = 0; jj < width; jj++) { - dy_dx[ii * width + jj] = (1 - y[ii * width + jj] * y[ii * width + jj]) * w[jj]; + dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj],functype)*w[jj]; } } } @@ -53,12 +101,13 @@ struct UnaggregatedDyDxSFunctor { // calculate the gradient for all variables! template struct UnaggregatedDyDxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz_dx, const int functype) { + //width=2*size #pragma omp parallel for for (int kk = 0; kk < length; kk++) { for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; @@ -80,11 +129,11 @@ struct UnaggregatedDyDxFunctor { template struct UnaggregatedDy2DxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const int length, const int width, FPTYPE * dy2_dx) { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy2_dx, const int functype) { #pragma omp parallel for for (int ii = 0; ii < length; ii++) { for (int jj = 0; jj < width; jj++) { - dy2_dx[ii * width + jj] = -2 * w[jj] * y[ii * width + jj] * dy[ii * width + jj]; + dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj],y[ii * width + jj],functype)*w[jj]*w[jj]; } } } @@ -100,12 +149,12 @@ struct UnaggregatedDy2DxSFunctor { // calculate the gradient for all variables! 
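// Derivation sketch for the functype == 2 branches of grad() and grad_grad() above,
// assuming deepmd's gelu is the usual tanh approximation
//   gelu(x) = 0.5 * x * (1 + tanh(SQRT2_PI * (x + GGELU * x^3))), with SQRT2_PI = sqrt(2/pi).
// Write t = tanh(SQRT2_PI * (x + GGELU * x^3)) and t' = SQRT2_PI * (1 - t * t) * (1 + 3 * GGELU * x * x).
// The product and chain rules then give
//   gelu'(x)  = 0.5 * (1 + t) + 0.5 * x * t'
//   gelu''(x) = t' - SQRT2_PI * x * t' * (3 * GGELU * x * x + 1) * t + 3 * GGELU * SQRT2_PI * x * x * (1 - t * t),
// which matches the returned expressions term by term (var == t in grad(); var1 == t, var2 == t' in grad_grad()).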
template struct UnaggregatedDy2DxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz2_dx, const int functype) { #pragma omp parallel for for (int kk = 0; kk < length; kk++) { for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; @@ -115,7 +164,7 @@ struct UnaggregatedDy2DxFunctor { for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } - dz_drou -= 2 * z[kk * width + ii] * (dz_dx[kk * width + ii] - dy_dx[kk * size + ii % size]) * accumulator; + dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii],functype) * accumulator * accumulator; dz_drou += dy2_dx[kk * size + ii % size]; dz2_dx[kk * width + ii] = dz_drou; } @@ -141,13 +190,18 @@ class UnaggregatedDyDxSOp : public OpKernel { void _Compute(OpKernelContext* context) { // Grab the input tensor + //xbar=xw+b int context_input_index = 0; const Tensor& y = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); + const Tensor& xbar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + //check functype int context_output_index = 0; Tensor* dy_dx = NULL; @@ -159,11 +213,14 @@ class UnaggregatedDyDxSOp : public OpKernel { context->eigen_device(), // define actually graph execution device y.flat().data(), w.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), - dy_dx->flat().data() + dy_dx->flat().data(), + functype.flat()(0) ); } + private: }; @@ -182,11 +239,14 @@ class UnaggregatedDy2DxSOp : public OpKernel { const Tensor& y = context->input(context_input_index++); const Tensor& dy = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); + const Tensor& xbar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); int context_output_index = 0; Tensor* dy2_dx = NULL; @@ -199,11 +259,14 @@ class 
UnaggregatedDy2DxSOp : public OpKernel { y.flat().data(), dy.flat().data(), w.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), - dy2_dx->flat().data() + dy2_dx->flat().data(), + functype.flat()(0) ); } + private: }; @@ -222,11 +285,14 @@ class UnaggregatedDyDxOp : public OpKernel { const Tensor& z = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); const Tensor& dy_dx = context->input(context_input_index++); + const Tensor& ybar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); int context_output_index = 0; Tensor* dz_dx = NULL; @@ -239,12 +305,15 @@ class UnaggregatedDyDxOp : public OpKernel { z.flat().data(), w.flat().data(), dy_dx.flat().data(), + ybar.flat().data(), z.shape().dim_size(0), - z.shape().dim_size(1), - w.shape().dim_size(0), - dz_dx->flat().data() + z.shape().dim_size(1), //N1 + w.shape().dim_size(0), //N0 , N1=2N0 + dz_dx->flat().data(), + functype.flat()(0) ); } + private: }; @@ -262,16 +331,17 @@ class UnaggregatedDy2DxOp : public OpKernel { int context_input_index = 0; const Tensor& z = context->input(context_input_index++); const Tensor& w = context->input(context_input_index++); - const Tensor& dz_dx = context->input(context_input_index++); const Tensor& dy_dx = context->input(context_input_index++); const Tensor& dy2_dx = context->input(context_input_index++); + const Tensor& ybar = context->input(context_input_index++); + const Tensor& functype = context->input(context_input_index++); // set size of the sample OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dz_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); int context_output_index = 0; Tensor* dz2_dx = NULL; @@ -283,15 +353,17 @@ class UnaggregatedDy2DxOp : public OpKernel { context->eigen_device(), // define actually graph execution device z.flat().data(), w.flat().data(), - dz_dx.flat().data(), dy_dx.flat().data(), dy2_dx.flat().data(), + ybar.flat().data(), z.shape().dim_size(0), z.shape().dim_size(1), w.shape().dim_size(0), - dz2_dx->flat().data() + dz2_dx->flat().data(), + functype.flat()(0) ); } + private: }; diff --git a/source/tests/test_tabulate.py b/source/tests/test_tabulate.py new file mode 100644 index 0000000000..f082ac3dcc --- /dev/null +++ b/source/tests/test_tabulate.py @@ -0,0 +1,49 @@ +import unittest +import numpy as np +from deepmd.utils.tabulate import DPTabulate +from deepmd.env import op_module 
+from deepmd.env import tf +from deepmd.common import gelu + +tf.enable_eager_execution( + config=None, + device_policy=None, + execution_mode=None +) +# Now just test some OPs utilized by DPTabulate sourced in /opt/deepmd-kit/source/op/unaggregated_grad.cc + +class TestDPTabulate(unittest.TestCase): + def test_op_tanh(self): + w=tf.constant([[0.1,0.2,0.3,0.4],[0.5,0.6,0.7,0.8],[0.9,1,1.1,1.2]],dtype='double') + x=tf.constant([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9],[1.0,1.1,1.2]],dtype='double') + b=tf.constant([[0.1],[0.2],[0.3],[0.4]],dtype='double') + xbar = tf.matmul(x, w) + b + y=tf.nn.tanh(xbar) + dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(1)) + answer = np.array([[8.008666403121351973e-02, 1.513925729426658651e-01, 2.134733287761668430e-01, 2.661983049806041501e-01], + [4.010658815015743367e-02, 6.306476628799793926e-02, 7.332167904608145881e-02, 7.494218676568849269e-02], + [1.561705624394135218e-02, 1.994112926507514427e-02, 1.887519955881525671e-02, 1.576442161040989692e-02], + [5.492686739421748753e-03, 5.754985286040992763e-03, 4.493113544969218158e-03, 3.107638130764600777e-03]]) + + self.assertTrue((dy.numpy()==answer).all()) + + def test_op_gelu(self): + w = tf.constant([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [ + 0.9, 1, 1.1, 1.2]], dtype='double') + x = tf.constant([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [ + 0.7, 0.8, 0.9], [1.0, 1.1, 1.2]], dtype='double') + b = tf.constant([[0.1], [0.2], [0.3], [0.4]], dtype='double') + xbar = tf.matmul(x, w) + b + y = gelu(xbar) + dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(2)) + answer = np.array([[8.549286163555620821e-02, 1.782905778685600906e-01, 2.776474599997448833e-01, 3.827650237273348965e-01], + [1.089906023807040714e-01, 2.230820937721638697e-01, 3.381867859682909927e-01, 4.513008399758057232e-01], + [1.124254240556722684e-01, 2.209918074710395253e-01, 3.238894323148118759e-01, 4.220357318198978414e-01], + [1.072173273655498138e-01, 2.082159073100979807e-01, 3.059816075270163083e-01, 4.032981557798429595e-01]]) + + self.assertTrue((dy.numpy() == answer).all()) + + + +if __name__ == '__main__': + unittest.main() From 95c1ea550dfb10583cc89dc8078cb17532664374 Mon Sep 17 00:00:00 2001 From: HLA Date: Wed, 25 Aug 2021 11:58:39 +0800 Subject: [PATCH 2/6] commit-message: modified code and passed unittest --- deepmd/utils/tabulate.py | 7 +++++-- source/op/unaggregated_grad.cc | 13 ++++++------- source/tests/test_tabulate.py | 21 ++++++++++++--------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 44b4380e70..93a75c6f24 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -5,6 +5,7 @@ from typing import Tuple, List from deepmd.env import tf from deepmd.env import op_module +from deepmd.common import ACTIVATION_FN_DICT from deepmd.utils.sess import run_sess from deepmd.utils.graph import get_tensor_by_name_from_graph, load_graph_def from deepmd.utils.graph import get_embedding_net_nodes_from_graph_def @@ -30,6 +31,8 @@ class DPTabulate(): exclude_types : List[List[int]] The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1. + activation_function + The activation function in the embedding net. Supported options are {"tanh","gelu"} in common.ACTIVATION_FN_DICT. 
""" def __init__(self, model_file : str, @@ -47,9 +50,9 @@ def __init__(self, raise RunTimeError('"type_one_side" is not compatible with "exclude_types"') # functype - if activation_fn.__name__ == 'tf.nn.tanh' or activation_fn.__name__ == 'tanh': + if activation_fn == ACTIVATION_FN_DICT["tanh"]: self.functype = 1 - elif activation_fn.__name__ == 'gelu': + elif activation_fn == ACTIVATION_FN_DICT["gelu"]: self.functype = 2 else: raise RunTimeError("Unknown actication function type!") diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index f43dcf17a1..542268d3e7 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -1,9 +1,8 @@ #include "custom_op.h" #include "ComputeDescriptor.h" #include "neighbor_list.h" +#include "device.h" - -#define SQRT2_PI 0.7978845608028654 #define GGELU 0.044715 REGISTER_OP("UnaggregatedDyDxS") @@ -50,8 +49,8 @@ FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=t return (1 - y * y); case 2: { - const FPTYPE var = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); - return 0.5 * SQRT2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; } default: return -1; @@ -68,9 +67,9 @@ FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) return -2 * y * (1 - y * y); case 2: { - const FPTYPE var1 = tanh(SQRT2_PI * (xbar + GGELU * xbar * xbar * xbar)); - const FPTYPE var2 = SQRT2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); - return 3 * GGELU * SQRT2_PI * xbar * xbar * (1 - var1 * var1) - SQRT2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; } default: return -1; diff --git a/source/tests/test_tabulate.py b/source/tests/test_tabulate.py index f082ac3dcc..ce26c4e3e6 100644 --- a/source/tests/test_tabulate.py +++ b/source/tests/test_tabulate.py @@ -5,11 +5,6 @@ from deepmd.env import tf from deepmd.common import gelu -tf.enable_eager_execution( - config=None, - device_policy=None, - execution_mode=None -) # Now just test some OPs utilized by DPTabulate sourced in /opt/deepmd-kit/source/op/unaggregated_grad.cc class TestDPTabulate(unittest.TestCase): @@ -20,13 +15,17 @@ def test_op_tanh(self): xbar = tf.matmul(x, w) + b y=tf.nn.tanh(xbar) dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(1)) + dy_array = tf.Session().run(dy) answer = np.array([[8.008666403121351973e-02, 1.513925729426658651e-01, 2.134733287761668430e-01, 2.661983049806041501e-01], - [4.010658815015743367e-02, 6.306476628799793926e-02, 7.332167904608145881e-02, 7.494218676568849269e-02], + [4.010658815015744061e-02, 6.306476628799793926e-02, 7.332167904608145881e-02, 7.494218676568849269e-02], [1.561705624394135218e-02, 1.994112926507514427e-02, 1.887519955881525671e-02, 1.576442161040989692e-02], [5.492686739421748753e-03, 5.754985286040992763e-03, 4.493113544969218158e-03, 3.107638130764600777e-03]]) - self.assertTrue((dy.numpy()==answer).all()) - + places = 18 + for ii in range(dy_array.shape[0]): + for jj in range(dy_array.shape[1]): + self.assertAlmostEqual(dy_array[ii,jj], 
answer[ii,jj], places=places) + def test_op_gelu(self): w = tf.constant([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [ 0.9, 1, 1.1, 1.2]], dtype='double') @@ -36,12 +35,16 @@ def test_op_gelu(self): xbar = tf.matmul(x, w) + b y = gelu(xbar) dy = op_module.unaggregated_dy_dx_s(y, w, xbar, tf.constant(2)) + dy_array = tf.Session().run(dy) answer = np.array([[8.549286163555620821e-02, 1.782905778685600906e-01, 2.776474599997448833e-01, 3.827650237273348965e-01], [1.089906023807040714e-01, 2.230820937721638697e-01, 3.381867859682909927e-01, 4.513008399758057232e-01], [1.124254240556722684e-01, 2.209918074710395253e-01, 3.238894323148118759e-01, 4.220357318198978414e-01], [1.072173273655498138e-01, 2.082159073100979807e-01, 3.059816075270163083e-01, 4.032981557798429595e-01]]) - self.assertTrue((dy.numpy() == answer).all()) + places = 18 + for ii in range(dy_array.shape[0]): + for jj in range(dy_array.shape[1]): + self.assertAlmostEqual(dy_array[ii, jj], answer[ii, jj], places=places) From 2d7e537219c4ca686b18fc40dc59da6008cff6fa Mon Sep 17 00:00:00 2001 From: HLA Date: Wed, 25 Aug 2021 14:20:49 +0800 Subject: [PATCH 3/6] commit-message: Format Document --- deepmd/utils/tabulate.py | 3 +- source/op/unaggregated_grad.cc | 418 ++++++++++++++++++--------------- 2 files changed, 227 insertions(+), 194 deletions(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 93a75c6f24..f1057b38f2 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -2,6 +2,7 @@ import math import logging import numpy as np +from typing import Callable from typing import Tuple, List from deepmd.env import tf from deepmd.env import op_module @@ -38,7 +39,7 @@ def __init__(self, model_file : str, type_one_side : bool = False, exclude_types : List[List[int]] = [], - activation_fn=tf.nn.tanh) -> None: + activation_fn : Callable[[tf.Tensor], tf.Tensor] = tf.nn.tanh) -> None: """ Constructor """ diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 542268d3e7..9810e1a36a 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -6,56 +6,55 @@ #define GGELU 0.044715 REGISTER_OP("UnaggregatedDyDxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("w: T") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy_dx: T"); REGISTER_OP("UnaggregatedDyDx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz_dx: T"); REGISTER_OP("UnaggregatedDy2DxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("dy: T") - .Input("w: T") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("dy: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy2_dx: T"); REGISTER_OP("UnaggregatedDy2Dx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") - .Input("dy2_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") + .Input("dy2_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz2_dx: T"); template -FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. +FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. 
{ switch (functype) { - case 1: - return (1 - y * y); - case 2: - { - const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; - } - default: - return -1; - } - + case 1: + return (1 - y * y); + case 2: + { + const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + } + default: + return -1; + } } template @@ -63,52 +62,60 @@ FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) { switch (functype) { - case 1: - return -2 * y * (1 - y * y); - case 2: - { - const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); - return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; - } - default: - return -1; - } + case 1: + return -2 * y * (1 - y * y); + case 2: + { + const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + } + default: + return -1; + } } - - template -struct UnaggregatedDyDxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy_dx, const int functype) { - #pragma omp parallel for - for (int ii = 0; ii < length; ii++) { - for (int jj = 0; jj < width; jj++) { - dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj],functype)*w[jj]; +struct UnaggregatedDyDxSFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy_dx, const int functype) + { +#pragma omp parallel for + for (int ii = 0; ii < length; ii++) + { + for (int jj = 0; jj < width; jj++) + { + dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj]; } } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) + { + //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
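// Reading sketch for the single-layer (*S) functors above, assuming the first
// embedding-net layer sees one scalar x per row (DPTabulate._make_data reshapes
// xx to shape [N, 1]): with xbar_j = x * w_j + b_j and y_j = act(xbar_j),
//   dy_j/dx   = act'(xbar_j)  * w_j         (UnaggregatedDyDxSFunctor)
//   d2y_j/dx2 = act''(xbar_j) * w_j * w_j   (UnaggregatedDy2DxSFunctor)
// where act' and act'' are evaluated elementwise by grad() and grad_grad().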
template -struct UnaggregatedDyDxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz_dx, const int functype) { - //width=2*size - #pragma omp parallel for - for (int kk = 0; kk < length; kk++) { - for (int ii = 0; ii < width; ii++) { +struct UnaggregatedDyDxFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz_dx, const int functype) + { +//width=2*size +#pragma omp parallel for + for (int kk = 0; kk < length; kk++) + { + for (int ii = 0; ii < width; ii++) + { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); + FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) { + for (int jj = 0; jj < size; jj++) + { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } dz_drou *= accumulator; @@ -118,238 +125,264 @@ struct UnaggregatedDyDxFunctor { } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const int length, const int width, const int size, FPTYPE *dz_dx) + { + //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template -struct UnaggregatedDy2DxSFunctor { - void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy2_dx, const int functype) { - #pragma omp parallel for - for (int ii = 0; ii < length; ii++) { - for (int jj = 0; jj < width; jj++) { - dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj],y[ii * width + jj],functype)*w[jj]*w[jj]; +struct UnaggregatedDy2DxSFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *dy, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy2_dx, const int functype) + { +#pragma omp parallel for + for (int ii = 0; ii < length; ii++) + { + for (int jj = 0; jj < width; jj++) + { + dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj] * w[jj]; } } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) + { + //Currently, Do nothing at all! return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
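// Sketch of the recursion implemented by UnaggregatedDyDxFunctor above and
// UnaggregatedDy2DxFunctor below, assuming each later layer computes
// z = act(y * W + b) + [y, y] (the concat skip doubles the width, so width == 2 * size)
// and that the caller feeds the op z with the skip already subtracted
// (zz - tt in DPTabulate._make_data), so ybar = y * W + b pairs with the bare activation.
// For output column ii, with s = sum_j W[j][ii] * dy_dx[j]:
//   dz_ii/dx   = act'(ybar_ii) * s + dy_dx[ii % size]
//   d2z_ii/dx2 = act'(ybar_ii) * sum_j W[j][ii] * dy2_dx[j] + act''(ybar_ii) * s * s + dy2_dx[ii % size]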
template -struct UnaggregatedDy2DxFunctor { - void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz2_dx, const int functype) { - #pragma omp parallel for - for (int kk = 0; kk < length; kk++) { - for (int ii = 0; ii < width; ii++) { +struct UnaggregatedDy2DxFunctor +{ + void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz2_dx, const int functype) + { +#pragma omp parallel for + for (int kk = 0; kk < length; kk++) + { + for (int ii = 0; ii < width; ii++) + { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); + FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) { + for (int jj = 0; jj < size; jj++) + { accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; } dz_drou *= accumulator; accumulator = 0.0; - for (int jj = 0; jj < size; jj++) { + for (int jj = 0; jj < size; jj++) + { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } - dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii],functype) * accumulator * accumulator; + dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii], functype) * accumulator * accumulator; dz_drou += dy2_dx[kk * size + ii % size]; dz2_dx[kk * width + ii] = dz_drou; } } } - #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { - //Currently, Do nothing at all! +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dz_dx, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const int length, const int width, const int size, FPTYPE *dz2_dx) + { + //Currently, Do nothing at all! 
return; } - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; -template -class UnaggregatedDyDxSOp : public OpKernel { - public: - explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDyDxSOp : public OpKernel +{ +public: + explicit UnaggregatedDyDxSOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor - //xbar=xw+b + //xbar=xw+b int context_input_index = 0; - const Tensor& y = context->input(context_input_index++); - const Tensor& w = context->input(context_input_index++); - const Tensor& xbar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &y = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &xbar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); - //check functype + OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + //check functype int context_output_index = 0; - Tensor* dy_dx = NULL; + Tensor *dy_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - y.shape(), - &dy_dx)); + y.shape(), + &dy_dx)); UnaggregatedDyDxSFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device y.flat().data(), w.flat().data(), - xbar.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), dy_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; -template -class UnaggregatedDy2DxSOp : public OpKernel { - public: - explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDy2DxSOp : public OpKernel +{ +public: + explicit UnaggregatedDy2DxSOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor int context_input_index = 0; - const Tensor& y = context->input(context_input_index++); - const Tensor& dy = context->input(context_input_index++); - const Tensor& w = 
context->input(context_input_index++); - const Tensor& xbar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &y = context->input(context_input_index++); + const Tensor &dy = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &xbar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - + OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (dy.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + int context_output_index = 0; - Tensor* dy2_dx = NULL; + Tensor *dy2_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - y.shape(), - &dy2_dx)); + y.shape(), + &dy2_dx)); UnaggregatedDy2DxSFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device y.flat().data(), dy.flat().data(), w.flat().data(), - xbar.flat().data(), + xbar.flat().data(), y.shape().dim_size(0), y.shape().dim_size(1), dy2_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; -template -class UnaggregatedDyDxOp : public OpKernel { - public: - explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDyDxOp : public OpKernel +{ +public: + explicit UnaggregatedDyDxOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor int context_input_index = 0; - const Tensor& z = context->input(context_input_index++); - const Tensor& w = context->input(context_input_index++); - const Tensor& dy_dx = context->input(context_input_index++); - const Tensor& ybar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &z = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &dy_dx = context->input(context_input_index++); + const Tensor &ybar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, 
(dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); int context_output_index = 0; - Tensor* dz_dx = NULL; + Tensor *dz_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - z.shape(), - &dz_dx)); + z.shape(), + &dz_dx)); UnaggregatedDyDxFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device z.flat().data(), w.flat().data(), dy_dx.flat().data(), ybar.flat().data(), z.shape().dim_size(0), - z.shape().dim_size(1), //N1 - w.shape().dim_size(0), //N0 , N1=2N0 + z.shape().dim_size(1), //N1 + w.shape().dim_size(0), //N0 , N1=2N0 dz_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; -template -class UnaggregatedDy2DxOp : public OpKernel { - public: - explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {} +template +class UnaggregatedDy2DxOp : public OpKernel +{ +public: + explicit UnaggregatedDy2DxOp(OpKernelConstruction *context) : OpKernel(context) {} - void Compute(OpKernelContext* context) override { - deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + void Compute(OpKernelContext *context) override + { + deepmd::safe_compute(context, [this](OpKernelContext *context) + { this->_Compute(context); }); } - void _Compute(OpKernelContext* context) { + void _Compute(OpKernelContext *context) + { // Grab the input tensor int context_input_index = 0; - const Tensor& z = context->input(context_input_index++); - const Tensor& w = context->input(context_input_index++); - const Tensor& dy_dx = context->input(context_input_index++); - const Tensor& dy2_dx = context->input(context_input_index++); - const Tensor& ybar = context->input(context_input_index++); - const Tensor& functype = context->input(context_input_index++); + const Tensor &z = context->input(context_input_index++); + const Tensor &w = context->input(context_input_index++); + const Tensor &dy_dx = context->input(context_input_index++); + const Tensor &dy2_dx = context->input(context_input_index++); + const Tensor &ybar = context->input(context_input_index++); + const Tensor &functype = context->input(context_input_index++); // set size of the sample - OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); - OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + 
OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); + OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2")); int context_output_index = 0; - Tensor* dz2_dx = NULL; + Tensor *dz2_dx = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - z.shape(), - &dz2_dx)); + z.shape(), + &dz2_dx)); UnaggregatedDy2DxFunctor()( - context->eigen_device(), // define actually graph execution device + context->eigen_device(), // define actually graph execution device z.flat().data(), w.flat().data(), dy_dx.flat().data(), @@ -359,27 +392,26 @@ class UnaggregatedDy2DxOp : public OpKernel { z.shape().dim_size(1), w.shape().dim_size(0), dz2_dx->flat().data(), - functype.flat()(0) - ); + functype.flat()(0)); } private: }; // Register the CPU kernels. -#define REGISTER_CPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDyDxSOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDyDxOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDy2DxSOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint("T"), \ - UnaggregatedDy2DxOp); +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxSOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxSOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxOp); REGISTER_CPU(float); REGISTER_CPU(double); // Not required in the current situation @@ -391,7 +423,7 @@ REGISTER_CPU(double); // UnaggregatedDyDxSOp); \ // REGISTER_KERNEL_BUILDER( \ // Name("UnaggregatedDyDx").Device(DEVICE_GPU).TypeConstraint("T"), \ -// UnaggregatedDyDxOp); +// UnaggregatedDyDxOp); // REGISTER_GPU(float); // REGISTER_GPU(double); // #endif // GOOGLE_CUDA From de03176b5a70772d5edaac1c4f2b74faaab4ed0a Mon Sep 17 00:00:00 2001 From: HLA Date: Wed, 25 Aug 2021 22:27:34 +0800 Subject: [PATCH 4/6] commit-message :Format revert --- source/op/unaggregated_grad.cc | 406 +++++++++++++++------------------ 1 file changed, 187 insertions(+), 219 deletions(-) diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 9810e1a36a..0a56f5ab90 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -6,55 +6,56 @@ #define GGELU 0.044715 REGISTER_OP("UnaggregatedDyDxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("w: T") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy_dx: T"); REGISTER_OP("UnaggregatedDyDx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz_dx: T"); REGISTER_OP("UnaggregatedDy2DxS") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("y: T") - .Input("dy: T") - .Input("w: T") + 
.Attr("T: {float, double} = DT_DOUBLE") + .Input("y: T") + .Input("dy: T") + .Input("w: T") .Input("xbar: T") .Input("functype: int32") .Output("dy2_dx: T"); REGISTER_OP("UnaggregatedDy2Dx") .Attr("T: {float, double} = DT_DOUBLE") - .Input("z: T") - .Input("w: T") - .Input("dy_dx: T") - .Input("dy2_dx: T") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") + .Input("dy2_dx: T") .Input("ybar: T") .Input("functype: int32") .Output("dz2_dx: T"); template -FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. +FPTYPE grad(const FPTYPE xbar, const FPTYPE y, const int functype) //functype=tanh, gelu, .. { switch (functype) { - case 1: - return (1 - y * y); - case 2: - { - const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; - } - default: - return -1; + case 1: + return (1 - y * y); + case 2: + { + const FPTYPE var = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + return 0.5 * SQRT_2_PI * xbar * (1 - var * var) * (3 * GGELU * xbar * xbar + 1) + 0.5 * var + 0.5; + } + default: + return -1; } + } template @@ -62,60 +63,52 @@ FPTYPE grad_grad(const FPTYPE xbar, const FPTYPE y, const int functype) { switch (functype) { - case 1: - return -2 * y * (1 - y * y); - case 2: - { - const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); - const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); - return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; - } - default: - return -1; + case 1: + return -2 * y * (1 - y * y); + case 2: + { + const FPTYPE var1 = tanh(SQRT_2_PI * (xbar + GGELU * xbar * xbar * xbar)); + const FPTYPE var2 = SQRT_2_PI * (1 - var1 * var1) * (3 * GGELU * xbar * xbar + 1); + return 3 * GGELU * SQRT_2_PI * xbar * xbar * (1 - var1 * var1) - SQRT_2_PI * xbar * var2 * (3 * GGELU * xbar * xbar + 1) * var1 + var2; + } + default: + return -1; } } + + template -struct UnaggregatedDyDxSFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy_dx, const int functype) - { -#pragma omp parallel for - for (int ii = 0; ii < length; ii++) - { - for (int jj = 0; jj < width; jj++) - { - dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj]; +struct UnaggregatedDyDxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy_dx, const int functype) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy_dx[ii * width + jj] = grad(xbar[ii * width + jj], y[ii * width + jj],functype)*w[jj]; } } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! return; } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
template -struct UnaggregatedDyDxFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz_dx, const int functype) - { -//width=2*size -#pragma omp parallel for - for (int kk = 0; kk < length; kk++) - { - for (int ii = 0; ii < width; ii++) - { +struct UnaggregatedDyDxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz_dx, const int functype) { + //width=2*size + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) - { + for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } dz_drou *= accumulator; @@ -125,163 +118,143 @@ struct UnaggregatedDyDxFunctor } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const int length, const int width, const int size, FPTYPE *dz_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + //Currently, Do nothing at all! return; } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; template -struct UnaggregatedDy2DxSFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *y, const FPTYPE *dy, const FPTYPE *w, const FPTYPE *xbar, const int length, const int width, FPTYPE *dy2_dx, const int functype) - { -#pragma omp parallel for - for (int ii = 0; ii < length; ii++) - { - for (int jj = 0; jj < width; jj++) - { - dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj], y[ii * width + jj], functype) * w[jj] * w[jj]; +struct UnaggregatedDy2DxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const FPTYPE* xbar, const int length, const int width, FPTYPE * dy2_dx, const int functype) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy2_dx[ii * width + jj] = grad_grad(xbar[ii * width + jj],y[ii * width + jj],functype)*w[jj]*w[jj]; } } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *y, const FPTYPE *w, const int length, const int width, FPTYPE *dy_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! return; } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM }; // calculate the gradient for all variables! 
template -struct UnaggregatedDy2DxFunctor -{ - void operator()(const CPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const FPTYPE *ybar, const int length, const int width, const int size, FPTYPE *dz2_dx, const int functype) - { -#pragma omp parallel for - for (int kk = 0; kk < length; kk++) - { - for (int ii = 0; ii < width; ii++) - { +struct UnaggregatedDy2DxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const FPTYPE * ybar, const int length, const int width, const int size, FPTYPE * dz2_dx, const int functype) { + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); - FPTYPE dz_drou = grad(ybar[kk * width + ii], z[kk * width + ii], functype); + FPTYPE dz_drou = grad(ybar[kk*width+ii], z[kk * width + ii],functype); FPTYPE accumulator = 0.0; - for (int jj = 0; jj < size; jj++) - { + for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; } dz_drou *= accumulator; accumulator = 0.0; - for (int jj = 0; jj < size; jj++) - { + for (int jj = 0; jj < size; jj++) { accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; } - dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii], functype) * accumulator * accumulator; + dz_drou += grad_grad(ybar[kk * width + ii], z[kk * width + ii],functype) * accumulator * accumulator; dz_drou += dy2_dx[kk * size + ii % size]; dz2_dx[kk * width + ii] = dz_drou; } } } -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - void operator()(const GPUDevice &d, const FPTYPE *z, const FPTYPE *w, const FPTYPE *dz_dx, const FPTYPE *dy_dx, const FPTYPE *dy2_dx, const int length, const int width, const int size, FPTYPE *dz2_dx) - { - //Currently, Do nothing at all! + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + //Currently, Do nothing at all! 
     return;
   }
-#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 };
 
-template <typename Device, typename FPTYPE>
-class UnaggregatedDyDxSOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDyDxSOp(OpKernelConstruction *context) : OpKernel(context) {}
+template <typename Device, typename FPTYPE>
+class UnaggregatedDyDxSOp : public OpKernel {
+  public:
+    explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       // xbar = xw + b
       int context_input_index = 0;
-        const Tensor &y = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &xbar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& y = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& xbar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
+      OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
 
       // check functype
       int context_output_index = 0;
-        Tensor *dy_dx = NULL;
+      Tensor* dy_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         y.shape(),
-                                                         &dy_dx));
+          y.shape(),
+          &dy_dx));
 
       UnaggregatedDyDxSFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         y.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
         xbar.flat<FPTYPE>().data(),
         y.shape().dim_size(0),
         y.shape().dim_size(1),
         dy_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
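Note: the "S" ops handle the first embedding layer, whose input is a single scalar per sample, which is why dy_dx is allocated with y.shape() above. Assuming the S-variant applies the same per-output pattern visible in UnaggregatedDy2DxSFunctor (grad_grad(...) * w[jj] * w[jj], hence presumably grad(...) * w[jj] for the first derivative), it is easy to cross-check against a central finite difference. A hedged sketch, reusing the grad helper from the note above; names and shapes are illustrative:

    def unaggregated_dy_dx_s_ref(y, w, xbar, functype):
        # y, xbar: [nsample, width]; w: [1, width]; one dy/dx entry per output
        return grad(xbar, y, functype) * w[0]

    rng = np.random.default_rng(0)
    x = rng.standard_normal((4, 1))
    w = rng.standard_normal((1, 8))
    b = rng.standard_normal(8)
    xbar = x @ w + b
    y = np.tanh(xbar)

    eps = 1e-6
    numeric = (np.tanh((x + eps) @ w + b) - np.tanh((x - eps) @ w + b)) / (2 * eps)
    assert np.allclose(unaggregated_dy_dx_s_ref(y, w, xbar, 1), numeric, atol=1e-6)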
 template <typename Device, typename FPTYPE>
-class UnaggregatedDy2DxSOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDy2DxSOp(OpKernelConstruction *context) : OpKernel(context) {}
+class UnaggregatedDy2DxSOp : public OpKernel {
+  public:
+    explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       int context_input_index = 0;
-        const Tensor &y = context->input(context_input_index++);
-        const Tensor &dy = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &xbar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& y = context->input(context_input_index++);
+      const Tensor& dy = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& xbar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (y.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (xbar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-
+      OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+
       int context_output_index = 0;
-        Tensor *dy2_dx = NULL;
+      Tensor* dy2_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         y.shape(),
-                                                         &dy2_dx));
+          y.shape(),
+          &dy2_dx));
 
       UnaggregatedDy2DxSFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         y.flat<FPTYPE>().data(),
         dy.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
@@ -289,100 +262,94 @@ class UnaggregatedDy2DxSOp : public OpKernel
         y.shape().dim_size(0),
         y.shape().dim_size(1),
         dy2_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
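Note: for the deeper layers, the structure being differentiated is the skip-connected recursion z = concat(y, y) + act(yW + b), which is where both the width = 2 * size convention and the trailing dy2_dx[kk * size + ii % size] term come from. Differentiating once gives dz/dx = tile(dy) + act'(ybar) * (dy W); differentiating again adds the act''(ybar) * (dy W)^2 term seen in UnaggregatedDy2DxFunctor. A NumPy restatement of that arithmetic follows; it is hedged in that the first-derivative skip term falls inside the lines elided by the @@ header in UnaggregatedDyDxFunctor and is assumed to mirror the visible second-derivative one. The Python caller feeds the activation part of the layer output as z, so grad(ybar, z, ...) sees the plain activation value.

    def unaggregated_dy_dx_ref(z, w, dy_dx, ybar, functype):
        # z, ybar: [nsample, 2 * size]; w: [size, 2 * size]; dy_dx: [nsample, size]
        dz = grad(ybar, z, functype) * (dy_dx @ w)  # chain rule through the dense layer
        dz += np.tile(dy_dx, 2)                     # skip connection: z = concat(y, y) + act(ybar)
        return dz

    def unaggregated_dy2_dx_ref(z, w, dy_dx, dy2_dx, ybar, functype):
        lin1 = dy_dx @ w                            # sum_j w[j, i] * dy_dx[k, j]
        lin2 = dy2_dx @ w
        dz2 = grad(ybar, z, functype) * lin2 + grad_grad(ybar, z, functype) * lin1 * lin1
        dz2 += np.tile(dy2_dx, 2)                   # second derivative of the skip term
        return dz2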
 template <typename Device, typename FPTYPE>
-class UnaggregatedDyDxOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDyDxOp(OpKernelConstruction *context) : OpKernel(context) {}
+class UnaggregatedDyDxOp : public OpKernel {
+  public:
+    explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       int context_input_index = 0;
-        const Tensor &z = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &dy_dx = context->input(context_input_index++);
-        const Tensor &ybar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& z = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& dy_dx = context->input(context_input_index++);
+      const Tensor& ybar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
+      OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
 
       int context_output_index = 0;
-        Tensor *dz_dx = NULL;
+      Tensor* dz_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         z.shape(),
-                                                         &dz_dx));
+          z.shape(),
+          &dz_dx));
 
       UnaggregatedDyDxFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         z.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
         dy_dx.flat<FPTYPE>().data(),
         ybar.flat<FPTYPE>().data(),
         z.shape().dim_size(0),
-            z.shape().dim_size(1), //N1
-            w.shape().dim_size(0), //N0 , N1=2N0
+        z.shape().dim_size(1), // N1
+        w.shape().dim_size(0), // N0, with N1 = 2 * N0
         dz_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
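Note: the // N1 and // N0 comments in the call above encode the one shape invariant the recursive kernels rely on: N1 = 2 * N0, i.e. the layer output is twice as wide as the weight matrix's input dimension, again because of concat(y, y). A tiny illustrative guard (a hypothetical helper, not part of the patch):

    def check_recursion_shapes(z, w, dy_dx):
        nsample, n1 = z.shape   # N1 == z.shape().dim_size(1) in the kernel
        n0, n1_w = w.shape      # N0 == w.shape().dim_size(0)
        assert n1 == 2 * n0 == n1_w, "expected z = concat(y, y) + act(...), so N1 = 2 * N0"
        assert dy_dx.shape == (nsample, n0)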
 template <typename Device, typename FPTYPE>
-class UnaggregatedDy2DxOp : public OpKernel
-{
-public:
-    explicit UnaggregatedDy2DxOp(OpKernelConstruction *context) : OpKernel(context) {}
+class UnaggregatedDy2DxOp : public OpKernel {
+  public:
+    explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-    void Compute(OpKernelContext *context) override
-    {
-        deepmd::safe_compute(context, [this](OpKernelContext *context)
-                             { this->_Compute(context); });
+    void Compute(OpKernelContext* context) override {
+      deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);});
     }
 
-    void _Compute(OpKernelContext *context)
-    {
+    void _Compute(OpKernelContext* context) {
       // Grab the input tensor
       int context_input_index = 0;
-        const Tensor &z = context->input(context_input_index++);
-        const Tensor &w = context->input(context_input_index++);
-        const Tensor &dy_dx = context->input(context_input_index++);
-        const Tensor &dy2_dx = context->input(context_input_index++);
-        const Tensor &ybar = context->input(context_input_index++);
-        const Tensor &functype = context->input(context_input_index++);
+      const Tensor& z = context->input(context_input_index++);
+      const Tensor& w = context->input(context_input_index++);
+      const Tensor& dy_dx = context->input(context_input_index++);
+      const Tensor& dy2_dx = context->input(context_input_index++);
+      const Tensor& ybar = context->input(context_input_index++);
+      const Tensor& functype = context->input(context_input_index++);
 
       // set size of the sample
-        OP_REQUIRES(context, (z.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (w.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
-        OP_REQUIRES(context, (ybar.shape().dims() == 2), errors::InvalidArgument("Dim of input should be 2"));
+      OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
+      OP_REQUIRES (context, (ybar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
 
       int context_output_index = 0;
-        Tensor *dz2_dx = NULL;
+      Tensor* dz2_dx = NULL;
       OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                         z.shape(),
-                                                         &dz2_dx));
+          z.shape(),
+          &dz2_dx));
 
       UnaggregatedDy2DxFunctor<FPTYPE>()(
-            context->eigen_device<Device>(), // define actually graph execution device
+        context->eigen_device<Device>(), // define the actual graph execution device
         z.flat<FPTYPE>().data(),
         w.flat<FPTYPE>().data(),
         dy_dx.flat<FPTYPE>().data(),
@@ -392,26 +359,27 @@ class UnaggregatedDy2DxOp : public OpKernel
         z.shape().dim_size(1),
         w.shape().dim_size(0),
         dz2_dx->flat<FPTYPE>().data(),
-            functype.flat<int>()(0));
+        functype.flat<int>()(0)
+      );
     }
 private:
 };
 
 // Register the CPU kernels.
-#define REGISTER_CPU(T)                                                          \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint<T>("T"),     \
-        UnaggregatedDyDxSOp<CPUDevice, T>);                                      \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint<T>("T"),      \
-        UnaggregatedDyDxOp<CPUDevice, T>);                                       \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint<T>("T"),    \
-        UnaggregatedDy2DxSOp<CPUDevice, T>);                                     \
-    REGISTER_KERNEL_BUILDER(                                                     \
-        Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint<T>("T"),     \
-        UnaggregatedDy2DxOp<CPUDevice, T>);
+#define REGISTER_CPU(T) \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDyDxSOp<CPUDevice, T>); \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDyDxOp<CPUDevice, T>); \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDy2DxSOp<CPUDevice, T>); \
+REGISTER_KERNEL_BUILDER( \
+    Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+    UnaggregatedDy2DxOp<CPUDevice, T>);
 REGISTER_CPU(float);
 REGISTER_CPU(double);
 
 // Not required in the current situation
@@ -423,7 +391,7 @@ REGISTER_CPU(double);
 //     UnaggregatedDyDxSOp<GPUDevice, T>); \
 // REGISTER_KERNEL_BUILDER( \
 //     Name("UnaggregatedDyDx").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-//     UnaggregatedDyDxOp<GPUDevice, T>);
+//     UnaggregatedDyDxOp<GPUDevice, T>);
 // REGISTER_GPU(float);
 // REGISTER_GPU(double);
 // #endif // GOOGLE_CUDA
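Note: one design choice worth flagging between these patches is that functype reaches the kernels as a runtime int32 input (a tf.constant on the Python side) rather than as an op Attr, so one registered kernel serves both activations and the registration macros stay unchanged; an Attr would arguably be more idiomatic since the value is fixed at graph-construction time. Under the tanh = 1, gelu = 2 convention used on the Python side, the four ops are driven roughly like this (a sketch, assuming a compiled op_module is importable; variable names are illustrative):

    functype = tf.constant(2)  # 1 = tanh, 2 = gelu
    dy  = op_module.unaggregated_dy_dx_s(yy, matrix, xbar, functype)
    dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, matrix, xbar, functype)
    dz  = op_module.unaggregated_dy_dx(zz - tt, matrix, dy, ybar, functype)
    dz2 = op_module.unaggregated_dy2_dx(zz - tt, matrix, dy, dy2, ybar, functype)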
From 7a68bd716130a3bbaa72ecf6b560508626e8fc52 Mon Sep 17 00:00:00 2001
From: HLA
Date: Thu, 26 Aug 2021 09:42:29 +0800
Subject: [PATCH 5/6] commit-message: format change

---
 source/op/unaggregated_grad.cc | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc
index 0a56f5ab90..c132a09b3a 100644
--- a/source/op/unaggregated_grad.cc
+++ b/source/op/unaggregated_grad.cc
@@ -205,8 +205,8 @@ class UnaggregatedDyDxSOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dy_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     y.shape(),
-                                                     &dy_dx));
+        y.shape(),
+        &dy_dx));
 
     UnaggregatedDyDxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -219,7 +219,6 @@ class UnaggregatedDyDxSOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };
 
@@ -248,10 +247,10 @@ class UnaggregatedDy2DxSOp : public OpKernel {
     OP_REQUIRES (context, (xbar.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2"));
 
     int context_output_index = 0;
-    Tensor* dy2_dx = NULL;
+    Tensor* dy2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     y.shape(),
-                                                     &dy2_dx));
+        y.shape(),
+        &dy2_dx));
 
     UnaggregatedDy2DxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -265,7 +264,6 @@ class UnaggregatedDy2DxSOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };
 
@@ -296,8 +294,8 @@ class UnaggregatedDyDxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     z.shape(),
-                                                     &dz_dx));
+        z.shape(),
+        &dz_dx));
 
     UnaggregatedDyDxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -312,7 +310,6 @@ class UnaggregatedDyDxOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };
 
@@ -345,8 +342,8 @@ class UnaggregatedDy2DxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-                                                     z.shape(),
-                                                     &dz2_dx));
+        z.shape(),
+        &dz2_dx));
 
     UnaggregatedDy2DxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -362,7 +359,6 @@ class UnaggregatedDy2DxOp : public OpKernel {
       functype.flat<int>()(0)
     );
   }
-  private:
 };

From 1c71fee03d2acb1dec458935768c904aaaa73e7b Mon Sep 17 00:00:00 2001
From: HLA
Date: Thu, 26 Aug 2021 15:14:38 +0800
Subject: [PATCH 6/6] commit-message: Format change

---
 source/op/unaggregated_grad.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc
index c132a09b3a..89c14a84fb 100644
--- a/source/op/unaggregated_grad.cc
+++ b/source/op/unaggregated_grad.cc
@@ -205,8 +205,8 @@ class UnaggregatedDyDxSOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dy_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        y.shape(),
-        &dy_dx));
+                            y.shape(),
+                            &dy_dx));
 
     UnaggregatedDyDxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -249,8 +249,8 @@ class UnaggregatedDy2DxSOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dy2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        y.shape(),
-        &dy2_dx));
+                            y.shape(),
+                            &dy2_dx));
 
     UnaggregatedDy2DxSFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -294,8 +294,8 @@ class UnaggregatedDyDxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        z.shape(),
-        &dz_dx));
+                            z.shape(),
+                            &dz_dx));
 
     UnaggregatedDyDxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device
@@ -342,8 +342,8 @@ class UnaggregatedDy2DxOp : public OpKernel {
     int context_output_index = 0;
     Tensor* dz2_dx = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(context_output_index++,
-        z.shape(),
-        &dz2_dx));
+                            z.shape(),
+                            &dz2_dx));
 
     UnaggregatedDy2DxFunctor<FPTYPE>()(
       context->eigen_device<Device>(), // define the actual graph execution device