diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 84c3d326da..dc35ee5dc0 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -125,7 +125,7 @@ else() endif() endif() if (USE_CUDA_TOOLKIT) - add_definitions("-DUSE_CUDA_TOOLKIT") + add_definitions("-D GOOGLE_CUDA") endif() # define USE_TTM diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h new file mode 100644 index 0000000000..c446db8130 --- /dev/null +++ b/source/lib/include/CustomeOperation.h @@ -0,0 +1,572 @@ +#pragma once +#include +#include +#include +#include +#include "MathUtilities.h" +#if GOOGLE_CUDA +#include "DeviceFunctor.h" +#endif // GOOGLE_CUDA + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +struct NeighborInfo { + int type; + double dist; + int index; + NeighborInfo () : type (0), dist(0), index(0) {} + NeighborInfo (int tt, double dd, int ii) : type (tt), dist(dd), index(ii) {} + + bool operator < (const NeighborInfo & b) const { + return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index)))); + } +}; + +template +inline void spline5_switch ( + FPTYPE & vv, + FPTYPE & dd, + const FPTYPE & xx, + const float & rmin, + const float & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +template +int format_nlist_fill_se_a_cpu ( + vector & fmt_nei_idx_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & nei_idx_a, + const float & rcut, + const vector & sec_a) +{ + fmt_nei_idx_a.resize (sec_a.back()); + fill (fmt_nei_idx_a.begin(), fmt_nei_idx_a.end(), -1); + + // gether all neighbors + std::vector nei_idx (nei_idx_a); + // allocate the information for all neighbors + vector sel_nei; + sel_nei.reserve (nei_idx_a.size()); + for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { + FPTYPE diff[3]; + const int & j_idx = nei_idx[kk]; + for (int dd = 0; dd < 3; ++dd) { + diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + FPTYPE rr = sqrt(MathUtilities::dot (diff, diff)); + if (rr <= rcut) { + sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx)); + } + } + sort(sel_nei.begin(), sel_nei.end()); + + std::vector nei_iter = sec_a; + int overflowed = -1; + for (unsigned kk = 0; kk < sel_nei.size(); ++kk) { + const int & nei_type = sel_nei[kk].type; + if (nei_iter[nei_type] < sec_a[nei_type+1]) { + fmt_nei_idx_a[nei_iter[nei_type] ++] = sel_nei[kk].index; + } + } + return overflowed; +} + +template +void compute_descriptor_se_a_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize 
(sec_a.back() * 4); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 4 * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions + int idx_value = nei_iter * 4; // 4 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + descrpt_a[idx_value + 1] = rr[0] / nr2; + descrpt_a[idx_value + 2] = rr[1] / nr2; + descrpt_a[idx_value + 3] = rr[2] / nr2; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // deriv of component x/r2 + descrpt_a_deriv[idx_deriv + 3] = (2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 5] = (2. * rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr; + // deriv of component y/r2 + descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr; + // deriv of component z/r2 + descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv +11] = (2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + descrpt_a[idx_value + 1] *= sw; + descrpt_a[idx_value + 2] *= sw; + descrpt_a[idx_value + 3] *= sw; + } + } +} + +template +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +#if GOOGLE_CUDA +template +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op DescrptSeA +// ****************************************************************************** + +inline void 
make_descript_range (int & idx_start, int & idx_end, const int & nei_idx, const int& n_a_sel, const int n_a_shift) { + if (nei_idx < n_a_sel) { + idx_start = nei_idx * 4; + idx_end = nei_idx * 4 + 4; + } + else { + idx_start = n_a_shift + (nei_idx - n_a_sel); + idx_end = n_a_shift + (nei_idx - n_a_sel) + 1; + } +} + +template +void ProdForceSeACPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdForceSeAGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} +#endif // GOOGLE_CUDA + +// ****************************************************************************** +// end of custome op ProdForceSeA +// ****************************************************************************** + +template +void ProdVirialSeACPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdVirialSeAGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, 
const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op ProdVirialSeA +// ****************************************************************************** + +template +void GeluCPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); + } +} + +template +void GeluGradCPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + for (int ii = 0; ii < size; ii++) { + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } +} + +template +void GeluGradGradCPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + for (int ii = 0; ii < size; ii++) { + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } +} + +#if GOOGLE_CUDA +template +void GeluGPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { + GeluGPUExecuteFunctor()(in, out, size); +} + +template +void GeluGradGPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGPUExecuteFunctor()(dy, in, out, size); +} + +template +void GeluGradGradGPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op Gelu +// ****************************************************************************** + +template +void compute_descriptor_se_r_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize (sec_a.back()); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, 
rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 3; // 1 components time 3 directions + int idx_value = nei_iter; // 1 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + } + } +} + +template +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_r_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +#if GOOGLE_CUDA +template +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector 
sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op DescrptSeR +// ****************************************************************************** + +template +void ProdForceSeRCPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2]; + } + } +} + +#if GOOGLE_CUDA +template +void ProdForceSeRGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA + +// ****************************************************************************** +// end of custome op ProdForceSeR +// ****************************************************************************** + +template +void ProdVirialSeRCPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + jj]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA +// 
****************************************************************************** +// end of custome op ProdVirialSeR +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h new file mode 100644 index 0000000000..d51d617f84 --- /dev/null +++ b/source/lib/include/DeviceFunctor.h @@ -0,0 +1,62 @@ +#pragma once +#include +#include +#include +#include +#include + +typedef unsigned long long int_64; +#define SQRT_2_PI 0.7978845608028654 + +#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +template +struct DescrptSeAGPUExecuteFunctor { + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template +struct DescrptSeRGPUExecuteFunctor { + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template +struct ProdForceSeAGPUExecuteFunctor { + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdForceSeRGPUExecuteFunctor { + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); +}; + +template +struct ProdVirialSeAGPUExecuteFunctor { + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdVirialSeRGPUExecuteFunctor { + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); +}; + +template +struct GeluGPUExecuteFunctor { + void operator()(const FPTYPE * in, FPTYPE * out, const int size); +}; + +template +struct GeluGradGPUExecuteFunctor { + void operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, const int size); +}; + +template +struct GeluGradGradGPUExecuteFunctor { + void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const int size); +}; \ No newline at end of file diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index a32940a738..6c37770758 100644 --- 
a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -98,9 +98,6 @@ class NNPInter // function used for neighbor list copy vector get_sel_a() const; -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; class NNPInterModelDevi @@ -195,9 +192,6 @@ class NNPInterModelDevi // function used for nborlist copy vector > get_sel() const; void cum_sum(const std::vector > n_sel); -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 3912f21f7f..4874274305 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -8,11 +8,17 @@ using namespace tensorflow; using namespace std; +#include +#include #include "NNPAtomMap.h" #include +#include +#include #include "version.h" +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; #ifdef HIGH_PREC typedef double VALUETYPE; typedef double ENERGYTYPE; @@ -122,6 +128,20 @@ session_input_tensors (std::vector> & input_tensors, const int nghost = 0, const string scope = ""); +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope = ""); + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index aea9d48c9b..879c36d5c0 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -4,11 +4,8 @@ #include -#ifdef USE_CUDA_TOOLKIT +#if GOOGLE_CUDA #include "cuda_runtime.h" -#include -#include -#include #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) @@ -57,7 +54,6 @@ run_model (ENERGYTYPE & dener, return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, {"o_energy", "o_force", "o_atom_virial"}, @@ -74,56 +70,23 @@ run_model (ENERGYTYPE & dener, dener = oe(0); vector dforce (3 * nall); - vector datom_virial (9 * nall); dvirial.resize (9); for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } for (int ii = 0; ii < nall; ++ii) { - dvirial[0] += 1.0 * datom_virial[9*ii+0]; - dvirial[1] += 1.0 * datom_virial[9*ii+1]; - dvirial[2] += 1.0 * datom_virial[9*ii+2]; - dvirial[3] += 1.0 * datom_virial[9*ii+3]; - dvirial[4] += 1.0 * datom_virial[9*ii+4]; - dvirial[5] += 1.0 * datom_virial[9*ii+5]; - dvirial[6] += 1.0 * datom_virial[9*ii+6]; - dvirial[7] += 1.0 * datom_virial[9*ii+7]; - dvirial[8] += 1.0 * datom_virial[9*ii+8]; - } - - dforce_ = dforce; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - - dener = oe(0); - vector dforce (3 * nall); - dvirial.resize (9); - for (unsigned ii = 0; ii < nall * 3; ++ii){ - dforce[ii] = of(ii); - } - for (unsigned ii = 0; ii 
< 9; ++ii){ - dvirial[ii] = ov(ii); + dvirial[0] += 1.0 * oav(9*ii+0); + dvirial[1] += 1.0 * oav(9*ii+1); + dvirial[2] += 1.0 * oav(9*ii+2); + dvirial[3] += 1.0 * oav(9*ii+3); + dvirial[4] += 1.0 * oav(9*ii+4); + dvirial[5] += 1.0 * oav(9*ii+5); + dvirial[6] += 1.0 * oav(9*ii+6); + dvirial[7] += 1.0 * oav(9*ii+7); + dvirial[8] += 1.0 * oav(9*ii+8); } dforce_ = dforce; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#endif } static void run_model (ENERGYTYPE & dener, @@ -155,7 +118,6 @@ static void run_model (ENERGYTYPE & dener, fill(datom_virial_.begin(), datom_virial_.end(), 0.0); return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, @@ -204,50 +166,6 @@ static void run_model (ENERGYTYPE & dener, nnpmap.backward (dforce_.begin(), dforce.begin(), 3); nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - Tensor output_ae = output_tensors[3]; - Tensor output_av = output_tensors[4]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - auto oae = output_ae.flat (); - auto oav = output_av.flat (); - - dener = oe(0); - vector dforce (3 * nall); - vector datom_energy (nall, 0); - vector datom_virial (9 * nall); - dvirial.resize (9); - for (int ii = 0; ii < nall * 3; ++ii) { - dforce[ii] = of(ii); - } - for (int ii = 0; ii < nloc; ++ii) { - datom_energy[ii] = oae(ii); - } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } - for (int ii = 0; ii < 9; ++ii) { - dvirial[ii] = ov(ii); - } - dforce_ = dforce; - datom_energy_ = datom_energy; - datom_virial_ = datom_virial; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); - nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); - nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#endif } @@ -266,50 +184,8 @@ NNPInter (const string & model, const int & gpu_rank) init(model, gpu_rank); } -NNPInter::~NNPInter() { - #ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } - #endif -} - -#ifdef USE_CUDA_TOOLKIT -void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - 
cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif // USE_CUDA_TOOLKIT +NNPInter::~NNPInter() {} -#ifdef USE_CUDA_TOOLKIT void NNPInter:: init (const string & model, const int & gpu_rank) @@ -318,21 +194,21 @@ init (const string & model, const int & gpu_rank) SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); // check current device environment if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); - // std::cout << "current device rank: " << str << std::endl; } + #endif // GOOGLE_CUDA checkStatus (NewSession(options, &session)); checkStatus (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -340,8 +216,6 @@ init (const string & model, const int & gpu_rank) ntypes = get_scalar("descrpt_attr/ntypes"); dfparam = get_scalar("fitting_attr/dfparam"); daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; @@ -350,38 +224,6 @@ init (const string & model, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInter:: -init (const string & model, const int & gpu_rank) -{ - assert (!inited); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - checkStatus (NewSession(options, &session)); - checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); - checkStatus (session->Create(graph_def)); - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - // dfparam = get_dfparam(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif void NNPInter:: @@ -554,17 +396,10 @@ compute_inner (ENERGYTYPE & dener, nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); - 
shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif + shuffle_nlist (nlist, nnpmap); } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); } @@ -622,16 +457,9 @@ compute (ENERGYTYPE & dener, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } @@ -663,17 +491,8 @@ NNPInterModelDevi (const vector & models, const int & gpu_rank) init(models, gpu_rank); } -NNPInterModelDevi::~NNPInterModelDevi() { -#ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } -#endif -} +NNPInterModelDevi::~NNPInterModelDevi() {} -#ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: init (const vector & models, const int & gpu_rank) @@ -682,26 +501,32 @@ init (const vector & models, const int & gpu_rank) numb_models = models.size(); sessions.resize(numb_models); graph_defs.resize(numb_models); + + int gpu_num = -1; + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); + #endif // GOOGLE_CUDA + SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - for (unsigned ii = 0; ii < numb_models; ++ii){ checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); } - int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - for (unsigned ii = 0; ii < numb_models; ++ii){ - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); + } + #endif // GOOGLE_CUDA + + for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_defs[ii]); - // std::cout << "current device rank: " << str << std::endl; } checkStatus (NewSession(options, &(sessions[ii]))); checkStatus (sessions[ii]->Create(graph_defs[ii])); @@ 
-722,40 +547,6 @@ init (const vector & models, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInterModelDevi:: -init (const vector & models, const int & gpu_rank) -{ - assert (!inited); - numb_models = models.size(); - sessions.resize(numb_models); - graph_defs.resize(numb_models); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - for (unsigned ii = 0; ii < numb_models; ++ii){ - checkStatus (NewSession(options, &(sessions[ii]))); - checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); - checkStatus (sessions[ii]->Create(graph_defs[ii])); - } - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif template VT @@ -821,42 +612,6 @@ cum_sum (const std::vector > n_sel) } } -#ifdef USE_CUDA_TOOLKIT -void -NNPInterModelDevi:: -update_nbor(const InternalNeighborList & nlist, const int nloc) -{ - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif //USE_CUDA_TOOLKIT - void NNPInterModelDevi:: validate_fparam_aparam(const int & nloc, @@ -946,16 +701,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); 
all_force.resize (numb_models); @@ -996,16 +743,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); all_force .resize (numb_models); diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index dd4c3f672a..d990195e6c 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -340,6 +340,136 @@ session_input_tensors (std::vector> & input_tensors, return nloc; } +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii){ + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape ; + mesh_shape.AddDim (16); + TensorShape natoms_shape ; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape ; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + +#ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor (DT_DOUBLE, fparam_shape); + Tensor aparam_tensor (DT_DOUBLE, aparam_shape); +#else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor (DT_FLOAT, fparam_shape); + Tensor aparam_tensor (DT_FLOAT, aparam_shape); +#endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor (DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < nall * 3; ++jj){ + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj){ + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj){ + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj){ + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj 
< aparam_.size(); ++jj){ + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; + + const int stride = sizeof(int *) / sizeof(int); + assert (stride * sizeof(int) == sizeof(int *)); + assert (stride <= 4); + mesh (0) = ago; + mesh (1) = dlist.ilist.size(); + mesh (2) = dlist.jrange.size(); + mesh (3) = dlist.jlist.size(); + dlist.make_ptrs(); + memcpy (&mesh(4), &(dlist.pilist), sizeof(int *)); + memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *)); + memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + input_tensors = { + {prefix+"t_coord", coord_tensor}, + {prefix+"t_type", type_tensor}, + {prefix+"t_box", box_tensor}, + {prefix+"t_mesh", mesh_tensor}, + {prefix+"t_natoms",natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({prefix+"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({prefix+"t_aparam", aparam_tensor}); + } + return nloc; +} + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 993a1b6fd4..c4d97cd85a 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,8 +4,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) -file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) @@ -27,7 +26,7 @@ if (BUILD_PY_IF) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) if (USE_CUDA_TOOLKIT) - add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB}) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) add_subdirectory(cuda) find_package(CUDA REQUIRED) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 1636df8ff5..5965254111 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -1,42 +1,7 @@ -/* Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU -#include -#include -#include #include #include #include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -72,24 +37,25 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -110,7 +76,8 @@ __global__ void get_i_idx_se_a(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_a(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, @@ -120,8 +87,8 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -133,12 +100,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -180,33 +147,34 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! 
-__global__ void compute_descriptor_se_a (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_a (FPTYPE* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + FPTYPE* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, + const FPTYPE* coord, + const float rmin, + const float rmax, const int sec_a_size) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions const int idx_value = idy * 4; // 4 components if (idy >= sec_a_size) {return;} // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { @@ -214,14 +182,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, for (int kk = 0; kk < 3; kk++) { row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &row_rij[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; @@ -260,8 +228,9 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +template void format_nbor_list_256 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -274,9 +243,10 @@ void format_nbor_list_256 ( const int LEN = 256; const int MAGIC_NUMBER = 256; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -292,8 +262,9 @@ void format_nbor_list_256 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_512 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -306,9 +277,10 @@ void format_nbor_list_512 ( const int LEN = 256; const int MAGIC_NUMBER = 512; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -324,8 +296,9 @@ void format_nbor_list_512 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_1024 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* 
jlist, @@ -338,9 +311,10 @@ void format_nbor_list_1024 ( const int LEN = 256; const int MAGIC_NUMBER = 1024; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -356,8 +330,9 @@ void format_nbor_list_1024 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_2048 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -370,9 +345,10 @@ void format_nbor_list_2048 ( const int LEN = 256; const int MAGIC_NUMBER = 2048; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -388,8 +364,9 @@ void format_nbor_list_2048 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_4096 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -402,9 +379,10 @@ void format_nbor_list_4096 ( const int LEN = 256; const int MAGIC_NUMBER = 4096; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -420,29 +398,8 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, - const int MAGIC_NUMBER -) -{ +template +void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -454,8 +411,8 @@ void DescrptSeALauncher(const VALUETYPE* coord, res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ @@ -534,8 +491,9 @@ void DescrptSeALauncher(const VALUETYPE* coord, } const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - 
compute_descriptor_se_a<<>> ( + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_a<<>> ( descript, ndescrpt, descript_deriv, @@ -552,4 +510,7 @@ void DescrptSeALauncher(const VALUETYPE* coord, rcut_r, sec_a.back() ); -} \ No newline at end of file +} + +template struct DescrptSeAGPUExecuteFunctor; +template struct DescrptSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index fa9678be34..a65ba5887a 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -1,45 +1,7 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU -#include -#include -#include #include #include #include -#include -#include - -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -48,7 +10,7 @@ template < __launch_bounds__ (BLOCK_THREADS) __global__ void BlockSortKernel( Key * d_in, - Key * d_out) // Tile of output + Key * d_out) // Tile of output { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) @@ -75,24 +37,25 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. 
/ (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -104,7 +67,7 @@ __device__ inline void spline5_switch(compute_t & vv, __global__ void get_i_idx_se_r(const int nloc, const int * ilist, - int * i_idx) + int * i_idx) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; if(idy >= nloc) { @@ -113,17 +76,19 @@ __global__ void get_i_idx_se_r(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_r(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -135,12 +100,12 @@ __global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -153,9 +118,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, const int * jrange, const int * jlist, int_64 * key, - const int * sec, + const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -169,137 +135,329 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; for (int ii = 0; ii < sec_a_size; ii++) { - nei_iter[ii] = sec[ii]; + nei_iter[ii] = sec_a[ii]; } for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { const int & nei_type = key_out[kk] / 1E15; - if (nei_iter[nei_type] < sec[nei_type + 1]) { + if (nei_iter[nei_type] < sec_a[nei_type + 1]) { row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; } } } //it's ok! 
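The launches in these files were reorganized so that the atom index maps to gridDim.x while the neighbor (or descriptor) slot is tiled across gridDim.y * blockDim.y, matching the kernel-side indexing idx = blockIdx.x, idy = blockIdx.y * blockDim.y + threadIdx.y. The toy kernel below is only a sketch of that mapping under the same dim3 configuration; fill_demo and launch_fill_demo are hypothetical names, not code from this patch. One practical consequence of the swap is that the atom count now sits on gridDim.x, which admits far more blocks than the 65535 allowed on gridDim.y, where it previously lived.

// Toy illustration of the launch geometry used above: one block row per atom
// on grid.x, neighbor slots tiled over grid.y * blockDim.y.
__global__ void fill_demo(int * out, const int nloc, const int max_nei) {
    const unsigned int idx = blockIdx.x;                             // atom index
    const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;  // neighbor slot
    if (idx >= nloc || idy >= max_nei) { return; }
    out[idx * max_nei + idy] = idy;   // each (atom, neighbor-slot) pair handled once
}

void launch_fill_demo(int * out, const int nloc, const int max_nei) {
    const int LEN = 256;
    const int nblock = (max_nei + LEN - 1) / LEN;
    dim3 block_grid(nloc, nblock);    // grid.x = atoms, grid.y = neighbor tiles
    dim3 thread_grid(1, LEN);         // 1 x 256 threads per block
    fill_demo<<<block_grid, thread_grid>>>(out, nloc, max_nei);
}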
-__global__ void compute_descriptor_se_r (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_r (FPTYPE* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + FPTYPE* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, - compute_t* sel_diff_dev, - const int sec_size) + const FPTYPE* coord, + const float rmin, + const float rmax, + const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const int idx_deriv = idy * 3; // 1 components time 3 directions - const int idx_value = idy; // 1 components - if (idy >= sec_size) {return;} - + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const int idx_deriv = idy * 3; // 4 components time 3 directions + const int idx_value = idy; // 4 components + if (idy >= sec_a_size) {return;} + // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; - + if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_diff[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // 1 value components + // 4 value components row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; } for (int ii = 0; ii < 1; ii++) { row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; } 
+ // idy nloc, idx ndescrpt * 3 + // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; for (int ii = 0; ii < 3; ii++) { row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -) +template +void format_nbor_list_256 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_512 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_1024 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) { + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_2048 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void 
format_nbor_list_4096 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; - int * sec_dev = array_int; - int * nei_iter = array_int + sec.size(); // = new int[sec_a_size]; - int * i_idx = array_int + sec.size() + nloc * sec.size(); + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei cudaError_t res = cudaSuccess; - res = cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { + // ~~~ // cudaProfilerStart(); get_i_idx_se_r<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_r<<>> ( - coord, - type, - jrange, - jlist, - rcut, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = 64; - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_r<<>> ( nlist, nnei, @@ -307,14 +465,17 @@ void DescrptSeRLauncher(const VALUETYPE* coord, jrange, jlist, key, - sec_dev, - 
sec.size(), - nei_iter + sec_a_dev, + sec_a.size(), + nei_iter, + MAGIC_NUMBER ); } - const int nblock_ = (sec.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - compute_descriptor_se_r<<>> ( + + const int nblock_ = (sec_a.back() + LEN -1) / LEN; + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_r<<>> ( descript, ndescrpt, descript_deriv, @@ -327,9 +488,11 @@ void DescrptSeRLauncher(const VALUETYPE* coord, nlist, nnei, coord, - rcut_smth, - rcut, - sel_diff, - sec.back() + rcut_r_smth, + rcut_r, + sec_a.back() ); -} \ No newline at end of file +} + +template struct DescrptSeRGPUExecuteFunctor; +template struct DescrptSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index 99b7b1aed4..6329c8f085 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -1,39 +1,35 @@ -#include -#include +#include "DeviceFunctor.h" -#define SQRT_2_PI 0.7978845608028654 - -template -__global__ void gelu(const T * in, T * out, int const size) { +template +__global__ void gelu(const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); } -template -__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) { +template +__global__ void gelu_grad(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5); } -template -__global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +__global__ void gelu_grad_grad(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); } - void GeluLauncher(const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; @@ -75,3 +71,34 @@ void GeluGradGradLauncher(const double * dy, const double * dy_, const double * gelu_grad_grad<<>>(dy, dy_, in, out, size); } + +template +void GeluGPUExecuteFunctor::operator()(const FPTYPE * in, FPTYPE * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +template +void 
GeluGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +template +void GeluGradGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad_grad<<>>(dy, dy_, in, out, size); +} + +template struct GeluGPUExecuteFunctor; +template struct GeluGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 080ff8ef75..1667c15f90 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -1,22 +1,4 @@ -#include -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -32,25 +14,25 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; - if (idy >= ndescrpt) { - return; - } + if (idy >= ndescrpt) {return;} atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, @@ -75,23 +57,24 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +void ProdForceSeAGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, - const int ndescrpt, const int nnei, + const int ndescrpt, const int n_a_sel, const int n_a_shift) { // std::cout << "I'm here!" 
<< std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; @@ -99,4 +82,7 @@ void ProdForceSeALauncher(VALUETYPE * force, dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3, 4); deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file +} + +template struct ProdForceSeAGPUExecuteFunctor; +template struct ProdForceSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu index 765842d9c3..5a4b582dd0 100644 --- a/source/op/cuda/prod_force_se_r.cu +++ b/source/op/cuda/prod_force_se_r.cu @@ -1,21 +1,4 @@ -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -31,31 +14,29 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - - if (idy >= ndescrpt) { - return; - } + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; + if (idy >= ndescrpt) {return;} + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idy -> nnei const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -73,27 +54,30 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +void ProdForceSeRGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, - const int ndescrpt, const int nnei, - const int n_a_sel, - const int n_a_shift) -{ - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * 
nall * 3)); + const int ndescrpt) +{ + // std::cout << "I'm here!" << std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3); - deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file + deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); +} + +template struct ProdForceSeRGPUExecuteFunctor; +template struct ProdForceSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index 241e2b7e06..e084720c6d 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -1,21 +1,4 @@ -#include -#include - -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -31,11 +14,12 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, @@ -64,21 +48,22 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +void ProdVirialSeAGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, - const int n_a_shift) + const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; @@ -99,3 +84,6 @@ void ProdVirialSeALauncher(VALUETYPE * virial, n_a_shift ); } + +template struct ProdVirialSeAGPUExecuteFunctor; +template struct ProdVirialSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_r.cu 
b/source/op/cuda/prod_virial_se_r.cu index a2c02007fc..9b8f43543f 100644 --- a/source/op/cuda/prod_virial_se_r.cu +++ b/source/op/cuda/prod_virial_se_r.cu @@ -1,24 +1,5 @@ -#include -#include -#include +#include "DeviceFunctor.h" -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// currently, double precision atomicAdd only support arch number larger than 6.0 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -33,17 +14,16 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idx -> nloc // idy -> nnei @@ -61,26 +41,26 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, if (j_idx < 0) { return; } + // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); } -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +void ProdVirialSeRGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); - const int LEN = 16; + const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 9); @@ -94,8 +74,9 @@ void ProdVirialSeRLauncher(VALUETYPE * virial, nlist, nloc, nnei, - ndescrpt, - n_a_sel, - n_a_shift + ndescrpt ); -} \ No newline at end of file +} + +template struct ProdVirialSeRGPUExecuteFunctor; +template struct ProdVirialSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 75c7640b2b..147ca687cf 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -12,54 +12,30 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice 
= Eigen::ThreadPoolDevice; -#ifdef HIGH_PREC REGISTER_OP("Descrpt") -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") .Attr("axis_rule: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32") .Output("axis: int32") -.Output("rot_mat: double"); -#else -REGISTER_OP("Descrpt") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("axis_rule: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32") -.Output("axis: int32") -.Output("rot_mat: float"); -#endif +.Output("rot_mat: T"); +template class DescrptOp : public OpKernel { public: explicit DescrptOp(OpKernelConstruction* context) : OpKernel(context) { @@ -182,18 +158,18 @@ class DescrptOp : public OpKernel { Tensor* rot_mat_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(5, rot_mat_shape, &rot_mat_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); auto axis = axis_tensor ->matrix(); - auto rot_mat = rot_mat_tensor ->matrix(); + auto rot_mat = rot_mat_tensor ->matrix(); // // check the types // int max_type_v = 0; @@ -624,5 +600,10 @@ class DescrptOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("Descrpt").Device(DEVICE_CPU), DescrptOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index b970f6dbf0..88f866944d 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -12,50 +12,29 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeA") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else 
-REGISTER_OP("DescrptSeA") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -180,15 +159,15 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -369,5 +348,10 @@ class DescrptSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_CPU), DescrptSeAOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc deleted file mode 100644 index fd5ae632cb..0000000000 --- a/source/op/descrpt_se_a_gpu.cc +++ /dev/null @@ -1,282 +0,0 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -using GPUDevice = Eigen::GpuDevice; - -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeA") - .Input("coord: double") //atomic coordinates - .Input("type: int32") //atomic type - .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers - .Input("box : double") - .Input("mesh : int32") - .Input("davg: double") //average value of data - .Input("dstd: 
double") //standard deviation - .Attr("rcut_a: float") //no use - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") //all zero - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") - .Output("nlist: int32"); - // only sel_a and rcut_r uesd. -#else -REGISTER_OP("DescrptSeA") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box : float") - .Input("mesh : int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut_a: float") - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif - -int get_magic_number(int const nnei) { - if (nnei <= 256) { - return 256; - } - else if (nnei <= 512) { - return 512; - } - else if (nnei <= 1024) { - return 1024; - } - else if (nnei <= 2048) { - return 2048; - } - else if (nnei <= 4096) { - return 4096; - } -} - -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, - const int MAGIC_NUMBER -); - -class DescrptSeAOp : public OpKernel { -public: - explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { - float nloc_f, nall_f; - OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); - OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); - // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); - // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); - cum_sum (sec_a, sel_a); - cum_sum (sec_r, sel_r); - ndescrpt_a = sec_a.back() * 4; - ndescrpt_r = sec_r.back() * 1; - ndescrpt = ndescrpt_a + ndescrpt_r; - nnei_a = sec_a.back(); - nnei_r = sec_r.back(); - nnei = nnei_a + nnei_r; - fill_nei_a = (rcut_a < 0); - magic_number = get_magic_number(nnei); - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = context->input(context_input_index++); - const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); - // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. - int nsamples = coord_tensor.shape().dim_size(0); - // - //// check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); - - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); - - OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); - - // Create output tensors - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape 
descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); - - // allocate temp memory, temp memory must not be used after this operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); - - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(nloc * magic_number * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); - - int * ilist = NULL, * jrange = NULL, * jlist = NULL; - int * array_int = int_temp.flat().data(); - unsigned long long * array_longlong = uint64_temp.flat().data(); - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument - type_tensor.matrix().data() + II * nall, // also related to the kk argument - ilist, - jrange, - jlist, - array_int, - array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - nloc, - nnei, - rcut_r, - rcut_r_smth, - ndescrpt, - sec_a, - fill_nei_a, - magic_number - ); - } - // std::cout << "done" << std::endl; - delete[] natoms; - } - -///////////////////////////////////////////////////////////////////////////////////////////// -private: - float rcut_a; - float rcut_r; - float rcut_r_smth; - std::vector sel_r; - std::vector sel_a; - std::vector sec_a; - std::vector sec_r; - int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r, nloc, nall, magic_number; - bool fill_nei_a; - - //private func - void cum_sum (std::vector & sec, const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_GPU), DescrptSeAOp); \ No newline at end of file diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc new file mode 100644 index 0000000000..ae5e623171 --- /dev/null +++ b/source/op/descrpt_se_a_multi_device.cc @@ -0,0 
+1,307 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("DescrptSeA") + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + // only sel_a and rcut_r uesd. + +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeAFunctor { + void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; + +template +class DescrptSeAOp : public OpKernel { +public: + explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); + OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); + // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); + // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); + cum_sum (sec_a, sel_a); + cum_sum (sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + fill_nei_a = (rcut_a < 0); + magic_number = get_magic_number(nnei); + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; 
+ const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. 
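Compute() validates the ranks and then reads natoms, whose layout is fixed across these ops: natoms[0] is the number of local atoms (nloc), natoms[1] is the total including ghost atoms (nall), and the remaining entries are per-type counts, hence ntypes = dim_size(0) - 2. A small sketch of that convention with made-up numbers:

#include <cassert>
#include <vector>

struct AtomCounts { int nloc, nall, ntypes; };

// Interpret a natoms vector laid out as [nloc, nall, count_type0, count_type1, ...].
static AtomCounts parse_natoms(const std::vector<int> &natoms) {
  assert(natoms.size() >= 3);  // same requirement the op enforces via OP_REQUIRES
  return {natoms[0], natoms[1], static_cast<int>(natoms.size()) - 2};
}

int main() {
  // Hypothetical system: 192 local atoms, 512 including ghosts, two atom types.
  const std::vector<int> natoms = {192, 512, 64, 128};
  const AtomCounts c = parse_natoms(natoms);
  assert(c.nloc == 192 && c.nall == 512 && c.ntypes == 2);
}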
+ int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! 
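The four outputs allocated just above are flat [nsamples, per-atom-size] buffers; only the second dimension differs. As a worked example with the hypothetical sizes from the sketches above (nloc = 192, nnei = 138, ndescrpt = 552):

#include <cstdio>

int main() {
  // Hypothetical sizes; the real values come from natoms and the sel attributes.
  const long nloc = 192, nnei = 138, ndescrpt = 552;
  std::printf("descrpt       row size: %ld\n", nloc * ndescrpt);      // 105984
  std::printf("descrpt_deriv row size: %ld\n", nloc * ndescrpt * 3);  // 317952
  std::printf("rij           row size: %ld\n", nloc * nnei * 3);      // 79488
  std::printf("nlist         row size: %ld\n", nloc * nnei);          // 26496
}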
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); + } + + DescrptSeAFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut_r, + rcut_r_smth, + sec_a, + fill_nei_a, + magic_number + ); + } + +///////////////////////////////////////////////////////////////////////////////////////////// +private: + float rcut_a; + float rcut_r; + float rcut_r_smth; + std::vector sel_r; + std::vector sel_a; + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, nloc, nall, magic_number; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } + + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void 
**)&jlist, sizeof(int) * jlist_size)); + } + cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 6798df503c..a4bfe341ac 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -12,45 +12,26 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("DescrptSeR") -#ifdef HIGH_PREC -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut: float") .Attr("rcut_smth: float") .Attr("sel: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#else -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut: float") -.Attr("rcut_smth: float") -.Attr("sel: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -169,15 +150,15 @@ class DescrptSeROp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); @@ -351,5 +332,10 @@ class DescrptSeROp : public 
OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_CPU), DescrptSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc deleted file mode 100644 index 65e2682ef0..0000000000 --- a/source/op/descrpt_se_r_gpu.cc +++ /dev/null @@ -1,247 +0,0 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// sec_a kao,sec_r, - -using GPUDevice = Eigen::GpuDevice; - -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeR") - .Input("coord: double") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: double") - .Input("mesh: int32") - .Input("davg: double") - .Input("dstd: double") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") - .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeR") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: float") - .Input("mesh: int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif - -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -); - -class DescrptSeROp : public OpKernel { -public: - explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { - float nloc_f, nall_f; - OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); - cum_sum (sec, sel); - sel_null.resize(3, 0); - cum_sum (sec_null, sel_null); - ndescrpt = sec.back() * 1; - nnei = sec.back(); - fill_nei_a = true; - // count_nei_idx_overflow = 0; - // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = 
context->input(context_input_index++); - const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); - // set size of the sample. assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. - int nsamples = coord_tensor.shape().dim_size(0); - // - //// check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); - - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); - - // Create output tensors - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - 
nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); - - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int *array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); - - // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), - type_tensor.matrix().data() + II * nall, - ilist, - jrange, - jlist, - array_int, - array_longlong, - array_double, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, - nloc, - nall, - nnei, - rcut, - rcut_smth, - ndescrpt, - sec, - fill_nei_a - ); - } - // std::cout << "done" << std::endl; - delete[] natoms; - } - -///////////////////////////////////////////////////////////////////////////////////////////// - -private: - float rcut; - float rcut_smth; - std::vector sel; - std::vector sel_null; - std::vector sec; - std::vector sec_null; - int nnei, ndescrpt, nloc, nall; - bool fill_nei_a; - - //private func - void cum_sum (std::vector & sec, const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_GPU), DescrptSeROp); \ No newline at end of file diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc new file mode 100644 index 0000000000..c5eaff616c --- /dev/null +++ b/source/op/descrpt_se_r_multi_device.cc @@ -0,0 +1,297 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: T") + .Input("mesh: int32") + .Input("davg: T") + .Input("dstd: T") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + 
.Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeRFunctor { + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; + +template +class DescrptSeROp : public OpKernel { +public: + explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + sel_null.resize(3, 0); + cum_sum (sec_null, sel_null); + ndescrpt = sec.back() * 1; + nnei = sec.back(); + fill_nei_a = true; + magic_number = get_magic_number(nnei); + // count_nei_idx_overflow = 0; + // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, 
(mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + + // Create an output tensor + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! 
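The GPU branch that follows sizes its uint64 scratch buffer with magic_number, which get_magic_number() (defined near the end of this class) computes by rounding nnei up to the next supported bucket between 256 and 4096. As written the function has no return statement for nnei > 4096; that is only safe because the OP_REQUIRES check above rejects such inputs, so a defensive sketch would add an explicit fallback:

#include <cassert>
#include <initializer_list>

// Round the neighbor count up to the scratch-buffer granularity used by the GPU kernels.
// Unlike the patch version, this sketch returns explicitly for out-of-range input.
static int get_magic_number(const int nnei) {
  for (const int cap : {256, 512, 1024, 2048, 4096}) {
    if (nnei <= cap) return cap;
  }
  return -1;  // callers are expected to have rejected nnei > 4096 already
}

int main() {
  assert(get_magic_number(138) == 256);
  assert(get_magic_number(300) == 512);
  assert(get_magic_number(5000) == -1);
}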
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + nloc * sec.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); + } + + DescrptSeRFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut, + rcut_smth, + sec, + fill_nei_a, + magic_number + ); + } + +///////////////////////////////////////////////////////////////////////////////////////////// + +private: + float rcut; + float rcut_smth; + std::vector sel; + std::vector sel_null; + std::vector sec; + std::vector sec_null; + int nnei, ndescrpt, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } + + int magic_number; + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + } + 
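Both the CPU branch above and nbor_update() lean on the same convention: the int32 mesh tensor doubles as a byte buffer whose entries starting at index 4, 8 and 12 hold host pointers to the neighbor-list arrays (ilist, jrange, jlist), recovered with memcpy. A self-contained sketch of that packing/unpacking trick, with a toy payload, is:

#include <cassert>
#include <cstring>
#include <vector>

int main() {
  // A toy neighbor-list array standing in for ilist.
  std::vector<int> ilist_data = {7, 3, 9};
  int *ilist_src = ilist_data.data();

  // Pack: store the host pointer into the int32 "mesh" buffer starting at index 4.
  // On common 64-bit platforms sizeof(int *) is 8, i.e. two int slots.
  std::vector<int> mesh(16, 0);
  std::memcpy(mesh.data() + 4, &ilist_src, sizeof(int *));

  // Unpack, exactly as the op does with memcpy(&ilist, 4 + mesh..., sizeof(int *)).
  int *ilist = nullptr;
  std::memcpy(&ilist, mesh.data() + 4, sizeof(int *));
  assert(ilist == ilist_src && ilist[1] == 3);
}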
cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc index 29daaa53c8..22c61b7429 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -10,36 +10,21 @@ typedef double boxtensor_t ; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; -#ifdef HIGH_PREC REGISTER_OP("EwaldRecp") -.Input("coord: double") -.Input("charge: double") +.Attr("T: {float, double}") +.Input("coord: T") +.Input("charge: T") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Attr("ewald_beta: float") .Attr("ewald_h: float") -.Output("energy: double") -.Output("force: double") -.Output("virial: double"); -#else -REGISTER_OP("EwaldRecp") -.Input("coord: float") -.Input("charge: float") -.Input("natoms: int32") -.Input("box: float") -.Attr("ewald_beta: float") -.Attr("ewald_h: float") -.Output("energy: float") -.Output("force: float") -.Output("virial: float"); -#endif +.Output("energy: T") +.Output("force: T") +.Output("virial: T"); +template class EwaldRecpOp : public OpKernel { public: explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { @@ -90,12 +75,12 @@ class EwaldRecpOp : public OpKernel { Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); - auto coord = coord_tensor .flat(); - auto charge = charge_tensor .flat(); - auto box = box_tensor .flat(); - auto energy = energy_tensor ->flat(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); for (int kk = 0; kk < nsamples; ++kk){ int box_iter = kk * 9; @@ -122,19 +107,19 @@ class EwaldRecpOp : public OpKernel { else if (inter[dd] >= 1) inter[dd] -= 1.; } } - vector d_coord3 (nloc*3); + vector d_coord3 (nloc*3); for (int ii = 0; ii < nloc * 3; ++ii) { d_coord3[ii] = d_coord3_[ii]; } // set charge - vector d_charge (nloc); + vector d_charge (nloc); for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); // prepare outputs vectors - VALUETYPE d_ener; - vector d_force(nloc*3); - vector d_virial(9); + FPTYPE d_ener; + vector d_force(nloc*3); + vector 
d_virial(9); // compute EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); @@ -150,8 +135,12 @@ class EwaldRecpOp : public OpKernel { } } private: - EwaldParameters ep; + EwaldParameters ep; }; -REGISTER_KERNEL_BUILDER(Name("EwaldRecp").Device(DEVICE_CPU), EwaldRecpOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("EwaldRecp").Device(DEVICE_CPU).TypeConstraint("T"), \ + EwaldRecpOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/gelu.cc b/source/op/gelu.cc index 26c53c8511..7012438db9 100644 --- a/source/op/gelu.cc +++ b/source/op/gelu.cc @@ -2,11 +2,11 @@ #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/shape_inference.h" -#define SQRT_2_PI 0.7978845608028654 using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; +#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") @@ -26,43 +26,68 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -template +#if GOOGLE_CUDA +// maybe instead use cudnn activation forward +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +#endif // GOOGLE_CUDa + +template struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]))); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluLauncher(in, out, size); + } + #endif }; -template +template struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradLauncher(dy, in, out, size); + } + #endif }; -template +template struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + 
FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); + } + #endif }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -70,26 +95,25 @@ class GeluOp : public OpKernel { void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& x = context->input(0); - Tensor * output = NULL; int context_output_index = 0; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, x.shape(), &output)); - GeluFunctor()( + GeluFunctor()( context->eigen_device(), - x.flat().data(), - output->flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -105,20 +129,20 @@ class GeluGradOp : public OpKernel { x.shape(), &output)); - GeluGradFunctor()( + GeluGradFunctor()( context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -135,12 +159,12 @@ class GeluGradGradOp : public OpKernel { x.shape(), &output)); - GeluGradGradFunctor()( + GeluGradGradFunctor()( context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); @@ -148,17 +172,35 @@ class GeluGradGradOp : public OpKernel { }; #define REGISTER_CPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_CPU(float); - REGISTER_CPU(double); \ No newline at end of file +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc deleted file mode 100644 index 34d4183f98..0000000000 --- a/source/op/gelu_gpu.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("Gelu") - .Attr("T: {float, double}") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGradGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("dy_: T") - .Input("x: T") - .Output("output: T"); - -// maybe instead use cudnn activation forward -void GeluLauncher(const float * in, float * out, int const size); -void GeluLauncher(const double * in, double * out, int const size); - -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); - -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); - -template -struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { - GeluLauncher(in, out, size); - } -}; - -template -struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { - GeluGradLauncher(dy, in, out, size); - } -}; - -template -struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradLauncher(dy, dy_, in, out, size); - } -}; - -// OpKernel definition. 
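For reference, the CPU functors registered above implement the tanh approximation of GELU and its analytic first derivative. A standalone sketch of the same math, handy for spot-checking a kernel against finite differences (test point and tolerance are arbitrary), is:

#include <cassert>
#include <cmath>

constexpr double SQRT_2_PI = 0.7978845608028654;  // sqrt(2 / pi)

// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
static double gelu(const double x) {
  return x * 0.5 * (1.0 + std::tanh(SQRT_2_PI * (x + 0.044715 * x * x * x)));
}

// Analytic derivative, matching the GeluGrad CPU functor with the incoming dy factored out.
static double gelu_grad(const double x) {
  const double var1 = std::tanh(SQRT_2_PI * (x + 0.044715 * x * x * x));
  return 0.5 * SQRT_2_PI * x * (1 - var1 * var1) * (0.134145 * x * x + 1) + 0.5 * var1 + 0.5;
}

int main() {
  // Compare the analytic gradient against a central finite difference.
  const double x = 0.37, h = 1e-6;
  const double numeric = (gelu(x + h) - gelu(x - h)) / (2 * h);
  assert(std::fabs(numeric - gelu_grad(x)) < 1e-6);
}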
-// template parameter is the datatype of the tensors. -template -class GeluOp : public OpKernel { - public : - explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& x = context->input(0); - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluFunctor()( - context->eigen_device(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradOp : public OpKernel { - public : - explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& x = context->input(1); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradFunctor()( - context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradGradOp : public OpKernel { - public : - explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& dy_ = context->input(1); - const Tensor& x = context->input(2); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradGradFunctor()( - context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -#define REGISTER_GPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_GPU(float); - REGISTER_GPU(double); diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc new file mode 100644 index 0000000000..f84c9c0f9f --- /dev/null +++ b/source/op/gelu_multi_device.cc @@ -0,0 +1,167 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +template +struct GeluFunctor { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluCPULauncher(in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGPULauncher(in, out, size); + } + #endif +}; + +template +struct GeluGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradCPULauncher(dy, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGPULauncher(dy, in, out, size); + } + #endif +}; + +template +struct GeluGradGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradCPULauncher(dy, dy_, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradGPULauncher(dy, dy_, in, out, size); + } + #endif +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
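All of the rewritten ops in this patch follow the same dispatch pattern: a functor templated on the value type carries one operator() overload per device, and the OpKernel, templated on both Device and FPTYPE, forwards context->eigen_device<Device>() to it, so a single class body backs both the CPU and the GPU registration. A stripped-down, TensorFlow-free sketch of the idea, using stand-in device tags, is:

#include <iostream>
#include <vector>

// Stand-in device tags; in the real ops these are Eigen::ThreadPoolDevice and Eigen::GpuDevice.
struct CpuTag {};
struct GpuTag {};

template <typename FPTYPE>
struct ScaleFunctor {
  // CPU path: do the work inline.
  void operator()(const CpuTag &, const FPTYPE *in, FPTYPE *out, const int size) {
    for (int ii = 0; ii < size; ++ii) out[ii] = in[ii] * FPTYPE(2);
  }
  // GPU path: in the real code this overload sits behind #if GOOGLE_CUDA and calls a CUDA launcher.
  void operator()(const GpuTag &, const FPTYPE *, FPTYPE *, const int) { /* launcher call */ }
};

// The "kernel" only names the Device type; overload resolution picks the implementation.
template <typename Device, typename FPTYPE>
void compute(const Device &dev, const std::vector<FPTYPE> &x, std::vector<FPTYPE> &y) {
  y.resize(x.size());
  ScaleFunctor<FPTYPE>()(dev, x.data(), y.data(), static_cast<int>(x.size()));
}

int main() {
  std::vector<double> x = {1.0, 2.5}, y;
  compute(CpuTag{}, x, y);
  std::cout << y[0] << " " << y[1] << "\n";  // 2 5
}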
+template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index e41b908d41..7ba99b6e81 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -6,36 +6,23 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForce") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForce") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceOp : public OpKernel { public: explicit ProdForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -86,11 +73,11 @@ class ProdForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); // loop over samples #pragma omp parallel for @@ -176,7 +163,11 @@ class ProdForceOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForce").Device(DEVICE_CPU), ProdForceOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc 
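prod_force keeps its existing structure: each output row belongs to one frame, and the frame loop is parallelized with OpenMP, which is safe because every iteration writes a disjoint slice of the output. A minimal sketch of that parallelization pattern with a dummy per-frame computation:

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical sizes: 4 frames, 6 force components per frame.
  const int nframes = 4, ncomp = 6;
  std::vector<double> force(static_cast<std::size_t>(nframes) * ncomp, 0.0);

  // Each iteration owns force[kk*ncomp .. (kk+1)*ncomp), so no synchronization is needed.
  #pragma omp parallel for
  for (int kk = 0; kk < nframes; ++kk) {
    double *f = force.data() + kk * ncomp;
    for (int ii = 0; ii < ncomp; ++ii) f[ii] = kk + 0.1 * ii;  // dummy per-frame work
  }
  std::printf("%.1f %.1f\n", force[0], force[ncomp]);  // prints 0.0 1.0
}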
index 14e884c5c9..2c8e62550c 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ -6,36 +6,21 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdForceGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceGradOp : public OpKernel { public: @@ -97,12 +82,12 @@ class ProdForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -190,4 +175,9 @@ class ProdForceGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceGrad").Device(DEVICE_CPU), ProdForceGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc index af0e712492..1b5053377c 100644 --- a/source/op/prod_force_se_a.cc +++ b/source/op/prod_force_se_a.cc @@ -6,34 +6,23 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForceSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -83,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == 
force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -155,7 +144,11 @@ class ProdForceSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_CPU), ProdForceSeAOp); - - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index eda965974a..884e46f9a8 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -6,34 +6,20 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeAGradOp : public OpKernel { public: @@ -91,11 +77,11 @@ class ProdForceSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -158,4 +144,9 @@ class ProdForceSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeAGrad").Device(DEVICE_CPU), ProdForceSeAGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_multi_device.cc similarity index 58% rename from source/op/prod_force_se_a_gpu.cc rename to source/op/prod_force_se_a_multi_device.cc index 2d159a8505..87a3ae3ecc 100644 --- a/source/op/prod_force_se_a_gpu.cc +++ b/source/op/prod_force_se_a_multi_device.cc @@ -1,58 +1,29 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include +#include "common.h" +#include "CustomeOperation.h" -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeA") - 
.Input("net_deriv: double") - .Input("in_deriv: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("force: double"); -#else REGISTER_OP("ProdForceSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("force: float"); -#endif + .Output("force: T"); -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); +template +struct ProdForceSeAFunctor { + void operator()(const CPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -76,8 +47,7 @@ class ProdForceSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape().dim_size(0); @@ -102,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -117,23 +87,37 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - for (int II = 0; II < nframes; II++) { - ProdForceSeALauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdForceSeAFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + 
nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_GPU), ProdForceSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc index b4933c5b4a..4347442167 100644 --- a/source/op/prod_force_se_r.cc +++ b/source/op/prod_force_se_r.cc @@ -6,29 +6,19 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeR") -#ifdef HIGH_PREC -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("force: double"); -#else -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("force: float"); -#endif +.Output("force: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -73,10 +63,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -126,7 +116,12 @@ class ProdForceSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_CPU), ProdForceSeROp); - +// Register the CPU kernels. 
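One detail of the GPU registration above is worth spelling out. The old *_gpu.cc kernels copied the natoms tensor from device to host with cudaMemcpy before reading nloc and nall; the rewritten kernels dereference natoms_tensor.flat<int>().data() directly on the host. That is only legal because the DEVICE_GPU registration adds .HostMemory("natoms"), which keeps that particular input in CPU memory even though the kernel itself is placed on the GPU. A hedged fragment showing the two pieces that have to agree; "ExampleOp" and ExampleOp are placeholders, not names from this diff:

// Registration side: pin the "natoms" input to host memory for the GPU kernel.
REGISTER_KERNEL_BUILDER(
    Name("ExampleOp").Device(DEVICE_GPU).TypeConstraint<float>("T").HostMemory("natoms"),
    ExampleOp<GPUDevice, float>);

// Kernel side: because of HostMemory("natoms"), this pointer is a valid host
// address and can be read directly, with no cudaMemcpy and no delete[].
const int * natoms = natoms_tensor.flat<int>().data();
int nloc = natoms[0];   // number of local atoms
int nall = natoms[1];   // number of local plus ghost atoms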
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_gpu.cc deleted file mode 100644 index 3dc0bc7853..0000000000 --- a/source/op/prod_force_se_r_gpu.cc +++ /dev/null @@ -1,133 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: double"); -#else -REGISTER_OP("ProdForceSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: float"); -#endif - -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); - -class ProdForceSeROp : public OpKernel { -public: - explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_force_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& net_deriv_tensor = context->input(context_input_index++); - const Tensor& in_deriv_tensor = context->input(context_input_index++); - const Tensor& nlist_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - - // set size of the sample - OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); - OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); - OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int nframes = net_deriv_tensor.shape().dim_size(0); - int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; - int nnei = nlist_tensor.shape().dim_size(1) / nloc; - - // check the sizes - OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of 
samples should match")); - - OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - // OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); - // OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); - - // Create an output tensor - TensorShape force_shape; - force_shape.AddDim (nframes); - force_shape.AddDim (3 * nall); - Tensor* force_tensor = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - force_shape, &force_tensor)); - - // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); - - assert (nframes == force_shape.dim_size(0)); - assert (nframes == net_deriv_tensor.shape().dim_size(0)); - assert (nframes == in_deriv_tensor.shape().dim_size(0)); - assert (nframes == nlist_tensor.shape().dim_size(0)); - assert (nall * 3 == force_shape.dim_size(1)); - assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); - assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); - assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); - assert (nnei * 4 == ndescrpt); - - for (int II = 0; II < nframes; II++) { - ProdForceSeRLauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; - } -private: - int n_r_sel, n_a_sel, n_a_shift; -}; - -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_GPU), ProdForceSeROp); \ No newline at end of file diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 3866ef9b86..dfe9a5ff98 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -6,30 +6,18 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeRGradOp : public OpKernel { public: @@ -83,11 +71,11 @@ class ProdForceSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -131,4 +119,9 @@ class 
ProdForceSeRGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeRGrad").Device(DEVICE_CPU), ProdForceSeRGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeRGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_r_multi_device.cc b/source/op/prod_force_se_r_multi_device.cc new file mode 100644 index 0000000000..0a97d17742 --- /dev/null +++ b/source/op/prod_force_se_r_multi_device.cc @@ -0,0 +1,111 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("ProdForceSeR") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("nlist: int32") +.Input("natoms: int32") +.Output("force: T"); + +template +struct ProdForceSeRFunctor { + void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRCPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; + +template +class ProdForceSeROp : public OpKernel { +public: + explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + + // Create an output tensor + TensorShape force_shape ; + force_shape.AddDim (nframes); + force_shape.AddDim (3 * nall); + Tensor* force_tensor = NULL; + int context_output_index = 0; + 
OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + force_shape, &force_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto force = force_tensor->flat(); + + assert (nframes == force_shape.dim_size(0)); + assert (nframes == net_deriv_tensor.shape().dim_size(0)); + assert (nframes == in_deriv_tensor.shape().dim_size(0)); + assert (nframes == nlist_tensor.shape().dim_size(0)); + assert (nall * 3 == force_shape.dim_size(1)); + assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); + assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); + assert (nnei * 4 == ndescrpt); + + ProdForceSeRFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index 55b0b4b60d..f42a5055c2 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -6,42 +6,25 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirial") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("ProdVirial") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialOp : public OpKernel { public: explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -100,13 +83,13 @@ class ProdVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -144,10 +127,10 @@ class ProdVirialOp : public 
OpKernel { if (j_idx < 0) continue; if (jj == axis_0) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -156,10 +139,10 @@ class ProdVirialOp : public OpKernel { } else if (jj == axis_1) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -170,10 +153,10 @@ class ProdVirialOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -201,7 +184,11 @@ class ProdVirialOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirial").Device(DEVICE_CPU), ProdVirialOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index 5257467029..5d75f5f649 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -6,38 +6,22 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdVirialGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") 
-.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialGradOp : public OpKernel { public: @@ -104,13 +88,13 @@ class ProdVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -200,4 +184,9 @@ class ProdVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialGrad").Device(DEVICE_CPU), ProdVirialGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index 2f71d37505..ba934fa54e 100644 --- a/source/op/prod_virial_se_a.cc +++ b/source/op/prod_virial_se_a.cc @@ -6,40 +6,25 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -95,12 +80,12 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -131,10 +116,10 @@ class ProdVirialSeAOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; 
++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -161,7 +146,13 @@ class ProdVirialSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_CPU), ProdVirialSeAOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 660f652566..ebbd857bf0 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -6,36 +6,21 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeAGradOp : public OpKernel { public: @@ -98,12 +83,12 @@ class ProdVirialSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -162,4 +147,10 @@ class ProdVirialSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeAGrad").Device(DEVICE_CPU), ProdVirialSeAGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_multi_device.cc similarity index 53% rename from source/op/prod_virial_se_a_gpu.cc rename to source/op/prod_virial_se_a_multi_device.cc index 42f70d06d2..7929fbd588 100644 --- a/source/op/prod_virial_se_a_gpu.cc +++ b/source/op/prod_virial_se_a_multi_device.cc @@ -1,64 +1,31 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include +#include "common.h" +#include "CustomeOperation.h" -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("virial: double") - .Output("atom_virial: double"); -#else REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("virial: float") - .Output("atom_virial: float"); -#endif - -using namespace tensorflow; + .Output("virial: T") + .Output("atom_virial: T"); -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); +template +struct ProdVirialSeAFunctor { + void operator()(const CPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } -} - -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -84,8 +51,7 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument 
("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nnei = nlist_tensor.shape().dim_size(1) / nloc; @@ -114,32 +80,46 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); - for (int II = 0; II < nframes; II++) { - ProdVirialSeALauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdVirialSeAFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_GPU), ProdVirialSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. 
+#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index 1d21234421..5337246c21 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -6,36 +6,22 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -87,12 +73,12 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -119,10 +105,10 @@ class ProdVirialSeROp : public OpKernel { for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (nlist_iter + i_idx * nnei + jj); if (j_idx < 0) continue; - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -133,7 +119,12 @@ class ProdVirialSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_CPU), ProdVirialSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_gpu.cc deleted file mode 100644 index 91f965b72c..0000000000 --- a/source/op/prod_virial_se_r_gpu.cc +++ /dev/null @@ -1,137 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include 
"tensorflow/core/framework/shape_inference.h" -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("virial: double") - .Output("atom_virial: double"); -#else -REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("virial: float") - .Output("atom_virial: float"); -#endif - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); - -class ProdVirialSeROp : public OpKernel { - public: - explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_virial_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& net_deriv_tensor = context->input(context_input_index++); - const Tensor& in_deriv_tensor = context->input(context_input_index++); - const Tensor& rij_tensor = context->input(context_input_index++); - const Tensor& nlist_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - // set size of the sample - OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); - OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); - OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); - OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - cudaErrcheck(cudaDeviceSynchronize()); - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int nnei = nlist_tensor.shape().dim_size(1) / nloc; - int nframes = net_deriv_tensor.shape().dim_size(0); - int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; - - // check the sizes - OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), 
errors::InvalidArgument ("number of samples should match")); - - OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); - - // Create an output tensor - TensorShape virial_shape; - virial_shape.AddDim (nframes); - virial_shape.AddDim (9); - Tensor* virial_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); - TensorShape atom_virial_shape ; - atom_virial_shape.AddDim (nframes); - atom_virial_shape.AddDim (9 * nall); - Tensor* atom_virial_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); - - // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); - auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); - - for (int II = 0; II < nframes; II++) { - ProdVirialSeRLauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; - } -private: - int n_r_sel, n_a_sel, n_a_shift; -}; - -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_GPU), ProdVirialSeROp); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index 20b53cf3c9..61b8970e71 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -6,32 +6,19 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeRGradOp : public OpKernel { public: @@ -90,12 +77,12 @@ class ProdVirialSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -135,4 +122,10 @@ class ProdVirialSeRGradOp : public OpKernel } }; 
-REGISTER_KERNEL_BUILDER(Name("ProdVirialSeRGrad").Device(DEVICE_CPU), ProdVirialSeRGradOp); +// Register the GPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeRGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_multi_device.cc b/source/op/prod_virial_se_r_multi_device.cc new file mode 100644 index 0000000000..61e37eb215 --- /dev/null +++ b/source/op/prod_virial_se_r_multi_device.cc @@ -0,0 +1,114 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("ProdVirialSeR") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("virial: T") + .Output("atom_virial: T"); + +template +struct ProdVirialSeRFunctor { + void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRCPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; + +template +class ProdVirialSeROp : public OpKernel { + public: + explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& rij_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + 
OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + + // Create an output tensor + TensorShape virial_shape ; + virial_shape.AddDim (nframes); + virial_shape.AddDim (9); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); + TensorShape atom_virial_shape; + atom_virial_shape.AddDim (nframes); + atom_virial_shape.AddDim (9 * nall); + Tensor* atom_virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + ProdVirialSeRFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index 6b23305d3e..6f3bc58932 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -8,16 +8,11 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinSwitch") +.Attr("T: {float, double}") .Input("type: int32") -.Input("rij: double") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("sel_a: list(int)") @@ -25,25 +20,14 @@ REGISTER_OP("SoftMinSwitch") .Attr("alpha: float") .Attr("rmin: float") .Attr("rmax: float") -.Output("sw_value: double") -.Output("sw_deriv: double"); -#else -REGISTER_OP("SoftMinSwitch") -.Input("type: int32") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("alpha: float") -.Attr("rmin: float") -.Attr("rmax: float") -.Output("sw_value: float") -.Output("sw_deriv: float"); -#endif +.Output("sw_value: T") +.Output("sw_deriv: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinSwitchOp : public OpKernel { public: explicit SoftMinSwitchOp(OpKernelConstruction* context) : OpKernel(context) { @@ -106,10 +90,10 @@ class SoftMinSwitchOp : public OpKernel { // flat the tensors auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto sw_value = sw_value_tensor ->matrix(); - auto sw_deriv = 
sw_deriv_tensor ->matrix(); + auto sw_value = sw_value_tensor ->matrix(); + auto sw_deriv = sw_deriv_tensor ->matrix(); // loop over samples #pragma omp parallel for @@ -126,26 +110,26 @@ class SoftMinSwitchOp : public OpKernel { // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; - VALUETYPE aa = 0; - VALUETYPE bb = 0; + FPTYPE aa = 0; + FPTYPE bb = 0; for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (i_idx * nnei + jj) * 3; - VALUETYPE dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); aa += ee; bb += rr * ee; } - VALUETYPE smin = bb / aa; - VALUETYPE vv, dd; - spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); + FPTYPE smin = bb / aa; + FPTYPE vv, dd; + spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); // value of switch sw_value(kk, i_idx) = vv; // deriv of switch distributed as force @@ -153,17 +137,17 @@ class SoftMinSwitchOp : public OpKernel { int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (ii * nnei + jj) * 3; - VALUETYPE dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); - VALUETYPE pref_c = (1./rr - 1./alpha) * ee ; - VALUETYPE pref_d = 1./(rr * alpha) * ee; - VALUETYPE ts; + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); + FPTYPE pref_c = (1./rr - 1./alpha) * ee ; + FPTYPE pref_d = 1./(rr * alpha) * ee; + FPTYPE ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv(kk, rij_idx_shift + 0) += ts * dr[0]; sw_deriv(kk, rij_idx_shift + 1) += ts * dr[1]; @@ -196,7 +180,12 @@ class SoftMinSwitchOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinSwitch").Device(DEVICE_CPU), SoftMinSwitchOp); - +// Register the CPU kernels. 
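For reference, the quantity handled by SoftMinSwitchOp above is a smoothed minimum of the neighbor distances, and the derivative it scatters into sw_deriv is fully determined by the loop body; restating that algebra in formulas (no new behavior is introduced here):

    A_i = \sum_{j} e^{-r_{ij}/\alpha}, \qquad
    B_i = \sum_{j} r_{ij}\, e^{-r_{ij}/\alpha}, \qquad
    s_i = \frac{B_i}{A_i} \quad (\text{the variable smin}),

    \mathrm{sw\_value}_i = S(s_i), \qquad
    \frac{\partial S(s_i)}{\partial \vec r_{ij}}
      = \frac{S'(s_i)}{A_i^{2}}
        \Big[\Big(\tfrac{1}{r_{ij}}-\tfrac{1}{\alpha}\Big) A_i
             + \tfrac{1}{\alpha\, r_{ij}}\, B_i\Big]\,
        e^{-r_{ij}/\alpha}\, \vec r_{ij},

where S and S' are the vv and dd returned by spline5_switch, the two bracketed terms correspond to aa*pref_c and bb*pref_d, and the full prefactor is the variable ts that multiplies dr[0..2].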
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinSwitch").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinSwitchOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index e51aadbc79..ffd6442a1a 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -6,34 +6,21 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("SoftMinForce") -.Input("du: double") -.Input("sw_deriv: double") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: double"); -#else -REGISTER_OP("SoftMinForce") -.Input("du: float") -.Input("sw_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinForceOp : public OpKernel { public: explicit SoftMinForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,10 +65,10 @@ class SoftMinForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto force = force_tensor->matrix(); + auto force = force_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -118,4 +105,10 @@ class SoftMinForceOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForce").Device(DEVICE_CPU), SoftMinForceOp); +// Register the CPU kernels. 
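SoftMinForce, whose interface is retyped above (the accumulation loop itself is elided from this hunk), combines the two quantities produced so far by the chain rule: du holds dE/d(sw_value_i) and sw_deriv holds the pair derivatives of sw_value_i. Schematically, and up to the sign and index conventions of the elided loop,

    \vec F_j = -\frac{\partial E}{\partial \vec x_j}
             = -\sum_i \frac{\partial E}{\partial\, \mathrm{sw\_value}_i}\,
                       \frac{\partial\, \mathrm{sw\_value}_i}{\partial \vec x_j},
    \qquad
    \frac{\partial\, \mathrm{sw\_value}_i}{\partial \vec x_j}
      = \frac{\partial\, \mathrm{sw\_value}_i}{\partial \vec r_{ij}}
      \quad (j \neq i), \qquad \vec r_{ij} = \vec x_j - \vec x_i,

so each neighbor receives its pair term while the center atom collects the opposite sum over its neighbors.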
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index 4c8a4b21ff..2461828853 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -6,34 +6,20 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("SoftMinForceGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinForceGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinForceGradOp : public OpKernel { public: @@ -89,11 +75,11 @@ class SoftMinForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -125,4 +111,10 @@ class SoftMinForceGradOp : public OpKernel int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForceGrad").Device(DEVICE_CPU), SoftMinForceGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 193c34f981..f369808685 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -6,40 +6,23 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("SoftMinVirial") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("SoftMinVirial") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinVirialOp : public OpKernel { public: explicit SoftMinVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -94,12 +77,12 @@ class SoftMinVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); - auto rij = rij_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); + auto rij = rij_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto virial = virial_tensor->matrix(); - auto atom_virial = atom_virial_tensor->matrix(); + auto virial = virial_tensor->matrix(); + auto atom_virial = atom_virial_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -122,7 +105,7 @@ class SoftMinVirialOp : public OpKernel { int rij_idx_shift = (ii * nnei + jj) * 3; for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); + FPTYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); virial(kk, dd0 * 3 + dd1) -= tmp_v; atom_virial(kk, j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -135,7 +118,12 @@ class SoftMinVirialOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirial").Device(DEVICE_CPU), SoftMinVirialOp); - +// Register the CPU kernels. 
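The inner double loop of SoftMinVirialOp above accumulates, for every pair, the outer product of the switch derivative and the pair vector weighted by du; written out, the global and per-atom virials it produces are

    W_{\alpha\beta} = -\sum_i \sum_{j \in N(i)}
        \frac{\partial E}{\partial\, \mathrm{sw\_value}_i}\,
        \frac{\partial\, \mathrm{sw\_value}_i}{\partial r_{ij,\alpha}}\; r_{ij,\beta},
    \qquad
    W^{(j)}_{\alpha\beta} = -\sum_{i:\, j \in N(i)}
        \frac{\partial E}{\partial\, \mathrm{sw\_value}_i}\,
        \frac{\partial\, \mathrm{sw\_value}_i}{\partial r_{ij,\alpha}}\; r_{ij,\beta},

with du = dE/d(sw_value), sw_deriv the derivative computed by SoftMinSwitch, and tmp_v one (alpha, beta) term of the sum; each pair contribution is assigned to the neighbor j, matching the 9 * nall layout of atom_virial.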
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index 6f8703bdee..0bafaa5bfa 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -6,36 +6,22 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinVirialGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinVirialGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinVirialGradOp : public OpKernel { public: @@ -97,12 +83,12 @@ class SoftMinVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -148,4 +134,10 @@ class SoftMinVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirialGrad").Device(DEVICE_CPU), SoftMinVirialGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tab_inter.cc b/source/op/tab_inter.cc index 242e52a6e7..63c7af210c 100644 --- a/source/op/tab_inter.cc +++ b/source/op/tab_inter.cc @@ -6,44 +6,26 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("TabInter") +.Attr("T: {float, double}") .Input("table_info: double") .Input("table_data: double") .Input("type: int32") -.Input("rij: double") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Input("scale: double") +.Input("scale: T") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("atom_energy: double") -.Output("force: double") -.Output("atom_virial: double"); -#else -REGISTER_OP("TabInter") -.Input("table_info: double") -.Input("table_data: double") -.Input("type: int32") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Input("scale: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("atom_energy: float") -.Output("force: float") -.Output("atom_virial: float"); -#endif +.Output("atom_energy: T") +.Output("force: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + inline void tabulated_inter (double & ener, double & fscale, @@ -86,6 +68,7 @@ void tabulated_inter (double & ener, fscale *= -hi; } +template class TabInterOp : public OpKernel { public: explicit TabInterOp(OpKernelConstruction* context) : OpKernel(context) { @@ -156,15 +139,15 @@ class TabInterOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(tmp_idx++, virial_shape, &virial_tensor)); // flat the tensors - auto table_info = table_info_tensor.flat(); - auto table_data = table_data_tensor.flat(); + auto table_info = table_info_tensor.flat(); + auto table_data = table_data_tensor.flat(); auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto scale = scale_tensor .matrix(); - auto energy = energy_tensor ->matrix(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto scale = scale_tensor .matrix(); + auto energy = energy_tensor ->matrix(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(table_info(3)+0.1)), errors::InvalidArgument ("ntypes provided in table does not match deeppot")); int nspline = table_info(2)+0.1; @@ -203,7 +186,7 @@ class TabInterOp : public OpKernel { for (int tt = 0; tt < ntypes; ++tt) { for (int ii = 0; ii < natoms(2+tt); ++ii){ int i_type = type(kk, i_idx); - VALUETYPE i_scale = scale(kk, i_idx); + FPTYPE i_scale = scale(kk, i_idx); assert(i_type == tt) ; int jiter = 0; // a neighbor @@ -318,7 +301,13 @@ class TabInterOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("TabInter").Device(DEVICE_CPU), TabInterOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T)                                               \
+REGISTER_KERNEL_BUILDER(                                              \
+    Name("TabInter").Device(DEVICE_CPU).TypeConstraint<T>("T"),       \
+    TabInterOp<CPUDevice, T>);
+REGISTER_CPU(float);
+REGISTER_CPU(double);
diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py
index e46265231f..1c93e92f02 100644
--- a/source/train/DescrptSeA.py
+++ b/source/train/DescrptSeA.py
@@ -391,14 +391,18 @@ def _filter(self,
           xyz_scatter = tf.matmul(xyz_scatter, w)
           # natom x nei_type_i x out_size
           xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1]))
-          xyz_scatter_total.append(xyz_scatter)
+          # xyz_scatter_total.append(xyz_scatter)
+          if type_i == 0 :
+            xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)
+          else :
+            xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)
         # natom x nei x outputs_size
-        xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
+        # xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
         # natom x nei x 4
-        inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4])
+        # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4])
         # natom x 4 x outputs_size
-        xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True)
+        # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True)
         xyz_scatter_1 = xyz_scatter_1 * (4.0 / shape[1])
         # natom x 4 x outputs_size_2
         xyz_scatter_2 = tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2])
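The DescrptSeA.py change above is an algebraic refactor rather than a behavioral one: instead of concatenating the per-neighbor-type filter outputs and multiplying by the full reshaped input once, the product is accumulated type by type. It relies on the block identity

    R^{\mathsf T} G
      = \begin{pmatrix} R_1^{\mathsf T} & R_2^{\mathsf T} & \cdots \end{pmatrix}
        \begin{pmatrix} G_1 \\ G_2 \\ \vdots \end{pmatrix}
      = \sum_{t} R_t^{\mathsf T} G_t,

where, per atom, R_t is the (neighbors of type t) x 4 block reshaped from inputs_i and G_t the matching filter output xyz_scatter. The accumulated xyz_scatter_1 is therefore the same natom x 4 x outputs_size tensor as before, the concatenated xyz_scatter_total is no longer materialized, and the subsequent (4.0 / shape[1]) normalization and tf.slice are unchanged.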