diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 84c3d326da..dc35ee5dc0 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -125,7 +125,7 @@ else() endif() endif() if (USE_CUDA_TOOLKIT) - add_definitions("-DUSE_CUDA_TOOLKIT") + add_definitions("-D GOOGLE_CUDA") endif() # define USE_TTM diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h new file mode 100644 index 0000000000..c446db8130 --- /dev/null +++ b/source/lib/include/CustomeOperation.h @@ -0,0 +1,572 @@ +#pragma once +#include +#include +#include +#include +#include "MathUtilities.h" +#if GOOGLE_CUDA +#include "DeviceFunctor.h" +#endif // GOOGLE_CUDA + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +struct NeighborInfo { + int type; + double dist; + int index; + NeighborInfo () : type (0), dist(0), index(0) {} + NeighborInfo (int tt, double dd, int ii) : type (tt), dist(dd), index(ii) {} + + bool operator < (const NeighborInfo & b) const { + return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index)))); + } +}; + +template +inline void spline5_switch ( + FPTYPE & vv, + FPTYPE & dd, + const FPTYPE & xx, + const float & rmin, + const float & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +template +int format_nlist_fill_se_a_cpu ( + vector & fmt_nei_idx_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & nei_idx_a, + const float & rcut, + const vector & sec_a) +{ + fmt_nei_idx_a.resize (sec_a.back()); + fill (fmt_nei_idx_a.begin(), fmt_nei_idx_a.end(), -1); + + // gether all neighbors + std::vector nei_idx (nei_idx_a); + // allocate the information for all neighbors + vector sel_nei; + sel_nei.reserve (nei_idx_a.size()); + for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { + FPTYPE diff[3]; + const int & j_idx = nei_idx[kk]; + for (int dd = 0; dd < 3; ++dd) { + diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + FPTYPE rr = sqrt(MathUtilities::dot (diff, diff)); + if (rr <= rcut) { + sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx)); + } + } + sort(sel_nei.begin(), sel_nei.end()); + + std::vector nei_iter = sec_a; + int overflowed = -1; + for (unsigned kk = 0; kk < sel_nei.size(); ++kk) { + const int & nei_type = sel_nei[kk].type; + if (nei_iter[nei_type] < sec_a[nei_type+1]) { + fmt_nei_idx_a[nei_iter[nei_type] ++] = sel_nei[kk].index; + } + } + return overflowed; +} + +template +void compute_descriptor_se_a_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize 
(sec_a.back() * 4); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 4 * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions + int idx_value = nei_iter * 4; // 4 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + descrpt_a[idx_value + 1] = rr[0] / nr2; + descrpt_a[idx_value + 2] = rr[1] / nr2; + descrpt_a[idx_value + 3] = rr[2] / nr2; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // deriv of component x/r2 + descrpt_a_deriv[idx_deriv + 3] = (2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 5] = (2. * rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr; + // deriv of component y/r2 + descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr; + // deriv of component z/r2 + descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv +11] = (2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + descrpt_a[idx_value + 1] *= sw; + descrpt_a[idx_value + 2] *= sw; + descrpt_a[idx_value + 3] *= sw; + } + } +} + +template +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +#if GOOGLE_CUDA +template +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op DescrptSeA +// ****************************************************************************** + +inline void 
make_descript_range (int & idx_start, int & idx_end, const int & nei_idx, const int& n_a_sel, const int n_a_shift) { + if (nei_idx < n_a_sel) { + idx_start = nei_idx * 4; + idx_end = nei_idx * 4 + 4; + } + else { + idx_start = n_a_shift + (nei_idx - n_a_sel); + idx_end = n_a_shift + (nei_idx - n_a_sel) + 1; + } +} + +template +void ProdForceSeACPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdForceSeAGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} +#endif // GOOGLE_CUDA + +// ****************************************************************************** +// end of custome op ProdForceSeA +// ****************************************************************************** + +template +void ProdVirialSeACPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdVirialSeAGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, 
const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op ProdVirialSeA +// ****************************************************************************** + +template +void GeluCPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); + } +} + +template +void GeluGradCPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + for (int ii = 0; ii < size; ii++) { + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } +} + +template +void GeluGradGradCPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + for (int ii = 0; ii < size; ii++) { + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } +} + +#if GOOGLE_CUDA +template +void GeluGPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { + GeluGPUExecuteFunctor()(in, out, size); +} + +template +void GeluGradGPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGPUExecuteFunctor()(dy, in, out, size); +} + +template +void GeluGradGradGPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op Gelu +// ****************************************************************************** + +template +void compute_descriptor_se_r_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize (sec_a.back()); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, 
rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 3; // 1 components time 3 directions + int idx_value = nei_iter; // 1 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + } + } +} + +template +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_r_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +#if GOOGLE_CUDA +template +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector 
sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op DescrptSeR +// ****************************************************************************** + +template +void ProdForceSeRCPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2]; + } + } +} + +#if GOOGLE_CUDA +template +void ProdForceSeRGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA + +// ****************************************************************************** +// end of custome op ProdForceSeR +// ****************************************************************************** + +template +void ProdVirialSeRCPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + jj]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA +// 
****************************************************************************** +// end of custome op ProdVirialSeR +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h new file mode 100644 index 0000000000..d51d617f84 --- /dev/null +++ b/source/lib/include/DeviceFunctor.h @@ -0,0 +1,62 @@ +#pragma once +#include +#include +#include +#include +#include + +typedef unsigned long long int_64; +#define SQRT_2_PI 0.7978845608028654 + +#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +template +struct DescrptSeAGPUExecuteFunctor { + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template +struct DescrptSeRGPUExecuteFunctor { + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template +struct ProdForceSeAGPUExecuteFunctor { + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdForceSeRGPUExecuteFunctor { + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); +}; + +template +struct ProdVirialSeAGPUExecuteFunctor { + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdVirialSeRGPUExecuteFunctor { + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); +}; + +template +struct GeluGPUExecuteFunctor { + void operator()(const FPTYPE * in, FPTYPE * out, const int size); +}; + +template +struct GeluGradGPUExecuteFunctor { + void operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, const int size); +}; + +template +struct GeluGradGradGPUExecuteFunctor { + void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const int size); +}; \ No newline at end of file diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index a32940a738..6c37770758 100644 --- 
a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -98,9 +98,6 @@ class NNPInter // function used for neighbor list copy vector get_sel_a() const; -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; class NNPInterModelDevi @@ -195,9 +192,6 @@ class NNPInterModelDevi // function used for nborlist copy vector > get_sel() const; void cum_sum(const std::vector > n_sel); -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 3912f21f7f..4874274305 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -8,11 +8,17 @@ using namespace tensorflow; using namespace std; +#include +#include #include "NNPAtomMap.h" #include +#include +#include #include "version.h" +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; #ifdef HIGH_PREC typedef double VALUETYPE; typedef double ENERGYTYPE; @@ -122,6 +128,20 @@ session_input_tensors (std::vector> & input_tensors, const int nghost = 0, const string scope = ""); +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope = ""); + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index aea9d48c9b..879c36d5c0 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -4,11 +4,8 @@ #include -#ifdef USE_CUDA_TOOLKIT +#if GOOGLE_CUDA #include "cuda_runtime.h" -#include -#include -#include #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) @@ -57,7 +54,6 @@ run_model (ENERGYTYPE & dener, return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, {"o_energy", "o_force", "o_atom_virial"}, @@ -74,56 +70,23 @@ run_model (ENERGYTYPE & dener, dener = oe(0); vector dforce (3 * nall); - vector datom_virial (9 * nall); dvirial.resize (9); for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } for (int ii = 0; ii < nall; ++ii) { - dvirial[0] += 1.0 * datom_virial[9*ii+0]; - dvirial[1] += 1.0 * datom_virial[9*ii+1]; - dvirial[2] += 1.0 * datom_virial[9*ii+2]; - dvirial[3] += 1.0 * datom_virial[9*ii+3]; - dvirial[4] += 1.0 * datom_virial[9*ii+4]; - dvirial[5] += 1.0 * datom_virial[9*ii+5]; - dvirial[6] += 1.0 * datom_virial[9*ii+6]; - dvirial[7] += 1.0 * datom_virial[9*ii+7]; - dvirial[8] += 1.0 * datom_virial[9*ii+8]; - } - - dforce_ = dforce; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - - dener = oe(0); - vector dforce (3 * nall); - dvirial.resize (9); - for (unsigned ii = 0; ii < nall * 3; ++ii){ - dforce[ii] = of(ii); - } - for (unsigned ii = 0; ii 
< 9; ++ii){ - dvirial[ii] = ov(ii); + dvirial[0] += 1.0 * oav(9*ii+0); + dvirial[1] += 1.0 * oav(9*ii+1); + dvirial[2] += 1.0 * oav(9*ii+2); + dvirial[3] += 1.0 * oav(9*ii+3); + dvirial[4] += 1.0 * oav(9*ii+4); + dvirial[5] += 1.0 * oav(9*ii+5); + dvirial[6] += 1.0 * oav(9*ii+6); + dvirial[7] += 1.0 * oav(9*ii+7); + dvirial[8] += 1.0 * oav(9*ii+8); } dforce_ = dforce; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#endif } static void run_model (ENERGYTYPE & dener, @@ -155,7 +118,6 @@ static void run_model (ENERGYTYPE & dener, fill(datom_virial_.begin(), datom_virial_.end(), 0.0); return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, @@ -204,50 +166,6 @@ static void run_model (ENERGYTYPE & dener, nnpmap.backward (dforce_.begin(), dforce.begin(), 3); nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - Tensor output_ae = output_tensors[3]; - Tensor output_av = output_tensors[4]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - auto oae = output_ae.flat (); - auto oav = output_av.flat (); - - dener = oe(0); - vector dforce (3 * nall); - vector datom_energy (nall, 0); - vector datom_virial (9 * nall); - dvirial.resize (9); - for (int ii = 0; ii < nall * 3; ++ii) { - dforce[ii] = of(ii); - } - for (int ii = 0; ii < nloc; ++ii) { - datom_energy[ii] = oae(ii); - } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } - for (int ii = 0; ii < 9; ++ii) { - dvirial[ii] = ov(ii); - } - dforce_ = dforce; - datom_energy_ = datom_energy; - datom_virial_ = datom_virial; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); - nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); - nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#endif } @@ -266,50 +184,8 @@ NNPInter (const string & model, const int & gpu_rank) init(model, gpu_rank); } -NNPInter::~NNPInter() { - #ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } - #endif -} - -#ifdef USE_CUDA_TOOLKIT -void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - 
cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif // USE_CUDA_TOOLKIT +NNPInter::~NNPInter() {} -#ifdef USE_CUDA_TOOLKIT void NNPInter:: init (const string & model, const int & gpu_rank) @@ -318,21 +194,21 @@ init (const string & model, const int & gpu_rank) SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); // check current device environment if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); - // std::cout << "current device rank: " << str << std::endl; } + #endif // GOOGLE_CUDA checkStatus (NewSession(options, &session)); checkStatus (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -340,8 +216,6 @@ init (const string & model, const int & gpu_rank) ntypes = get_scalar("descrpt_attr/ntypes"); dfparam = get_scalar("fitting_attr/dfparam"); daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; @@ -350,38 +224,6 @@ init (const string & model, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInter:: -init (const string & model, const int & gpu_rank) -{ - assert (!inited); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - checkStatus (NewSession(options, &session)); - checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); - checkStatus (session->Create(graph_def)); - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - // dfparam = get_dfparam(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif void NNPInter:: @@ -554,17 +396,10 @@ compute_inner (ENERGYTYPE & dener, nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); - 
shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif + shuffle_nlist (nlist, nnpmap); } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); } @@ -622,16 +457,9 @@ compute (ENERGYTYPE & dener, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } @@ -663,17 +491,8 @@ NNPInterModelDevi (const vector & models, const int & gpu_rank) init(models, gpu_rank); } -NNPInterModelDevi::~NNPInterModelDevi() { -#ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } -#endif -} +NNPInterModelDevi::~NNPInterModelDevi() {} -#ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: init (const vector & models, const int & gpu_rank) @@ -682,26 +501,32 @@ init (const vector & models, const int & gpu_rank) numb_models = models.size(); sessions.resize(numb_models); graph_defs.resize(numb_models); + + int gpu_num = -1; + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); + #endif // GOOGLE_CUDA + SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - for (unsigned ii = 0; ii < numb_models; ++ii){ checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); } - int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - for (unsigned ii = 0; ii < numb_models; ++ii){ - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); + } + #endif // GOOGLE_CUDA + + for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_defs[ii]); - // std::cout << "current device rank: " << str << std::endl; } checkStatus (NewSession(options, &(sessions[ii]))); checkStatus (sessions[ii]->Create(graph_defs[ii])); @@ 
-722,40 +547,6 @@ init (const vector & models, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInterModelDevi:: -init (const vector & models, const int & gpu_rank) -{ - assert (!inited); - numb_models = models.size(); - sessions.resize(numb_models); - graph_defs.resize(numb_models); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - for (unsigned ii = 0; ii < numb_models; ++ii){ - checkStatus (NewSession(options, &(sessions[ii]))); - checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); - checkStatus (sessions[ii]->Create(graph_defs[ii])); - } - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif template VT @@ -821,42 +612,6 @@ cum_sum (const std::vector > n_sel) } } -#ifdef USE_CUDA_TOOLKIT -void -NNPInterModelDevi:: -update_nbor(const InternalNeighborList & nlist, const int nloc) -{ - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif //USE_CUDA_TOOLKIT - void NNPInterModelDevi:: validate_fparam_aparam(const int & nloc, @@ -946,16 +701,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); 
all_force.resize (numb_models); @@ -996,16 +743,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); all_force .resize (numb_models); diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index dd4c3f672a..d990195e6c 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -340,6 +340,136 @@ session_input_tensors (std::vector> & input_tensors, return nloc; } +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii){ + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape ; + mesh_shape.AddDim (16); + TensorShape natoms_shape ; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape ; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + +#ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor (DT_DOUBLE, fparam_shape); + Tensor aparam_tensor (DT_DOUBLE, aparam_shape); +#else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor (DT_FLOAT, fparam_shape); + Tensor aparam_tensor (DT_FLOAT, aparam_shape); +#endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor (DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < nall * 3; ++jj){ + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj){ + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj){ + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj){ + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj 
< aparam_.size(); ++jj){ + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; + + const int stride = sizeof(int *) / sizeof(int); + assert (stride * sizeof(int) == sizeof(int *)); + assert (stride <= 4); + mesh (0) = ago; + mesh (1) = dlist.ilist.size(); + mesh (2) = dlist.jrange.size(); + mesh (3) = dlist.jlist.size(); + dlist.make_ptrs(); + memcpy (&mesh(4), &(dlist.pilist), sizeof(int *)); + memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *)); + memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + input_tensors = { + {prefix+"t_coord", coord_tensor}, + {prefix+"t_type", type_tensor}, + {prefix+"t_box", box_tensor}, + {prefix+"t_mesh", mesh_tensor}, + {prefix+"t_natoms",natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({prefix+"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({prefix+"t_aparam", aparam_tensor}); + } + return nloc; +} + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 993a1b6fd4..c4d97cd85a 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,8 +4,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) -file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) @@ -27,7 +26,7 @@ if (BUILD_PY_IF) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) if (USE_CUDA_TOOLKIT) - add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB}) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) add_subdirectory(cuda) find_package(CUDA REQUIRED) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 1636df8ff5..5965254111 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -1,42 +1,7 @@ -/* Copyright 2015 The TensorFlow Authors. 
All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU -#include -#include -#include #include #include #include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -72,24 +37,25 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -110,7 +76,8 @@ __global__ void get_i_idx_se_a(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_a(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, @@ -120,8 +87,8 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -133,12 +100,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -180,33 +147,34 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! 
-__global__ void compute_descriptor_se_a (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_a (FPTYPE* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + FPTYPE* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, + const FPTYPE* coord, + const float rmin, + const float rmax, const int sec_a_size) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions const int idx_value = idy * 4; // 4 components if (idy >= sec_a_size) {return;} // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { @@ -214,14 +182,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, for (int kk = 0; kk < 3; kk++) { row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &row_rij[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; @@ -260,8 +228,9 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +template void format_nbor_list_256 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -274,9 +243,10 @@ void format_nbor_list_256 ( const int LEN = 256; const int MAGIC_NUMBER = 256; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -292,8 +262,9 @@ void format_nbor_list_256 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_512 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -306,9 +277,10 @@ void format_nbor_list_512 ( const int LEN = 256; const int MAGIC_NUMBER = 512; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -324,8 +296,9 @@ void format_nbor_list_512 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_1024 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* 
jlist, @@ -338,9 +311,10 @@ void format_nbor_list_1024 ( const int LEN = 256; const int MAGIC_NUMBER = 1024; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -356,8 +330,9 @@ void format_nbor_list_1024 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_2048 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -370,9 +345,10 @@ void format_nbor_list_2048 ( const int LEN = 256; const int MAGIC_NUMBER = 2048; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -388,8 +364,9 @@ void format_nbor_list_2048 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_4096 ( - const VALUETYPE* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -402,9 +379,10 @@ void format_nbor_list_4096 ( const int LEN = 256; const int MAGIC_NUMBER = 4096; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -420,29 +398,8 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, - const int MAGIC_NUMBER -) -{ +template +void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -454,8 +411,8 @@ void DescrptSeALauncher(const VALUETYPE* coord, res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ @@ -534,8 +491,9 @@ void DescrptSeALauncher(const VALUETYPE* coord, } const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - 
compute_descriptor_se_a<<>> ( + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_a<<>> ( descript, ndescrpt, descript_deriv, @@ -552,4 +510,7 @@ void DescrptSeALauncher(const VALUETYPE* coord, rcut_r, sec_a.back() ); -} \ No newline at end of file +} + +template struct DescrptSeAGPUExecuteFunctor; +template struct DescrptSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index fa9678be34..a65ba5887a 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -1,45 +1,7 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU -#include -#include -#include #include #include #include -#include -#include - -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -48,7 +10,7 @@ template < __launch_bounds__ (BLOCK_THREADS) __global__ void BlockSortKernel( Key * d_in, - Key * d_out) // Tile of output + Key * d_out) // Tile of output { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) @@ -75,24 +37,25 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. 
/ (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -104,7 +67,7 @@ __device__ inline void spline5_switch(compute_t & vv, __global__ void get_i_idx_se_r(const int nloc, const int * ilist, - int * i_idx) + int * i_idx) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; if(idy >= nloc) { @@ -113,17 +76,19 @@ __global__ void get_i_idx_se_r(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_r(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -135,12 +100,12 @@ __global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -153,9 +118,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, const int * jrange, const int * jlist, int_64 * key, - const int * sec, + const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -169,137 +135,329 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; for (int ii = 0; ii < sec_a_size; ii++) { - nei_iter[ii] = sec[ii]; + nei_iter[ii] = sec_a[ii]; } for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { const int & nei_type = key_out[kk] / 1E15; - if (nei_iter[nei_type] < sec[nei_type + 1]) { + if (nei_iter[nei_type] < sec_a[nei_type + 1]) { row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; } } } //it's ok! 
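The launches in these files were reorganized so that the atom index maps to gridDim.x while the neighbor (or descriptor) slot is tiled across gridDim.y * blockDim.y, matching the kernel-side indexing idx = blockIdx.x, idy = blockIdx.y * blockDim.y + threadIdx.y. The toy kernel below is only a sketch of that mapping under the same dim3 configuration; fill_demo and launch_fill_demo are hypothetical names, not code from this patch. One practical consequence of the swap is that the atom count now sits on gridDim.x, which admits far more blocks than the 65535 allowed on gridDim.y, where it previously lived.

// Toy illustration of the launch geometry used above: one block row per atom
// on grid.x, neighbor slots tiled over grid.y * blockDim.y.
__global__ void fill_demo(int * out, const int nloc, const int max_nei) {
    const unsigned int idx = blockIdx.x;                             // atom index
    const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;  // neighbor slot
    if (idx >= nloc || idy >= max_nei) { return; }
    out[idx * max_nei + idy] = idy;   // each (atom, neighbor-slot) pair handled once
}

void launch_fill_demo(int * out, const int nloc, const int max_nei) {
    const int LEN = 256;
    const int nblock = (max_nei + LEN - 1) / LEN;
    dim3 block_grid(nloc, nblock);    // grid.x = atoms, grid.y = neighbor tiles
    dim3 thread_grid(1, LEN);         // 1 x 256 threads per block
    fill_demo<<<block_grid, thread_grid>>>(out, nloc, max_nei);
}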
-__global__ void compute_descriptor_se_r (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_r (FPTYPE* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + FPTYPE* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, - compute_t* sel_diff_dev, - const int sec_size) + const FPTYPE* coord, + const float rmin, + const float rmax, + const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const int idx_deriv = idy * 3; // 1 components time 3 directions - const int idx_value = idy; // 1 components - if (idy >= sec_size) {return;} - + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const int idx_deriv = idy * 3; // 4 components time 3 directions + const int idx_value = idy; // 4 components + if (idy >= sec_a_size) {return;} + // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; - + if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_diff[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // 1 value components + // 4 value components row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; } for (int ii = 0; ii < 1; ii++) { row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; } 
+ // idy nloc, idx ndescrpt * 3 + // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; for (int ii = 0; ii < 3; ii++) { row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -) +template +void format_nbor_list_256 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_512 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_1024 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) { + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_2048 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void 
format_nbor_list_4096 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; - int * sec_dev = array_int; - int * nei_iter = array_int + sec.size(); // = new int[sec_a_size]; - int * i_idx = array_int + sec.size() + nloc * sec.size(); + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei cudaError_t res = cudaSuccess; - res = cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { + // ~~~ // cudaProfilerStart(); get_i_idx_se_r<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_r<<>> ( - coord, - type, - jrange, - jlist, - rcut, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = 64; - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_r<<>> ( nlist, nnei, @@ -307,14 +465,17 @@ void DescrptSeRLauncher(const VALUETYPE* coord, jrange, jlist, key, - sec_dev, - 
sec.size(), - nei_iter + sec_a_dev, + sec_a.size(), + nei_iter, + MAGIC_NUMBER ); } - const int nblock_ = (sec.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - compute_descriptor_se_r<<>> ( + + const int nblock_ = (sec_a.back() + LEN -1) / LEN; + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_r<<>> ( descript, ndescrpt, descript_deriv, @@ -327,9 +488,11 @@ void DescrptSeRLauncher(const VALUETYPE* coord, nlist, nnei, coord, - rcut_smth, - rcut, - sel_diff, - sec.back() + rcut_r_smth, + rcut_r, + sec_a.back() ); -} \ No newline at end of file +} + +template struct DescrptSeRGPUExecuteFunctor; +template struct DescrptSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index 99b7b1aed4..6329c8f085 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -1,39 +1,35 @@ -#include -#include +#include "DeviceFunctor.h" -#define SQRT_2_PI 0.7978845608028654 - -template -__global__ void gelu(const T * in, T * out, int const size) { +template +__global__ void gelu(const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); } -template -__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) { +template +__global__ void gelu_grad(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5); } -template -__global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +__global__ void gelu_grad_grad(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); } - void GeluLauncher(const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; @@ -75,3 +71,34 @@ void GeluGradGradLauncher(const double * dy, const double * dy_, const double * gelu_grad_grad<<>>(dy, dy_, in, out, size); } + +template +void GeluGPUExecuteFunctor::operator()(const FPTYPE * in, FPTYPE * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +template +void 
GeluGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +template +void GeluGradGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad_grad<<>>(dy, dy_, in, out, size); +} + +template struct GeluGPUExecuteFunctor; +template struct GeluGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 080ff8ef75..1667c15f90 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -1,22 +1,4 @@ -#include -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -32,25 +14,25 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; - if (idy >= ndescrpt) { - return; - } + if (idy >= ndescrpt) {return;} atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, @@ -75,23 +57,24 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +void ProdForceSeAGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, - const int ndescrpt, const int nnei, + const int ndescrpt, const int n_a_sel, const int n_a_shift) { // std::cout << "I'm here!" 
<< std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; @@ -99,4 +82,7 @@ void ProdForceSeALauncher(VALUETYPE * force, dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3, 4); deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file +} + +template struct ProdForceSeAGPUExecuteFunctor; +template struct ProdForceSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu index 765842d9c3..5a4b582dd0 100644 --- a/source/op/cuda/prod_force_se_r.cu +++ b/source/op/cuda/prod_force_se_r.cu @@ -1,21 +1,4 @@ -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -31,31 +14,29 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - - if (idy >= ndescrpt) { - return; - } + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; + if (idy >= ndescrpt) {return;} + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idy -> nnei const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -73,27 +54,30 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +void ProdForceSeRGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, - const int ndescrpt, const int nnei, - const int n_a_sel, - const int n_a_shift) -{ - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * 
nall * 3)); + const int ndescrpt) +{ + // std::cout << "I'm here!" << std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3); - deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file + deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); +} + +template struct ProdForceSeRGPUExecuteFunctor; +template struct ProdForceSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index 241e2b7e06..e084720c6d 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -1,21 +1,4 @@ -#include -#include - -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -31,11 +14,12 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, @@ -64,21 +48,22 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +void ProdVirialSeAGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, - const int n_a_shift) + const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; @@ -99,3 +84,6 @@ void ProdVirialSeALauncher(VALUETYPE * virial, n_a_shift ); } + +template struct ProdVirialSeAGPUExecuteFunctor; +template struct ProdVirialSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_r.cu 
b/source/op/cuda/prod_virial_se_r.cu index a2c02007fc..9b8f43543f 100644 --- a/source/op/cuda/prod_virial_se_r.cu +++ b/source/op/cuda/prod_virial_se_r.cu @@ -1,24 +1,5 @@ -#include -#include -#include +#include "DeviceFunctor.h" -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// currently, double precision atomicAdd only support arch number larger than 6.0 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -33,17 +14,16 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idx -> nloc // idy -> nnei @@ -61,26 +41,26 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, if (j_idx < 0) { return; } + // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); } -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +void ProdVirialSeRGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); - const int LEN = 16; + const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 9); @@ -94,8 +74,9 @@ void ProdVirialSeRLauncher(VALUETYPE * virial, nlist, nloc, nnei, - ndescrpt, - n_a_sel, - n_a_shift + ndescrpt ); -} \ No newline at end of file +} + +template struct ProdVirialSeRGPUExecuteFunctor; +template struct ProdVirialSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 75c7640b2b..147ca687cf 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -12,54 +12,30 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice 
= Eigen::ThreadPoolDevice; -#ifdef HIGH_PREC REGISTER_OP("Descrpt") -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") .Attr("axis_rule: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32") .Output("axis: int32") -.Output("rot_mat: double"); -#else -REGISTER_OP("Descrpt") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("axis_rule: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32") -.Output("axis: int32") -.Output("rot_mat: float"); -#endif +.Output("rot_mat: T"); +template class DescrptOp : public OpKernel { public: explicit DescrptOp(OpKernelConstruction* context) : OpKernel(context) { @@ -182,18 +158,18 @@ class DescrptOp : public OpKernel { Tensor* rot_mat_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(5, rot_mat_shape, &rot_mat_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); auto axis = axis_tensor ->matrix(); - auto rot_mat = rot_mat_tensor ->matrix(); + auto rot_mat = rot_mat_tensor ->matrix(); // // check the types // int max_type_v = 0; @@ -624,5 +600,10 @@ class DescrptOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("Descrpt").Device(DEVICE_CPU), DescrptOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index b970f6dbf0..88f866944d 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -12,50 +12,29 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeA") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else 
-REGISTER_OP("DescrptSeA") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -180,15 +159,15 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -369,5 +348,10 @@ class DescrptSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_CPU), DescrptSeAOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc deleted file mode 100644 index fd5ae632cb..0000000000 --- a/source/op/descrpt_se_a_gpu.cc +++ /dev/null @@ -1,282 +0,0 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -using GPUDevice = Eigen::GpuDevice; - -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeA") - .Input("coord: double") //atomic coordinates - .Input("type: int32") //atomic type - .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers - .Input("box : double") - .Input("mesh : int32") - .Input("davg: double") //average value of data - .Input("dstd: 
double") //standard deviation - .Attr("rcut_a: float") //no use - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") //all zero - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") - .Output("nlist: int32"); - // only sel_a and rcut_r uesd. -#else -REGISTER_OP("DescrptSeA") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box : float") - .Input("mesh : int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut_a: float") - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif - -int get_magic_number(int const nnei) { - if (nnei <= 256) { - return 256; - } - else if (nnei <= 512) { - return 512; - } - else if (nnei <= 1024) { - return 1024; - } - else if (nnei <= 2048) { - return 2048; - } - else if (nnei <= 4096) { - return 4096; - } -} - -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, - const int MAGIC_NUMBER -); - -class DescrptSeAOp : public OpKernel { -public: - explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { - float nloc_f, nall_f; - OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); - OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); - // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); - // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); - cum_sum (sec_a, sel_a); - cum_sum (sec_r, sel_r); - ndescrpt_a = sec_a.back() * 4; - ndescrpt_r = sec_r.back() * 1; - ndescrpt = ndescrpt_a + ndescrpt_r; - nnei_a = sec_a.back(); - nnei_r = sec_r.back(); - nnei = nnei_a + nnei_r; - fill_nei_a = (rcut_a < 0); - magic_number = get_magic_number(nnei); - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = context->input(context_input_index++); - const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); - // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. - int nsamples = coord_tensor.shape().dim_size(0); - // - //// check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); - - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); - - OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); - - // Create output tensors - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape 
descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); - - // allocate temp memory, temp memory must not be used after this operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); - - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(nloc * magic_number * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); - - int * ilist = NULL, * jrange = NULL, * jlist = NULL; - int * array_int = int_temp.flat().data(); - unsigned long long * array_longlong = uint64_temp.flat().data(); - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument - type_tensor.matrix().data() + II * nall, // also related to the kk argument - ilist, - jrange, - jlist, - array_int, - array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - nloc, - nnei, - rcut_r, - rcut_r_smth, - ndescrpt, - sec_a, - fill_nei_a, - magic_number - ); - } - // std::cout << "done" << std::endl; - delete[] natoms; - } - -///////////////////////////////////////////////////////////////////////////////////////////// -private: - float rcut_a; - float rcut_r; - float rcut_r_smth; - std::vector sel_r; - std::vector sel_a; - std::vector sec_a; - std::vector sec_r; - int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r, nloc, nall, magic_number; - bool fill_nei_a; - - //private func - void cum_sum (std::vector & sec, const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_GPU), DescrptSeAOp); \ No newline at end of file diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc new file mode 100644 index 0000000000..ae5e623171 --- /dev/null +++ b/source/op/descrpt_se_a_multi_device.cc @@ -0,0 
+1,307 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("DescrptSeA") + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + // only sel_a and rcut_r uesd. + +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeAFunctor { + void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; + +template +class DescrptSeAOp : public OpKernel { +public: + explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); + OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); + // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); + // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); + cum_sum (sec_a, sel_a); + cum_sum (sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + fill_nei_a = (rcut_a < 0); + magic_number = get_magic_number(nnei); + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; 
+ const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. 
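Compute() validates the ranks and then reads natoms, whose layout is fixed across these ops: natoms[0] is the number of local atoms (nloc), natoms[1] is the total including ghost atoms (nall), and the remaining entries are per-type counts, hence ntypes = dim_size(0) - 2. A small sketch of that convention with made-up numbers:

#include <cassert>
#include <vector>

struct AtomCounts { int nloc, nall, ntypes; };

// Interpret a natoms vector laid out as [nloc, nall, count_type0, count_type1, ...].
static AtomCounts parse_natoms(const std::vector<int> &natoms) {
  assert(natoms.size() >= 3);  // same requirement the op enforces via OP_REQUIRES
  return {natoms[0], natoms[1], static_cast<int>(natoms.size()) - 2};
}

int main() {
  // Hypothetical system: 192 local atoms, 512 including ghosts, two atom types.
  const std::vector<int> natoms = {192, 512, 64, 128};
  const AtomCounts c = parse_natoms(natoms);
  assert(c.nloc == 192 && c.nall == 512 && c.ntypes == 2);
}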
+ int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! 
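The four outputs allocated just above are flat [nsamples, per-atom-size] buffers; only the second dimension differs. As a worked example with the hypothetical sizes from the sketches above (nloc = 192, nnei = 138, ndescrpt = 552):

#include <cstdio>

int main() {
  // Hypothetical sizes; the real values come from natoms and the sel attributes.
  const long nloc = 192, nnei = 138, ndescrpt = 552;
  std::printf("descrpt       row size: %ld\n", nloc * ndescrpt);      // 105984
  std::printf("descrpt_deriv row size: %ld\n", nloc * ndescrpt * 3);  // 317952
  std::printf("rij           row size: %ld\n", nloc * nnei * 3);      // 79488
  std::printf("nlist         row size: %ld\n", nloc * nnei);          // 26496
}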
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); + } + + DescrptSeAFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut_r, + rcut_r_smth, + sec_a, + fill_nei_a, + magic_number + ); + } + +///////////////////////////////////////////////////////////////////////////////////////////// +private: + float rcut_a; + float rcut_r; + float rcut_r_smth; + std::vector sel_r; + std::vector sel_a; + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, nloc, nall, magic_number; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } + + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void 
**)&jlist, sizeof(int) * jlist_size)); + } + cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 6798df503c..a4bfe341ac 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -12,45 +12,26 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("DescrptSeR") -#ifdef HIGH_PREC -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut: float") .Attr("rcut_smth: float") .Attr("sel: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#else -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut: float") -.Attr("rcut_smth: float") -.Attr("sel: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -169,15 +150,15 @@ class DescrptSeROp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); @@ -351,5 +332,10 @@ class DescrptSeROp : public 
OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_CPU), DescrptSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc deleted file mode 100644 index 65e2682ef0..0000000000 --- a/source/op/descrpt_se_r_gpu.cc +++ /dev/null @@ -1,247 +0,0 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// sec_a kao,sec_r, - -using GPUDevice = Eigen::GpuDevice; - -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeR") - .Input("coord: double") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: double") - .Input("mesh: int32") - .Input("davg: double") - .Input("dstd: double") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") - .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeR") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: float") - .Input("mesh: int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif - -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -); - -class DescrptSeROp : public OpKernel { -public: - explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { - float nloc_f, nall_f; - OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); - cum_sum (sec, sel); - sel_null.resize(3, 0); - cum_sum (sec_null, sel_null); - ndescrpt = sec.back() * 1; - nnei = sec.back(); - fill_nei_a = true; - // count_nei_idx_overflow = 0; - // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = 
context->input(context_input_index++); - const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); - // set size of the sample. assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. - int nsamples = coord_tensor.shape().dim_size(0); - // - //// check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); - - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); - - // Create output tensors - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - 
nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); - - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int *array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); - - // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), - type_tensor.matrix().data() + II * nall, - ilist, - jrange, - jlist, - array_int, - array_longlong, - array_double, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, - nloc, - nall, - nnei, - rcut, - rcut_smth, - ndescrpt, - sec, - fill_nei_a - ); - } - // std::cout << "done" << std::endl; - delete[] natoms; - } - -///////////////////////////////////////////////////////////////////////////////////////////// - -private: - float rcut; - float rcut_smth; - std::vector sel; - std::vector sel_null; - std::vector sec; - std::vector sec_null; - int nnei, ndescrpt, nloc, nall; - bool fill_nei_a; - - //private func - void cum_sum (std::vector & sec, const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_GPU), DescrptSeROp); \ No newline at end of file diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc new file mode 100644 index 0000000000..c5eaff616c --- /dev/null +++ b/source/op/descrpt_se_r_multi_device.cc @@ -0,0 +1,297 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: T") + .Input("mesh: int32") + .Input("davg: T") + .Input("dstd: T") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + 
.Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeRFunctor { + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; + +template +class DescrptSeROp : public OpKernel { +public: + explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + sel_null.resize(3, 0); + cum_sum (sec_null, sel_null); + ndescrpt = sec.back() * 1; + nnei = sec.back(); + fill_nei_a = true; + magic_number = get_magic_number(nnei); + // count_nei_idx_overflow = 0; + // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, 
(mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + + // Create an output tensor + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! 
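The GPU branch that follows sizes its uint64 scratch buffer with magic_number, which get_magic_number() (defined near the end of this class) computes by rounding nnei up to the next supported bucket between 256 and 4096. As written the function has no return statement for nnei > 4096; that is only safe because the OP_REQUIRES check above rejects such inputs, so a defensive sketch would add an explicit fallback:

#include <cassert>
#include <initializer_list>

// Round the neighbor count up to the scratch-buffer granularity used by the GPU kernels.
// Unlike the patch version, this sketch returns explicitly for out-of-range input.
static int get_magic_number(const int nnei) {
  for (const int cap : {256, 512, 1024, 2048, 4096}) {
    if (nnei <= cap) return cap;
  }
  return -1;  // callers are expected to have rejected nnei > 4096 already
}

int main() {
  assert(get_magic_number(138) == 256);
  assert(get_magic_number(300) == 512);
  assert(get_magic_number(5000) == -1);
}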
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + nloc * sec.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); + } + + DescrptSeRFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut, + rcut_smth, + sec, + fill_nei_a, + magic_number + ); + } + +///////////////////////////////////////////////////////////////////////////////////////////// + +private: + float rcut; + float rcut_smth; + std::vector sel; + std::vector sel_null; + std::vector sec; + std::vector sec_null; + int nnei, ndescrpt, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } + + int magic_number; + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + } + 
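Both the CPU branch above and nbor_update() lean on the same convention: the int32 mesh tensor doubles as a byte buffer whose entries starting at index 4, 8 and 12 hold host pointers to the neighbor-list arrays (ilist, jrange, jlist), recovered with memcpy. A self-contained sketch of that packing/unpacking trick, with a toy payload, is:

#include <cassert>
#include <cstring>
#include <vector>

int main() {
  // A toy neighbor-list array standing in for ilist.
  std::vector<int> ilist_data = {7, 3, 9};
  int *ilist_src = ilist_data.data();

  // Pack: store the host pointer into the int32 "mesh" buffer starting at index 4.
  // On common 64-bit platforms sizeof(int *) is 8, i.e. two int slots.
  std::vector<int> mesh(16, 0);
  std::memcpy(mesh.data() + 4, &ilist_src, sizeof(int *));

  // Unpack, exactly as the op does with memcpy(&ilist, 4 + mesh..., sizeof(int *)).
  int *ilist = nullptr;
  std::memcpy(&ilist, mesh.data() + 4, sizeof(int *));
  assert(ilist == ilist_src && ilist[1] == 3);
}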
cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc index 29daaa53c8..22c61b7429 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -10,36 +10,21 @@ typedef double boxtensor_t ; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; -#ifdef HIGH_PREC REGISTER_OP("EwaldRecp") -.Input("coord: double") -.Input("charge: double") +.Attr("T: {float, double}") +.Input("coord: T") +.Input("charge: T") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Attr("ewald_beta: float") .Attr("ewald_h: float") -.Output("energy: double") -.Output("force: double") -.Output("virial: double"); -#else -REGISTER_OP("EwaldRecp") -.Input("coord: float") -.Input("charge: float") -.Input("natoms: int32") -.Input("box: float") -.Attr("ewald_beta: float") -.Attr("ewald_h: float") -.Output("energy: float") -.Output("force: float") -.Output("virial: float"); -#endif +.Output("energy: T") +.Output("force: T") +.Output("virial: T"); +template class EwaldRecpOp : public OpKernel { public: explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { @@ -90,12 +75,12 @@ class EwaldRecpOp : public OpKernel { Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); - auto coord = coord_tensor .flat(); - auto charge = charge_tensor .flat(); - auto box = box_tensor .flat(); - auto energy = energy_tensor ->flat(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); for (int kk = 0; kk < nsamples; ++kk){ int box_iter = kk * 9; @@ -122,19 +107,19 @@ class EwaldRecpOp : public OpKernel { else if (inter[dd] >= 1) inter[dd] -= 1.; } } - vector d_coord3 (nloc*3); + vector d_coord3 (nloc*3); for (int ii = 0; ii < nloc * 3; ++ii) { d_coord3[ii] = d_coord3_[ii]; } // set charge - vector d_charge (nloc); + vector d_charge (nloc); for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); // prepare outputs vectors - VALUETYPE d_ener; - vector d_force(nloc*3); - vector d_virial(9); + FPTYPE d_ener; + vector d_force(nloc*3); + vector 
d_virial(9); // compute EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); @@ -150,8 +135,12 @@ class EwaldRecpOp : public OpKernel { } } private: - EwaldParameters ep; + EwaldParameters ep; }; -REGISTER_KERNEL_BUILDER(Name("EwaldRecp").Device(DEVICE_CPU), EwaldRecpOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("EwaldRecp").Device(DEVICE_CPU).TypeConstraint("T"), \ + EwaldRecpOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/gelu.cc b/source/op/gelu.cc index 26c53c8511..7012438db9 100644 --- a/source/op/gelu.cc +++ b/source/op/gelu.cc @@ -2,11 +2,11 @@ #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/shape_inference.h" -#define SQRT_2_PI 0.7978845608028654 using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; +#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") @@ -26,43 +26,68 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -template +#if GOOGLE_CUDA +// maybe instead use cudnn activation forward +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +#endif // GOOGLE_CUDa + +template struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]))); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluLauncher(in, out, size); + } + #endif }; -template +template struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradLauncher(dy, in, out, size); + } + #endif }; -template +template struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + 
FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); + } + #endif }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -70,26 +95,25 @@ class GeluOp : public OpKernel { void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& x = context->input(0); - Tensor * output = NULL; int context_output_index = 0; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, x.shape(), &output)); - GeluFunctor()( + GeluFunctor()( context->eigen_device(), - x.flat().data(), - output->flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -105,20 +129,20 @@ class GeluGradOp : public OpKernel { x.shape(), &output)); - GeluGradFunctor()( + GeluGradFunctor()( context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -135,12 +159,12 @@ class GeluGradGradOp : public OpKernel { x.shape(), &output)); - GeluGradGradFunctor()( + GeluGradGradFunctor()( context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); @@ -148,17 +172,35 @@ class GeluGradGradOp : public OpKernel { }; #define REGISTER_CPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_CPU(float); - REGISTER_CPU(double); \ No newline at end of file +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc deleted file mode 100644 index 34d4183f98..0000000000 --- a/source/op/gelu_gpu.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("Gelu") - .Attr("T: {float, double}") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGradGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("dy_: T") - .Input("x: T") - .Output("output: T"); - -// maybe instead use cudnn activation forward -void GeluLauncher(const float * in, float * out, int const size); -void GeluLauncher(const double * in, double * out, int const size); - -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); - -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); - -template -struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { - GeluLauncher(in, out, size); - } -}; - -template -struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { - GeluGradLauncher(dy, in, out, size); - } -}; - -template -struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradLauncher(dy, dy_, in, out, size); - } -}; - -// OpKernel definition. 
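For reference, the CPU functors registered above implement the tanh approximation of GELU and its analytic first derivative. A standalone sketch of the same math, handy for spot-checking a kernel against finite differences (test point and tolerance are arbitrary), is:

#include <cassert>
#include <cmath>

constexpr double SQRT_2_PI = 0.7978845608028654;  // sqrt(2 / pi)

// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
static double gelu(const double x) {
  return x * 0.5 * (1.0 + std::tanh(SQRT_2_PI * (x + 0.044715 * x * x * x)));
}

// Analytic derivative, matching the GeluGrad CPU functor with the incoming dy factored out.
static double gelu_grad(const double x) {
  const double var1 = std::tanh(SQRT_2_PI * (x + 0.044715 * x * x * x));
  return 0.5 * SQRT_2_PI * x * (1 - var1 * var1) * (0.134145 * x * x + 1) + 0.5 * var1 + 0.5;
}

int main() {
  // Compare the analytic gradient against a central finite difference.
  const double x = 0.37, h = 1e-6;
  const double numeric = (gelu(x + h) - gelu(x - h)) / (2 * h);
  assert(std::fabs(numeric - gelu_grad(x)) < 1e-6);
}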
-// template parameter is the datatype of the tensors. -template -class GeluOp : public OpKernel { - public : - explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& x = context->input(0); - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluFunctor()( - context->eigen_device(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradOp : public OpKernel { - public : - explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& x = context->input(1); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradFunctor()( - context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradGradOp : public OpKernel { - public : - explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& dy_ = context->input(1); - const Tensor& x = context->input(2); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradGradFunctor()( - context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -#define REGISTER_GPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_GPU(float); - REGISTER_GPU(double); diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc new file mode 100644 index 0000000000..f84c9c0f9f --- /dev/null +++ b/source/op/gelu_multi_device.cc @@ -0,0 +1,167 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +template +struct GeluFunctor { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluCPULauncher(in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGPULauncher(in, out, size); + } + #endif +}; + +template +struct GeluGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradCPULauncher(dy, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGPULauncher(dy, in, out, size); + } + #endif +}; + +template +struct GeluGradGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradCPULauncher(dy, dy_, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradGPULauncher(dy, dy_, in, out, size); + } + #endif +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
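All of the rewritten ops in this patch follow the same dispatch pattern: a functor templated on the value type carries one operator() overload per device, and the OpKernel, templated on both Device and FPTYPE, forwards context->eigen_device<Device>() to it, so a single class body backs both the CPU and the GPU registration. A stripped-down, TensorFlow-free sketch of the idea, using stand-in device tags, is:

#include <iostream>
#include <vector>

// Stand-in device tags; in the real ops these are Eigen::ThreadPoolDevice and Eigen::GpuDevice.
struct CpuTag {};
struct GpuTag {};

template <typename FPTYPE>
struct ScaleFunctor {
  // CPU path: do the work inline.
  void operator()(const CpuTag &, const FPTYPE *in, FPTYPE *out, const int size) {
    for (int ii = 0; ii < size; ++ii) out[ii] = in[ii] * FPTYPE(2);
  }
  // GPU path: in the real code this overload sits behind #if GOOGLE_CUDA and calls a CUDA launcher.
  void operator()(const GpuTag &, const FPTYPE *, FPTYPE *, const int) { /* launcher call */ }
};

// The "kernel" only names the Device type; overload resolution picks the implementation.
template <typename Device, typename FPTYPE>
void compute(const Device &dev, const std::vector<FPTYPE> &x, std::vector<FPTYPE> &y) {
  y.resize(x.size());
  ScaleFunctor<FPTYPE>()(dev, x.data(), y.data(), static_cast<int>(x.size()));
}

int main() {
  std::vector<double> x = {1.0, 2.5}, y;
  compute(CpuTag{}, x, y);
  std::cout << y[0] << " " << y[1] << "\n";  // 2 5
}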
+template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index e41b908d41..7ba99b6e81 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -6,36 +6,23 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForce") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForce") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceOp : public OpKernel { public: explicit ProdForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -86,11 +73,11 @@ class ProdForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); // loop over samples #pragma omp parallel for @@ -176,7 +163,11 @@ class ProdForceOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForce").Device(DEVICE_CPU), ProdForceOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc 
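prod_force keeps its existing structure: each output row belongs to one frame, and the frame loop is parallelized with OpenMP, which is safe because every iteration writes a disjoint slice of the output. A minimal sketch of that parallelization pattern with a dummy per-frame computation:

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical sizes: 4 frames, 6 force components per frame.
  const int nframes = 4, ncomp = 6;
  std::vector<double> force(static_cast<std::size_t>(nframes) * ncomp, 0.0);

  // Each iteration owns force[kk*ncomp .. (kk+1)*ncomp), so no synchronization is needed.
  #pragma omp parallel for
  for (int kk = 0; kk < nframes; ++kk) {
    double *f = force.data() + kk * ncomp;
    for (int ii = 0; ii < ncomp; ++ii) f[ii] = kk + 0.1 * ii;  // dummy per-frame work
  }
  std::printf("%.1f %.1f\n", force[0], force[ncomp]);  // prints 0.0 1.0
}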
index 14e884c5c9..2c8e62550c 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ -6,36 +6,21 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdForceGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceGradOp : public OpKernel { public: @@ -97,12 +82,12 @@ class ProdForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -190,4 +175,9 @@ class ProdForceGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceGrad").Device(DEVICE_CPU), ProdForceGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc index af0e712492..1b5053377c 100644 --- a/source/op/prod_force_se_a.cc +++ b/source/op/prod_force_se_a.cc @@ -6,34 +6,23 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForceSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -83,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == 
force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -155,7 +144,11 @@ class ProdForceSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_CPU), ProdForceSeAOp); - - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index eda965974a..884e46f9a8 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -6,34 +6,20 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeAGradOp : public OpKernel { public: @@ -91,11 +77,11 @@ class ProdForceSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -158,4 +144,9 @@ class ProdForceSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeAGrad").Device(DEVICE_CPU), ProdForceSeAGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_multi_device.cc similarity index 58% rename from source/op/prod_force_se_a_gpu.cc rename to source/op/prod_force_se_a_multi_device.cc index 2d159a8505..87a3ae3ecc 100644 --- a/source/op/prod_force_se_a_gpu.cc +++ b/source/op/prod_force_se_a_multi_device.cc @@ -1,58 +1,29 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include +#include "common.h" +#include "CustomeOperation.h" -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeA") - 
.Input("net_deriv: double") - .Input("in_deriv: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("force: double"); -#else REGISTER_OP("ProdForceSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("force: float"); -#endif + .Output("force: T"); -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); +template +struct ProdForceSeAFunctor { + void operator()(const CPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -76,8 +47,7 @@ class ProdForceSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape().dim_size(0); @@ -102,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -117,23 +87,37 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - for (int II = 0; II < nframes; II++) { - ProdForceSeALauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdForceSeAFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + 
nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_GPU), ProdForceSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc index b4933c5b4a..4347442167 100644 --- a/source/op/prod_force_se_r.cc +++ b/source/op/prod_force_se_r.cc @@ -6,29 +6,19 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeR") -#ifdef HIGH_PREC -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("force: double"); -#else -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("force: float"); -#endif +.Output("force: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -73,10 +63,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -126,7 +116,12 @@ class ProdForceSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_CPU), ProdForceSeROp); - +// Register the CPU kernels. 
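One detail of the GPU registration above is worth spelling out. The old *_gpu.cc kernels copied the natoms tensor from device to host with cudaMemcpy before reading nloc and nall; the rewritten kernels dereference natoms_tensor.flat<int>().data() directly on the host. That is only legal because the DEVICE_GPU registration adds .HostMemory("natoms"), which keeps that particular input in CPU memory even though the kernel itself is placed on the GPU. A hedged fragment showing the two pieces that have to agree; "ExampleOp" and ExampleOp are placeholders, not names from this diff:

// Registration side: pin the "natoms" input to host memory for the GPU kernel.
REGISTER_KERNEL_BUILDER(
    Name("ExampleOp").Device(DEVICE_GPU).TypeConstraint<float>("T").HostMemory("natoms"),
    ExampleOp<GPUDevice, float>);

// Kernel side: because of HostMemory("natoms"), this pointer is a valid host
// address and can be read directly, with no cudaMemcpy and no delete[].
const int * natoms = natoms_tensor.flat<int>().data();
int nloc = natoms[0];   // number of local atoms
int nall = natoms[1];   // number of local plus ghost atoms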
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_gpu.cc deleted file mode 100644 index 3dc0bc7853..0000000000 --- a/source/op/prod_force_se_r_gpu.cc +++ /dev/null @@ -1,133 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: double"); -#else -REGISTER_OP("ProdForceSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: float"); -#endif - -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); - -class ProdForceSeROp : public OpKernel { -public: - explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_force_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& net_deriv_tensor = context->input(context_input_index++); - const Tensor& in_deriv_tensor = context->input(context_input_index++); - const Tensor& nlist_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - - // set size of the sample - OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); - OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); - OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int nframes = net_deriv_tensor.shape().dim_size(0); - int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; - int nnei = nlist_tensor.shape().dim_size(1) / nloc; - - // check the sizes - OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of 
samples should match")); - - OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - // OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); - // OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); - - // Create an output tensor - TensorShape force_shape; - force_shape.AddDim (nframes); - force_shape.AddDim (3 * nall); - Tensor* force_tensor = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - force_shape, &force_tensor)); - - // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); - - assert (nframes == force_shape.dim_size(0)); - assert (nframes == net_deriv_tensor.shape().dim_size(0)); - assert (nframes == in_deriv_tensor.shape().dim_size(0)); - assert (nframes == nlist_tensor.shape().dim_size(0)); - assert (nall * 3 == force_shape.dim_size(1)); - assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); - assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); - assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); - assert (nnei * 4 == ndescrpt); - - for (int II = 0; II < nframes; II++) { - ProdForceSeRLauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; - } -private: - int n_r_sel, n_a_sel, n_a_shift; -}; - -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_GPU), ProdForceSeROp); \ No newline at end of file diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 3866ef9b86..dfe9a5ff98 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -6,30 +6,18 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeRGradOp : public OpKernel { public: @@ -83,11 +71,11 @@ class ProdForceSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -131,4 +119,9 @@ class 
ProdForceSeRGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeRGrad").Device(DEVICE_CPU), ProdForceSeRGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeRGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_r_multi_device.cc b/source/op/prod_force_se_r_multi_device.cc new file mode 100644 index 0000000000..0a97d17742 --- /dev/null +++ b/source/op/prod_force_se_r_multi_device.cc @@ -0,0 +1,111 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("ProdForceSeR") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("nlist: int32") +.Input("natoms: int32") +.Output("force: T"); + +template +struct ProdForceSeRFunctor { + void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRCPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; + +template +class ProdForceSeROp : public OpKernel { +public: + explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + + // Create an output tensor + TensorShape force_shape ; + force_shape.AddDim (nframes); + force_shape.AddDim (3 * nall); + Tensor* force_tensor = NULL; + int context_output_index = 0; + 
OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + force_shape, &force_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto force = force_tensor->flat(); + + assert (nframes == force_shape.dim_size(0)); + assert (nframes == net_deriv_tensor.shape().dim_size(0)); + assert (nframes == in_deriv_tensor.shape().dim_size(0)); + assert (nframes == nlist_tensor.shape().dim_size(0)); + assert (nall * 3 == force_shape.dim_size(1)); + assert (nloc * ndescrpt == net_deriv_tensor.shape().dim_size(1)); + assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)); + assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); + assert (nnei * 4 == ndescrpt); + + ProdForceSeRFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index 55b0b4b60d..f42a5055c2 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -6,42 +6,25 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirial") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("ProdVirial") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialOp : public OpKernel { public: explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -100,13 +83,13 @@ class ProdVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -144,10 +127,10 @@ class ProdVirialOp : public 
OpKernel { if (j_idx < 0) continue; if (jj == axis_0) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -156,10 +139,10 @@ class ProdVirialOp : public OpKernel { } else if (jj == axis_1) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -170,10 +153,10 @@ class ProdVirialOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -201,7 +184,11 @@ class ProdVirialOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirial").Device(DEVICE_CPU), ProdVirialOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index 5257467029..5d75f5f649 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -6,38 +6,22 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdVirialGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") 
-.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialGradOp : public OpKernel { public: @@ -104,13 +88,13 @@ class ProdVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -200,4 +184,9 @@ class ProdVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialGrad").Device(DEVICE_CPU), ProdVirialGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index 2f71d37505..ba934fa54e 100644 --- a/source/op/prod_virial_se_a.cc +++ b/source/op/prod_virial_se_a.cc @@ -6,40 +6,25 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -95,12 +80,12 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -131,10 +116,10 @@ class ProdVirialSeAOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; 
++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -161,7 +146,13 @@ class ProdVirialSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_CPU), ProdVirialSeAOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 660f652566..ebbd857bf0 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -6,36 +6,21 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeAGradOp : public OpKernel { public: @@ -98,12 +83,12 @@ class ProdVirialSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -162,4 +147,10 @@ class ProdVirialSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeAGrad").Device(DEVICE_CPU), ProdVirialSeAGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_multi_device.cc similarity index 53% rename from source/op/prod_virial_se_a_gpu.cc rename to source/op/prod_virial_se_a_multi_device.cc index 42f70d06d2..7929fbd588 100644 --- a/source/op/prod_virial_se_a_gpu.cc +++ b/source/op/prod_virial_se_a_multi_device.cc @@ -1,64 +1,31 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include +#include "common.h" +#include "CustomeOperation.h" -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("virial: double") - .Output("atom_virial: double"); -#else REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("virial: float") - .Output("atom_virial: float"); -#endif - -using namespace tensorflow; + .Output("virial: T") + .Output("atom_virial: T"); -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); +template +struct ProdVirialSeAFunctor { + void operator()(const CPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } -} - -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -84,8 +51,7 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument 
("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nnei = nlist_tensor.shape().dim_size(1) / nloc; @@ -114,32 +80,46 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); - for (int II = 0; II < nframes; II++) { - ProdVirialSeALauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdVirialSeAFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_GPU), ProdVirialSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. 
+#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index 1d21234421..5337246c21 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -6,36 +6,22 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -87,12 +73,12 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -119,10 +105,10 @@ class ProdVirialSeROp : public OpKernel { for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (nlist_iter + i_idx * nnei + jj); if (j_idx < 0) continue; - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -133,7 +119,12 @@ class ProdVirialSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_CPU), ProdVirialSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_gpu.cc deleted file mode 100644 index 91f965b72c..0000000000 --- a/source/op/prod_virial_se_r_gpu.cc +++ /dev/null @@ -1,137 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include 
"tensorflow/core/framework/shape_inference.h" -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("virial: double") - .Output("atom_virial: double"); -#else -REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("virial: float") - .Output("atom_virial: float"); -#endif - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); - -class ProdVirialSeROp : public OpKernel { - public: - explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_virial_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& net_deriv_tensor = context->input(context_input_index++); - const Tensor& in_deriv_tensor = context->input(context_input_index++); - const Tensor& rij_tensor = context->input(context_input_index++); - const Tensor& nlist_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - // set size of the sample - OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); - OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); - OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); - OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - cudaErrcheck(cudaDeviceSynchronize()); - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int nnei = nlist_tensor.shape().dim_size(1) / nloc; - int nframes = net_deriv_tensor.shape().dim_size(0); - int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; - - // check the sizes - OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), 
errors::InvalidArgument ("number of samples should match")); - - OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); - - // Create an output tensor - TensorShape virial_shape; - virial_shape.AddDim (nframes); - virial_shape.AddDim (9); - Tensor* virial_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); - TensorShape atom_virial_shape ; - atom_virial_shape.AddDim (nframes); - atom_virial_shape.AddDim (9 * nall); - Tensor* atom_virial_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); - - // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); - auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); - - for (int II = 0; II < nframes; II++) { - ProdVirialSeRLauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; - } -private: - int n_r_sel, n_a_sel, n_a_shift; -}; - -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_GPU), ProdVirialSeROp); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index 20b53cf3c9..61b8970e71 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -6,32 +6,19 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeRGradOp : public OpKernel { public: @@ -90,12 +77,12 @@ class ProdVirialSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -135,4 +122,10 @@ class ProdVirialSeRGradOp : public OpKernel } }; 
-REGISTER_KERNEL_BUILDER(Name("ProdVirialSeRGrad").Device(DEVICE_CPU), ProdVirialSeRGradOp); +// Register the GPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeRGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_multi_device.cc b/source/op/prod_virial_se_r_multi_device.cc new file mode 100644 index 0000000000..61e37eb215 --- /dev/null +++ b/source/op/prod_virial_se_r_multi_device.cc @@ -0,0 +1,114 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("ProdVirialSeR") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") + .Input("nlist: int32") + .Input("natoms: int32") + .Output("virial: T") + .Output("atom_virial: T"); + +template +struct ProdVirialSeRFunctor { + void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRCPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; + +template +class ProdVirialSeROp : public OpKernel { + public: + explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& net_deriv_tensor = context->input(context_input_index++); + const Tensor& in_deriv_tensor = context->input(context_input_index++); + const Tensor& rij_tensor = context->input(context_input_index++); + const Tensor& nlist_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); + OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); + OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); + OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape().dim_size(1) / nloc; + int nframes = net_deriv_tensor.shape().dim_size(0); + int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; + + // check the sizes + OP_REQUIRES (context, (nframes == in_deriv_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nframes == rij_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + 
OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + + OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); + OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + + // Create an output tensor + TensorShape virial_shape ; + virial_shape.AddDim (nframes); + virial_shape.AddDim (9); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); + TensorShape atom_virial_shape; + atom_virial_shape.AddDim (nframes); + atom_virial_shape.AddDim (9 * nall); + Tensor* atom_virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); + + // flat the tensors + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); + auto nlist = nlist_tensor.flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + ProdVirialSeRFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index 6b23305d3e..6f3bc58932 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -8,16 +8,11 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinSwitch") +.Attr("T: {float, double}") .Input("type: int32") -.Input("rij: double") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("sel_a: list(int)") @@ -25,25 +20,14 @@ REGISTER_OP("SoftMinSwitch") .Attr("alpha: float") .Attr("rmin: float") .Attr("rmax: float") -.Output("sw_value: double") -.Output("sw_deriv: double"); -#else -REGISTER_OP("SoftMinSwitch") -.Input("type: int32") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("alpha: float") -.Attr("rmin: float") -.Attr("rmax: float") -.Output("sw_value: float") -.Output("sw_deriv: float"); -#endif +.Output("sw_value: T") +.Output("sw_deriv: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinSwitchOp : public OpKernel { public: explicit SoftMinSwitchOp(OpKernelConstruction* context) : OpKernel(context) { @@ -106,10 +90,10 @@ class SoftMinSwitchOp : public OpKernel { // flat the tensors auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto sw_value = sw_value_tensor ->matrix(); - auto sw_deriv = 
sw_deriv_tensor ->matrix(); + auto sw_value = sw_value_tensor ->matrix(); + auto sw_deriv = sw_deriv_tensor ->matrix(); // loop over samples #pragma omp parallel for @@ -126,26 +110,26 @@ class SoftMinSwitchOp : public OpKernel { // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; - VALUETYPE aa = 0; - VALUETYPE bb = 0; + FPTYPE aa = 0; + FPTYPE bb = 0; for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (i_idx * nnei + jj) * 3; - VALUETYPE dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); aa += ee; bb += rr * ee; } - VALUETYPE smin = bb / aa; - VALUETYPE vv, dd; - spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); + FPTYPE smin = bb / aa; + FPTYPE vv, dd; + spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); // value of switch sw_value(kk, i_idx) = vv; // deriv of switch distributed as force @@ -153,17 +137,17 @@ class SoftMinSwitchOp : public OpKernel { int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (ii * nnei + jj) * 3; - VALUETYPE dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); - VALUETYPE pref_c = (1./rr - 1./alpha) * ee ; - VALUETYPE pref_d = 1./(rr * alpha) * ee; - VALUETYPE ts; + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); + FPTYPE pref_c = (1./rr - 1./alpha) * ee ; + FPTYPE pref_d = 1./(rr * alpha) * ee; + FPTYPE ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv(kk, rij_idx_shift + 0) += ts * dr[0]; sw_deriv(kk, rij_idx_shift + 1) += ts * dr[1]; @@ -196,7 +180,12 @@ class SoftMinSwitchOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinSwitch").Device(DEVICE_CPU), SoftMinSwitchOp); - +// Register the CPU kernels. 
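For reference, the quantity handled by SoftMinSwitchOp above is a smoothed minimum of the neighbor distances, and the derivative it scatters into sw_deriv is fully determined by the loop body; restating that algebra in formulas (no new behavior is introduced here):

    A_i = \sum_{j} e^{-r_{ij}/\alpha}, \qquad
    B_i = \sum_{j} r_{ij}\, e^{-r_{ij}/\alpha}, \qquad
    s_i = \frac{B_i}{A_i} \quad (\text{the variable smin}),

    \mathrm{sw\_value}_i = S(s_i), \qquad
    \frac{\partial S(s_i)}{\partial \vec r_{ij}}
      = \frac{S'(s_i)}{A_i^{2}}
        \Big[\Big(\tfrac{1}{r_{ij}}-\tfrac{1}{\alpha}\Big) A_i
             + \tfrac{1}{\alpha\, r_{ij}}\, B_i\Big]\,
        e^{-r_{ij}/\alpha}\, \vec r_{ij},

where S and S' are the vv and dd returned by spline5_switch, the two bracketed terms correspond to aa*pref_c and bb*pref_d, and the full prefactor is the variable ts that multiplies dr[0..2].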
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinSwitch").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinSwitchOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index e51aadbc79..ffd6442a1a 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -6,34 +6,21 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("SoftMinForce") -.Input("du: double") -.Input("sw_deriv: double") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: double"); -#else -REGISTER_OP("SoftMinForce") -.Input("du: float") -.Input("sw_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinForceOp : public OpKernel { public: explicit SoftMinForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,10 +65,10 @@ class SoftMinForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto force = force_tensor->matrix(); + auto force = force_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -118,4 +105,10 @@ class SoftMinForceOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForce").Device(DEVICE_CPU), SoftMinForceOp); +// Register the CPU kernels. 
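SoftMinForce, whose interface is retyped above (the accumulation loop itself is elided from this hunk), combines the two quantities produced so far by the chain rule: du holds dE/d(sw_value_i) and sw_deriv holds the pair derivatives of sw_value_i. Schematically, and up to the sign and index conventions of the elided loop,

    \vec F_j = -\frac{\partial E}{\partial \vec x_j}
             = -\sum_i \frac{\partial E}{\partial\, \mathrm{sw\_value}_i}\,
                       \frac{\partial\, \mathrm{sw\_value}_i}{\partial \vec x_j},
    \qquad
    \frac{\partial\, \mathrm{sw\_value}_i}{\partial \vec x_j}
      = \frac{\partial\, \mathrm{sw\_value}_i}{\partial \vec r_{ij}}
      \quad (j \neq i), \qquad \vec r_{ij} = \vec x_j - \vec x_i,

so each neighbor receives its pair term while the center atom collects the opposite sum over its neighbors.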
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index 4c8a4b21ff..2461828853 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -6,34 +6,20 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC REGISTER_OP("SoftMinForceGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinForceGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinForceGradOp : public OpKernel { public: @@ -89,11 +75,11 @@ class SoftMinForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -125,4 +111,10 @@ class SoftMinForceGradOp : public OpKernel int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForceGrad").Device(DEVICE_CPU), SoftMinForceGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 193c34f981..f369808685 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -6,40 +6,23 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("SoftMinVirial") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("SoftMinVirial") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinVirialOp : public OpKernel { public: explicit SoftMinVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -94,12 +77,12 @@ class SoftMinVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); - auto rij = rij_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); + auto rij = rij_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto virial = virial_tensor->matrix(); - auto atom_virial = atom_virial_tensor->matrix(); + auto virial = virial_tensor->matrix(); + auto atom_virial = atom_virial_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -122,7 +105,7 @@ class SoftMinVirialOp : public OpKernel { int rij_idx_shift = (ii * nnei + jj) * 3; for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); + FPTYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); virial(kk, dd0 * 3 + dd1) -= tmp_v; atom_virial(kk, j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -135,7 +118,12 @@ class SoftMinVirialOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirial").Device(DEVICE_CPU), SoftMinVirialOp); - +// Register the CPU kernels. 
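The inner double loop of SoftMinVirialOp above accumulates, for every pair, the outer product of the switch derivative and the pair vector weighted by du; written out, the global and per-atom virials it produces are

    W_{\alpha\beta} = -\sum_i \sum_{j \in N(i)}
        \frac{\partial E}{\partial\, \mathrm{sw\_value}_i}\,
        \frac{\partial\, \mathrm{sw\_value}_i}{\partial r_{ij,\alpha}}\; r_{ij,\beta},
    \qquad
    W^{(j)}_{\alpha\beta} = -\sum_{i:\, j \in N(i)}
        \frac{\partial E}{\partial\, \mathrm{sw\_value}_i}\,
        \frac{\partial\, \mathrm{sw\_value}_i}{\partial r_{ij,\alpha}}\; r_{ij,\beta},

with du = dE/d(sw_value), sw_deriv the derivative computed by SoftMinSwitch, and tmp_v one (alpha, beta) term of the sum; each pair contribution is assigned to the neighbor j, matching the 9 * nall layout of atom_virial.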
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index 6f8703bdee..0bafaa5bfa 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -6,36 +6,22 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinVirialGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinVirialGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinVirialGradOp : public OpKernel { public: @@ -97,12 +83,12 @@ class SoftMinVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -148,4 +134,10 @@ class SoftMinVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirialGrad").Device(DEVICE_CPU), SoftMinVirialGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tab_inter.cc b/source/op/tab_inter.cc index 242e52a6e7..63c7af210c 100644 --- a/source/op/tab_inter.cc +++ b/source/op/tab_inter.cc @@ -6,44 +6,26 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif -#ifdef HIGH_PREC REGISTER_OP("TabInter") +.Attr("T: {float, double}") .Input("table_info: double") .Input("table_data: double") .Input("type: int32") -.Input("rij: double") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Input("scale: double") +.Input("scale: T") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("atom_energy: double") -.Output("force: double") -.Output("atom_virial: double"); -#else -REGISTER_OP("TabInter") -.Input("table_info: double") -.Input("table_data: double") -.Input("type: int32") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Input("scale: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("atom_energy: float") -.Output("force: float") -.Output("atom_virial: float"); -#endif +.Output("atom_energy: T") +.Output("force: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + inline void tabulated_inter (double & ener, double & fscale, @@ -86,6 +68,7 @@ void tabulated_inter (double & ener, fscale *= -hi; } +template class TabInterOp : public OpKernel { public: explicit TabInterOp(OpKernelConstruction* context) : OpKernel(context) { @@ -156,15 +139,15 @@ class TabInterOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(tmp_idx++, virial_shape, &virial_tensor)); // flat the tensors - auto table_info = table_info_tensor.flat(); - auto table_data = table_data_tensor.flat(); + auto table_info = table_info_tensor.flat(); + auto table_data = table_data_tensor.flat(); auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto scale = scale_tensor .matrix(); - auto energy = energy_tensor ->matrix(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto scale = scale_tensor .matrix(); + auto energy = energy_tensor ->matrix(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(table_info(3)+0.1)), errors::InvalidArgument ("ntypes provided in table does not match deeppot")); int nspline = table_info(2)+0.1; @@ -203,7 +186,7 @@ class TabInterOp : public OpKernel { for (int tt = 0; tt < ntypes; ++tt) { for (int ii = 0; ii < natoms(2+tt); ++ii){ int i_type = type(kk, i_idx); - VALUETYPE i_scale = scale(kk, i_idx); + FPTYPE i_scale = scale(kk, i_idx); assert(i_type == tt) ; int jiter = 0; // a neighbor @@ -318,7 +301,13 @@ class TabInterOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("TabInter").Device(DEVICE_CPU), TabInterOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T)                                               \
+REGISTER_KERNEL_BUILDER(                                              \
+    Name("TabInter").Device(DEVICE_CPU).TypeConstraint<T>("T"),       \
+    TabInterOp<CPUDevice, T>);
+REGISTER_CPU(float);
+REGISTER_CPU(double);
diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py
index e46265231f..1c93e92f02 100644
--- a/source/train/DescrptSeA.py
+++ b/source/train/DescrptSeA.py
@@ -391,14 +391,18 @@ def _filter(self,
           xyz_scatter = tf.matmul(xyz_scatter, w)
           # natom x nei_type_i x out_size
           xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1]))
-          xyz_scatter_total.append(xyz_scatter)
+          # xyz_scatter_total.append(xyz_scatter)
+          if type_i == 0 :
+            xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)
+          else :
+            xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)
         # natom x nei x outputs_size
-        xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
+        # xyz_scatter = tf.concat(xyz_scatter_total, axis=1)
         # natom x nei x 4
-        inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4])
+        # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4])
         # natom x 4 x outputs_size
-        xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True)
+        # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True)
         xyz_scatter_1 = xyz_scatter_1 * (4.0 / shape[1])
         # natom x 4 x outputs_size_2
         xyz_scatter_2 = tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2])
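The DescrptSeA.py change above is an algebraic refactor rather than a behavioral one: instead of concatenating the per-neighbor-type filter outputs and multiplying by the full reshaped input once, the product is accumulated type by type. It relies on the block identity

    R^{\mathsf T} G
      = \begin{pmatrix} R_1^{\mathsf T} & R_2^{\mathsf T} & \cdots \end{pmatrix}
        \begin{pmatrix} G_1 \\ G_2 \\ \vdots \end{pmatrix}
      = \sum_{t} R_t^{\mathsf T} G_t,

where, per atom, R_t is the (neighbors of type t) x 4 block reshaped from inputs_i and G_t the matching filter output xyz_scatter. The accumulated xyz_scatter_1 is therefore the same natom x 4 x outputs_size tensor as before, the concatenated xyz_scatter_total is no longer materialized, and the subsequent (4.0 / shape[1]) normalization and tf.slice are unchanged.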