From d8b392163ee7f0230d60a963c817efe0787b5194 Mon Sep 17 00:00:00 2001 From: Lu Date: Tue, 8 Sep 2020 01:12:14 +0800 Subject: [PATCH 1/3] device independence for deepmd-kit 1. device independence for deepmd-kit 2. fix some potential bugs with cuda code 3. replace concat with gemm to calculate the descriptor after the embedding net --- source/CMakeLists.txt | 2 +- source/lib/include/CustomeOperation.h | 369 ++++++++++++++++++++++++++ source/lib/include/NNPInter.h | 6 - source/lib/include/common.h | 20 ++ source/lib/src/NNPInter.cc | 341 +++--------------------- source/lib/src/common.cc | 130 +++++++++ source/op/cuda/descrpt_se_a.cu | 315 +++++++++++++++------- source/op/cuda/gelu.cu | 12 +- source/op/cuda/prod_force_se_a.cu | 70 +++-- source/op/cuda/prod_virial_se_a.cu | 69 +++-- source/op/descrpt_se_a_gpu.cc | 236 +++++++++------- source/op/gelu_gpu.cc | 90 +++++-- source/op/prod_force_se_a_gpu.cc | 93 +++---- source/op/prod_virial_se_a_gpu.cc | 97 +++---- source/train/DescrptSeA.py | 12 +- 15 files changed, 1192 insertions(+), 670 deletions(-) create mode 100644 source/lib/include/CustomeOperation.h diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 84c3d326da..dc35ee5dc0 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -125,7 +125,7 @@ else() endif() endif() if (USE_CUDA_TOOLKIT) - add_definitions("-DUSE_CUDA_TOOLKIT") + add_definitions("-D GOOGLE_CUDA") endif() # define USE_TTM diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h new file mode 100644 index 0000000000..ef8b450db4 --- /dev/null +++ b/source/lib/include/CustomeOperation.h @@ -0,0 +1,369 @@ +#pragma once +#include +#include +#include +#include +#include +#include "MathUtilities.h" + +#if GOOGLE_CUDA +#include +#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} +#endif // GOOGLE_CUDA + +struct NeighborInfo { + int type; + double dist; + int index; + NeighborInfo () : type (0), dist(0), index(0) {} + NeighborInfo (int tt, double dd, int ii) : type (tt), dist(dd), index(ii) {} + + bool operator < (const NeighborInfo & b) const { + return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index)))); + } +}; + +template +inline void spline5_switch ( + T & vv, + T & dd, + const T & xx, + const float & rmin, + const float & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + T uu = (xx - rmin) / (rmax - rmin) ; + T du = 1. / (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +template +int format_nlist_fill_se_a_cpu ( + vector & fmt_nei_idx_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & nei_idx_a, + const float & rcut, + const vector & sec_a) +{ + fmt_nei_idx_a.resize (sec_a.back()); + fill (fmt_nei_idx_a.begin(), fmt_nei_idx_a.end(), -1); + + // gether all neighbors + std::vector nei_idx (nei_idx_a); + // allocate the information for all neighbors + vector sel_nei; + sel_nei.reserve (nei_idx_a.size()); + for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { + T diff[3]; + const int & j_idx = nei_idx[kk]; + for (int dd = 0; dd < 3; ++dd) { + diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + T rr = sqrt(MathUtilities::dot (diff, diff)); + if (rr <= rcut) { + sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx)); + } + } + sort(sel_nei.begin(), sel_nei.end()); + + std::vector nei_iter = sec_a; + int overflowed = -1; + for (unsigned kk = 0; kk < sel_nei.size(); ++kk) { + const int & nei_type = sel_nei[kk].type; + if (nei_iter[nei_type] < sec_a[nei_type+1]) { + fmt_nei_idx_a[nei_iter[nei_type] ++] = sel_nei[kk].index; + } + } + return overflowed; +} + +template +void compute_descriptor_se_a_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize (sec_a.back() * 4); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 4 * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const T * rr = &rij_a[nei_iter * 3]; + T nr2 = MathUtilities::dot(rr, rr); + T inr = 1./sqrt(nr2); + T nr = nr2 * inr; + T inr2 = inr * inr; + T inr4 = inr2 * inr2; + T inr3 = inr4 * nr; + T sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions + int idx_value = nei_iter * 4; // 4 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + descrpt_a[idx_value + 1] = rr[0] / nr2; + descrpt_a[idx_value + 2] = rr[1] / nr2; + descrpt_a[idx_value + 3] = rr[2] / nr2; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // deriv of component x/r2 + descrpt_a_deriv[idx_deriv + 3] = (2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 5] = (2. * rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr; + // deriv of component y/r2 + descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr; + // deriv of component z/r2 + descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv +11] = (2. * rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + descrpt_a[idx_value + 1] *= sw; + descrpt_a[idx_value + 2] *= sw; + descrpt_a[idx_value + 3] *= sw; + } + } +} + +template +void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } + + #if DEBUG + std::fstream fout1("nlist.txt", std::ios::out); + fout1 << "tensor nlist, length:\t" << nloc << ",\twidth:\t" << nnei << std::endl; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei; jj++) { + fout1 << "nlist[" << ii << "][" << jj << "]:\t" << nlist[ii * nnei + jj] << std::endl; + } + } + fout1.close(); + + std::fstream fout2("rij.txt", std::ios::out); + fout2 << "tensor rij, length:\t" << nloc << ",\twidth:\t" << nnei * 3 << std::endl; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei * 3; jj++) { + fout2 << "rij[" << ii << "][" << jj << "]:\t" << rij[ii * nnei * 3 + jj] << std::endl; + } + } + fout2.close(); + + std::fstream fout3("descrpt.txt", std::ios::out); + fout3 << "tensor descrpt, length:\t" << nloc << ",\twidth:\t" << ndescrpt << std::endl; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < ndescrpt; jj++) { + fout3 << "descrpt[" << ii << "][" << jj << "]:\t" << descrpt[ii * ndescrpt + jj] << std::endl; + } + } + fout3.close(); + #endif // DEBUG +} + +extern void DescrptSeAGPUExecuteLauncher(const float * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const float * avg, const float * std, float * descrpt, float * descrpt_deriv, float * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); +extern void DescrptSeAGPUExecuteLauncher(const double * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const double * avg, const double * std, double * descrpt, double * descrpt_deriv, double * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); + +template +void DescrptSeAGPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteLauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} + +// ****************************************************************************** +// end of custome op DescrptSeA +// ****************************************************************************** + +inline void make_descript_range (int & idx_start, int & idx_end, const int & nei_idx, const int& n_a_sel, const int n_a_shift) { + if (nei_idx < n_a_sel) { + idx_start = nei_idx * 4; + idx_end = nei_idx * 4 + 4; + } + else { + idx_start = n_a_shift + (nei_idx - n_a_sel); + idx_end = n_a_shift + (nei_idx - n_a_sel) + 1; + } +} + +template +void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(force, 0.0, sizeof(T) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + } + } +} + +extern void ProdForceSeAGPUExecuteLauncher(float * force, const float * net_derive, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +extern void ProdForceSeAGPUExecuteLauncher(double * force, const double * net_derive, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + + +template +void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPUExecuteLauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} + +// ****************************************************************************** +// end of custome op ProdForceSeA +// ****************************************************************************** + +template +void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(virial, 0.0, sizeof(T) * 9); + memset(atom_virial, 0.0, sizeof(T) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + T pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + T tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } + } +} + +extern void ProdVirialSeAGPUExecuteLauncher(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +extern void ProdVirialSeAGPUExecuteLauncher(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + +template +void ProdVirialSeAGPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPUExecuteLauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} + +// ****************************************************************************** +// end of custome op ProdVirialSeA +// ****************************************************************************** diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index a32940a738..6c37770758 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -98,9 +98,6 @@ class NNPInter // function used for neighbor list copy vector get_sel_a() const; -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; class NNPInterModelDevi @@ -195,9 +192,6 @@ class NNPInterModelDevi // function used for nborlist copy vector > get_sel() const; void cum_sum(const std::vector > n_sel); -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 3912f21f7f..4874274305 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -8,11 +8,17 @@ using namespace tensorflow; using namespace std; +#include +#include #include "NNPAtomMap.h" #include +#include +#include #include "version.h" +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; #ifdef HIGH_PREC typedef double VALUETYPE; typedef double ENERGYTYPE; @@ -122,6 +128,20 @@ session_input_tensors (std::vector> & input_tensors, const int nghost = 0, const string scope = ""); +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope = ""); + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index aea9d48c9b..b262851450 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -4,11 +4,8 @@ #include -#ifdef USE_CUDA_TOOLKIT +#if GOOGLE_CUDA #include "cuda_runtime.h" -#include -#include -#include #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) @@ -57,7 +54,6 @@ run_model (ENERGYTYPE & dener, return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, {"o_energy", "o_force", "o_atom_virial"}, @@ -74,56 +70,23 @@ run_model (ENERGYTYPE & dener, dener = oe(0); vector dforce (3 * nall); - vector datom_virial (9 * nall); dvirial.resize (9); for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } for (int ii = 0; ii < nall; ++ii) { - dvirial[0] += 1.0 * datom_virial[9*ii+0]; - dvirial[1] += 1.0 * datom_virial[9*ii+1]; - dvirial[2] += 1.0 * datom_virial[9*ii+2]; - dvirial[3] += 1.0 * datom_virial[9*ii+3]; - dvirial[4] += 1.0 * datom_virial[9*ii+4]; - dvirial[5] += 1.0 * datom_virial[9*ii+5]; - dvirial[6] += 1.0 * datom_virial[9*ii+6]; - dvirial[7] += 1.0 * datom_virial[9*ii+7]; - dvirial[8] += 1.0 * datom_virial[9*ii+8]; - } - - dforce_ = dforce; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - - dener = oe(0); - vector dforce (3 * nall); - dvirial.resize (9); - for (unsigned ii = 0; ii < nall * 3; ++ii){ - dforce[ii] = of(ii); - } - for (unsigned ii = 0; ii < 9; ++ii){ - dvirial[ii] = ov(ii); + dvirial[0] += 1.0 * oav(9*ii+0); + dvirial[1] += 1.0 * oav(9*ii+1); + dvirial[2] += 1.0 * oav(9*ii+2); + dvirial[3] += 1.0 * oav(9*ii+3); + dvirial[4] += 1.0 * oav(9*ii+4); + dvirial[5] += 1.0 * oav(9*ii+5); + dvirial[6] += 1.0 * oav(9*ii+6); + dvirial[7] += 1.0 * oav(9*ii+7); + dvirial[8] += 1.0 * oav(9*ii+8); } dforce_ = dforce; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#endif } static void run_model (ENERGYTYPE & dener, @@ -155,7 +118,6 @@ static void run_model (ENERGYTYPE & dener, fill(datom_virial_.begin(), datom_virial_.end(), 0.0); return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, @@ -204,50 +166,6 @@ static void run_model (ENERGYTYPE & dener, nnpmap.backward (dforce_.begin(), dforce.begin(), 3); nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - Tensor output_ae = output_tensors[3]; - Tensor output_av = output_tensors[4]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - auto oae = output_ae.flat (); - auto oav = output_av.flat (); - - dener = oe(0); - vector dforce (3 * nall); - vector datom_energy (nall, 0); - vector datom_virial (9 * nall); - dvirial.resize (9); - for (int ii = 0; ii < nall * 3; ++ii) { - dforce[ii] = of(ii); - } - for (int ii = 0; ii < nloc; ++ii) { - datom_energy[ii] = oae(ii); - } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } - for (int ii = 0; ii < 9; ++ii) { - dvirial[ii] = ov(ii); - } - dforce_ = dforce; - datom_energy_ = datom_energy; - datom_virial_ = datom_virial; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); - nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); - nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#endif } @@ -266,50 +184,8 @@ NNPInter (const string & model, const int & gpu_rank) init(model, gpu_rank); } -NNPInter::~NNPInter() { - #ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } - #endif -} - -#ifdef USE_CUDA_TOOLKIT -void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif // USE_CUDA_TOOLKIT +NNPInter::~NNPInter() {} -#ifdef USE_CUDA_TOOLKIT void NNPInter:: init (const string & model, const int & gpu_rank) @@ -318,21 +194,21 @@ init (const string & model, const int & gpu_rank) SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); // check current device environment if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); - // std::cout << "current device rank: " << str << std::endl; } + #endif // GOOGLE_CUDA checkStatus (NewSession(options, &session)); checkStatus (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -340,8 +216,6 @@ init (const string & model, const int & gpu_rank) ntypes = get_scalar("descrpt_attr/ntypes"); dfparam = get_scalar("fitting_attr/dfparam"); daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; @@ -350,38 +224,6 @@ init (const string & model, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInter:: -init (const string & model, const int & gpu_rank) -{ - assert (!inited); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - checkStatus (NewSession(options, &session)); - checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); - checkStatus (session->Create(graph_def)); - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - // dfparam = get_dfparam(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif void NNPInter:: @@ -554,17 +396,10 @@ compute_inner (ENERGYTYPE & dener, nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); - shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif + shuffle_nlist (nlist, nnpmap); } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); } @@ -622,16 +457,9 @@ compute (ENERGYTYPE & dener, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } @@ -663,17 +491,8 @@ NNPInterModelDevi (const vector & models, const int & gpu_rank) init(models, gpu_rank); } -NNPInterModelDevi::~NNPInterModelDevi() { -#ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } -#endif -} +NNPInterModelDevi::~NNPInterModelDevi() {} -#ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: init (const vector & models, const int & gpu_rank) @@ -682,26 +501,32 @@ init (const vector & models, const int & gpu_rank) numb_models = models.size(); sessions.resize(numb_models); graph_defs.resize(numb_models); + + int gpu_num = -1; + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); + #endif // GOOGLE_CUDA + SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - for (unsigned ii = 0; ii < numb_models; ++ii){ checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); } - int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - for (unsigned ii = 0; ii < numb_models; ++ii){ - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank)); + } + #endif // GOOGLE_CUDA + + for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_defs[ii]); - // std::cout << "current device rank: " << str << std::endl; } checkStatus (NewSession(options, &(sessions[ii]))); checkStatus (sessions[ii]->Create(graph_defs[ii])); @@ -722,40 +547,6 @@ init (const vector & models, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInterModelDevi:: -init (const vector & models, const int & gpu_rank) -{ - assert (!inited); - numb_models = models.size(); - sessions.resize(numb_models); - graph_defs.resize(numb_models); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - for (unsigned ii = 0; ii < numb_models; ++ii){ - checkStatus (NewSession(options, &(sessions[ii]))); - checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); - checkStatus (sessions[ii]->Create(graph_defs[ii])); - } - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif template VT @@ -821,42 +612,6 @@ cum_sum (const std::vector > n_sel) } } -#ifdef USE_CUDA_TOOLKIT -void -NNPInterModelDevi:: -update_nbor(const InternalNeighborList & nlist, const int nloc) -{ - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif //USE_CUDA_TOOLKIT - void NNPInterModelDevi:: validate_fparam_aparam(const int & nloc, @@ -946,16 +701,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); all_force.resize (numb_models); @@ -996,16 +743,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); all_force .resize (numb_models); diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index dd4c3f672a..d990195e6c 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -340,6 +340,136 @@ session_input_tensors (std::vector> & input_tensors, return nloc; } +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii){ + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape ; + mesh_shape.AddDim (16); + TensorShape natoms_shape ; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape ; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + +#ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor (DT_DOUBLE, fparam_shape); + Tensor aparam_tensor (DT_DOUBLE, aparam_shape); +#else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor (DT_FLOAT, fparam_shape); + Tensor aparam_tensor (DT_FLOAT, aparam_shape); +#endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor (DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < nall * 3; ++jj){ + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj){ + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj){ + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj){ + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj){ + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; + + const int stride = sizeof(int *) / sizeof(int); + assert (stride * sizeof(int) == sizeof(int *)); + assert (stride <= 4); + mesh (0) = ago; + mesh (1) = dlist.ilist.size(); + mesh (2) = dlist.jrange.size(); + mesh (3) = dlist.jlist.size(); + dlist.make_ptrs(); + memcpy (&mesh(4), &(dlist.pilist), sizeof(int *)); + memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *)); + memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + input_tensors = { + {prefix+"t_coord", coord_tensor}, + {prefix+"t_type", type_tensor}, + {prefix+"t_box", box_tensor}, + {prefix+"t_mesh", mesh_tensor}, + {prefix+"t_natoms",natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({prefix+"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({prefix+"t_aparam", aparam_tensor}); + } + return nloc; +} + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 1636df8ff5..7fa84335e5 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -1,15 +1,3 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU #include #include #include @@ -18,14 +6,6 @@ limitations under the License. #include #include -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - typedef unsigned long long int_64; #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } @@ -77,19 +57,20 @@ __device__ inline T dev_dot(T * arr1, T * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(T & vv, + T & dd, + T & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + T uu = (xx - rmin) / (rmax - rmin) ; + T du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -110,7 +91,8 @@ __global__ void get_i_idx_se_a(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_a(const T * coord, const int * type, const int * jrange, const int * jlist, @@ -120,8 +102,8 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -133,12 +115,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + T diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + T rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -180,33 +162,34 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! -__global__ void compute_descriptor_se_a (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_a (T* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + T* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + T* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const T* avg, + const T* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, + const T* coord, + const float rmin, + const float rmax, const int sec_a_size) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions const int idx_value = idy * 4; // 4 components if (idy >= sec_a_size) {return;} // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; + T * row_descript = descript + idx * ndescrpt; + T * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + T * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { @@ -214,14 +197,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, for (int kk = 0; kk < 3; kk++) { row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &row_rij[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const T * rr = &row_rij[idy * 3 + 0]; + T nr2 = dev_dot(rr, rr); + T inr = 1./sqrt(nr2); + T nr = nr2 * inr; + T inr2 = inr * inr; + T inr4 = inr2 * inr2; + T inr3 = inr4 * nr; + T sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; @@ -260,8 +243,9 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +template void format_nbor_list_256 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -274,9 +258,10 @@ void format_nbor_list_256 ( const int LEN = 256; const int MAGIC_NUMBER = 256; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -292,8 +277,9 @@ void format_nbor_list_256 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_512 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -306,9 +292,10 @@ void format_nbor_list_512 ( const int LEN = 256; const int MAGIC_NUMBER = 512; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -324,8 +311,9 @@ void format_nbor_list_512 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_1024 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -338,9 +326,10 @@ void format_nbor_list_1024 ( const int LEN = 256; const int MAGIC_NUMBER = 1024; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -356,8 +345,9 @@ void format_nbor_list_1024 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_2048 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -370,9 +360,10 @@ void format_nbor_list_2048 ( const int LEN = 256; const int MAGIC_NUMBER = 2048; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -388,8 +379,9 @@ void format_nbor_list_2048 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_4096 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -402,9 +394,10 @@ void format_nbor_list_4096 ( const int LEN = 256; const int MAGIC_NUMBER = 4096; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -420,26 +413,27 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, +void DescrptSeAGPUExecuteLauncher(const float * coord, + const int * type, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + const float * avg, + const float * std, + float * descript, + float * descript_deriv, + float * rij, + int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const float rcut_r, + const float rcut_r_smth, + const std::vector sec_a, + const bool fill_nei_a, const int MAGIC_NUMBER ) { @@ -454,8 +448,144 @@ void DescrptSeALauncher(const VALUETYPE* coord, res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(float) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(float) * nloc * ndescrpt * 3); cudaErrcheck(res); + + if (fill_nei_a) { + // ~~~ + // cudaProfilerStart(); + get_i_idx_se_a<<>> (nloc, ilist, i_idx); + + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + + format_nlist_fill_b_se_a<<>> ( + nlist, + nnei, + nloc, + jrange, + jlist, + key, + sec_a_dev, + sec_a.size(), + nei_iter, + MAGIC_NUMBER + ); + } + + const int nblock_ = (sec_a.back() + LEN -1) / LEN; + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_a<<>> ( + descript, + ndescrpt, + descript_deriv, + ndescrpt * 3, + rij, + nnei * 3, + type, + avg, + std, + nlist, + nnei, + coord, + rcut_r_smth, + rcut_r, + sec_a.back() + ); +} + +void DescrptSeAGPUExecuteLauncher(const double * coord, + const int * type, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + const double * avg, + const double * std, + double * descript, + double * descript_deriv, + double * rij, + int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const float rcut_r, + const float rcut_r_smth, + const std::vector sec_a, + const bool fill_nei_a, + const int MAGIC_NUMBER +) +{ + const int LEN = 256; + int nblock = (nloc + LEN -1) / LEN; + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); + int_64 * key = array_longlong; + + cudaError_t res = cudaSuccess; + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(double) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(double) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ @@ -534,8 +664,9 @@ void DescrptSeALauncher(const VALUETYPE* coord, } const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - compute_descriptor_se_a<<>> ( + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_a<<>> ( descript, ndescrpt, descript_deriv, diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index 99b7b1aed4..e16a285034 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -34,42 +34,42 @@ __global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * ou } -void GeluLauncher(const float * in, float * out, int const size) { +void GeluGPULauncher(const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluLauncher(const double * in, double * out, int const size) { +void GeluGPULauncher(const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) { +void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) { +void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { +void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(dy, dy_, in, out, size); } -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { +void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 080ff8ef75..926f30604b 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -2,14 +2,8 @@ #include #include -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -32,25 +26,25 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_a(T * force, + const T * net_deriv, + const T * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; - if (idy >= ndescrpt) { - return; - } + if (idy >= ndescrpt) {return;} atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_a(T * force, + const T * net_deriv, + const T * in_deriv, const int * nlist, const int nloc, const int nnei, @@ -75,23 +69,49 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +void ProdForceSeAGPUExecuteLauncher(float * force, + const float * net_deriv, + const float * in_deriv, const int * nlist, const int nloc, const int nall, + const int nnei, const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // std::cout << "I'm here!" << std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(float) * nall * 3)); + const int LEN1 = 256; + const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); + deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3, 4); + deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); +} + +void ProdForceSeAGPUExecuteLauncher(double * force, + const double * net_deriv, + const double * in_deriv, + const int * nlist, + const int nloc, + const int nall, const int nnei, + const int ndescrpt, const int n_a_sel, const int n_a_shift) { // std::cout << "I'm here!" << std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(double) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index 241e2b7e06..d2524a3c81 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -1,14 +1,6 @@ #include #include -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -31,11 +23,12 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_a(T * virial, + T * atom_virial, + const T * net_deriv, + const T * in_deriv, + const T * rij, const int * nlist, const int nloc, const int nnei, @@ -64,11 +57,47 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +void ProdVirialSeAGPUExecuteLauncher(float * virial, + float * atom_virial, + const float * net_deriv, + const float * in_deriv, + const float * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(float) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(float) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9, 4); + // compute virial of a frame + deriv_wrt_neighbors_se_a<<>>( + virial, + atom_virial, + net_deriv, + in_deriv, + rij, + nlist, + nloc, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); +} + +void ProdVirialSeAGPUExecuteLauncher(double * virial, + double * atom_virial, + const double * net_deriv, + const double * in_deriv, + const double * rij, const int * nlist, const int nloc, const int nall, @@ -77,8 +106,8 @@ void ProdVirialSeALauncher(VALUETYPE * virial, const int n_a_sel, const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(double) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(double) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc index fd5ae632cb..9c52a62474 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_gpu.cc @@ -1,32 +1,5 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -using GPUDevice = Eigen::GpuDevice; +#include "common.h" +#include "CustomeOperation.h" #ifdef HIGH_PREC REGISTER_OP("DescrptSeA") @@ -85,29 +58,32 @@ int get_magic_number(int const nnei) { } } -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, - const int MAGIC_NUMBER -); +template +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeAFunctor { + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -154,8 +130,12 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. @@ -209,51 +189,55 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - // allocate temp memory, temp memory must not be used after this operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); - - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(nloc * magic_number * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); - int * ilist = NULL, * jrange = NULL, * jlist = NULL; - int * array_int = int_temp.flat().data(); - unsigned long long * array_longlong = uint64_temp.flat().data(); - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument - type_tensor.matrix().data() + II * nall, // also related to the kk argument - ilist, - jrange, - jlist, - array_int, - array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - nloc, - nnei, - rcut_r, - rcut_r_smth, - ndescrpt, - sec_a, - fill_nei_a, - magic_number - ); + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); } - // std::cout << "done" << std::endl; - delete[] natoms; + + DescrptSeAFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut_r, + rcut_r_smth, + sec_a, + fill_nei_a, + magic_number + ); } ///////////////////////////////////////////////////////////////////////////////////////////// @@ -277,6 +261,66 @@ class DescrptSeAOp : public OpKernel { sec[ii] = sec[ii-1] + n_sel[ii-1]; } } + + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + } + cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_GPU), DescrptSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU), \ + DescrptSeAOp); +REGISTER_CPU(); + +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ + DescrptSeAOp); +REGISTER_GPU(); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc index 34d4183f98..8aece7e384 100644 --- a/source/op/gelu_gpu.cc +++ b/source/op/gelu_gpu.cc @@ -1,3 +1,4 @@ +#include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -7,6 +8,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; +#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") .Input("x: T") @@ -25,35 +27,76 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); +#if GOOGLE_CUDA // maybe instead use cudnn activation forward -void GeluLauncher(const float * in, float * out, int const size); -void GeluLauncher(const double * in, double * out, int const size); +void GeluGPULauncher(const float * in, float * out, int const size); +void GeluGPULauncher(const double * in, double * out, int const size); -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size); -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +#endif // GOOGLE_CUDA + +template +void GeluCPULauncher(const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); + } +} + +template +void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } +} + +template +void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } +} template struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { - GeluLauncher(in, out, size); + void operator()(const CPUDevice& d, const T * in, T * out, int const size) { + GeluCPULauncher(in, out, size); } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * in, T * out, int const size) { + GeluGPULauncher(in, out, size); + } + #endif }; template struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { - GeluGradLauncher(dy, in, out, size); + void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradCPULauncher(dy, in, out, size); } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradGPULauncher(dy, in, out, size); + } + #endif }; template struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradLauncher(dy, dy_, in, out, size); + void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradCPULauncher(dy, dy_, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradGPULauncher(dy, dy_, in, out, size); } + #endif }; // OpKernel definition. @@ -78,7 +121,6 @@ class GeluOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -107,7 +149,6 @@ class GeluGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -138,11 +179,27 @@ class GeluGradGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; -#define REGISTER_GPU(T) \ +#define REGISTER_CPU(T) \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ /* Declare explicit instantiations in kernel_example.cu.cc. */ \ REGISTER_KERNEL_BUILDER( \ Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -157,3 +214,4 @@ class GeluGradGradOp : public OpKernel { GeluGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_gpu.cc index 2d159a8505..c5c6c5c6b7 100644 --- a/source/op/prod_force_se_a_gpu.cc +++ b/source/op/prod_force_se_a_gpu.cc @@ -1,26 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" #ifdef HIGH_PREC REGISTER_OP("ProdForceSeA") @@ -42,17 +21,19 @@ REGISTER_OP("ProdForceSeA") .Output("force: float"); #endif -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); +template +struct ProdForceSeAFunctor { + void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -76,8 +57,7 @@ class ProdForceSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape().dim_size(0); @@ -117,23 +97,36 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - for (int II = 0; II < nframes; II++) { - ProdForceSeALauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdForceSeAFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_GPU), ProdForceSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU), \ + ProdForceSeAOp); +REGISTER_CPU(); + +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ + ProdForceSeAOp); +REGISTER_GPU(); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_gpu.cc index 42f70d06d2..34a7a03574 100644 --- a/source/op/prod_virial_se_a_gpu.cc +++ b/source/op/prod_virial_se_a_gpu.cc @@ -1,14 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" #ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") @@ -34,31 +25,19 @@ REGISTER_OP("ProdVirialSeA") .Output("atom_virial: float"); #endif -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); +template +struct ProdVirialSeAFunctor { + void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } -} - -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -84,8 +63,7 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nnei = nlist_tensor.shape().dim_size(1) / nloc; @@ -121,25 +99,38 @@ class ProdVirialSeAOp : public OpKernel { auto virial = virial_tensor->flat(); auto atom_virial = atom_virial_tensor->flat(); - for (int II = 0; II < nframes; II++) { - ProdVirialSeALauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdVirialSeAFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_GPU), ProdVirialSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU), \ + ProdVirialSeAOp); +REGISTER_CPU(); + +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ + ProdVirialSeAOp); +REGISTER_GPU(); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index e46265231f..1c93e92f02 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -391,14 +391,18 @@ def _filter(self, xyz_scatter = tf.matmul(xyz_scatter, w) # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) - xyz_scatter_total.append(xyz_scatter) + # xyz_scatter_total.append(xyz_scatter) + if type_i == 0 : + xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + else : + xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) # natom x nei x outputs_size - xyz_scatter = tf.concat(xyz_scatter_total, axis=1) + # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) # natom x nei x 4 - inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) + # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) # natom x 4 x outputs_size - xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) + # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) xyz_scatter_1 = xyz_scatter_1 * (4.0 / shape[1]) # natom x 4 x outputs_size_2 xyz_scatter_2 = tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2]) From b55309f10434566b14e0068255b6982da4e50094 Mon Sep 17 00:00:00 2001 From: Lu Date: Fri, 11 Sep 2020 04:16:31 +0800 Subject: [PATCH 2/3] use standard operator style --- source/lib/include/CustomeOperation.h | 111 ++++++----- source/lib/include/DeviceFunctor.h | 47 +++++ source/op/CMakeLists.txt | 2 +- source/op/cuda/descrpt_se_a.cu | 186 +----------------- source/op/cuda/gelu.cu | 49 +++-- source/op/cuda/prod_force_se_a.cu | 50 +---- source/op/cuda/prod_virial_se_a.cu | 63 ++---- source/op/descrpt.cc | 64 +++--- source/op/descrpt_se_a.cc | 77 ++++---- ..._a_gpu.cc => descrpt_se_a_multi_device.cc} | 81 +++----- source/op/descrpt_se_r.cc | 55 +++--- source/op/descrpt_se_r_gpu.cc | 39 ++-- source/op/ewald_recp.cc | 60 +++--- source/op/gelu_gpu.cc | 90 ++------- source/op/gelu_multi_device.cc | 167 ++++++++++++++++ source/op/prod_force.cc | 36 ++-- source/op/prod_force_grad.cc | 39 ++-- source/op/prod_force_se_a.cc | 38 ++-- source/op/prod_force_se_a_grad.cc | 30 ++- ...gpu.cc => prod_force_se_a_multi_device.cc} | 59 +++--- source/op/prod_force_se_r.cc | 32 +-- source/op/prod_force_se_r_gpu.cc | 31 +-- source/op/prod_force_se_r_grad.cc | 36 ++-- source/op/prod_virial.cc | 60 +++--- source/op/prod_virial_grad.cc | 34 ++-- source/op/prod_virial_se_a.cc | 51 +++-- source/op/prod_virial_se_a_grad.cc | 44 ++--- ...pu.cc => prod_virial_se_a_multi_device.cc} | 73 +++---- source/op/prod_virial_se_r.cc | 47 ++--- source/op/prod_virial_se_r_gpu.cc | 33 ++-- source/op/prod_virial_se_r_grad.cc | 32 ++- source/op/soft_min.cc | 71 +++---- source/op/soft_min_force.cc | 35 ++-- source/op/soft_min_force_grad.cc | 39 ++-- source/op/soft_min_virial.cc | 37 ++-- source/op/soft_min_virial_grad.cc | 44 ++--- source/op/tab_inter.cc | 53 +++-- 37 files changed, 935 insertions(+), 1160 deletions(-) create mode 100644 source/lib/include/DeviceFunctor.h rename source/op/{descrpt_se_a_gpu.cc => descrpt_se_a_multi_device.cc} (89%) create mode 100644 source/op/gelu_multi_device.cc rename source/op/{prod_force_se_a_gpu.cc => prod_force_se_a_multi_device.cc} (80%) rename source/op/{prod_virial_se_a_gpu.cc => prod_virial_se_a_multi_device.cc} (77%) diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index ef8b450db4..c0bd8fdbd5 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -1,22 +1,16 @@ #pragma once #include -#include #include #include #include #include "MathUtilities.h" - #if GOOGLE_CUDA -#include -#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #endif // GOOGLE_CUDA +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + struct NeighborInfo { int type; double dist; @@ -237,45 +231,14 @@ void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, nlist[ii * nnei + jj] = fmt_nlist_a[jj]; } } - - #if DEBUG - std::fstream fout1("nlist.txt", std::ios::out); - fout1 << "tensor nlist, length:\t" << nloc << ",\twidth:\t" << nnei << std::endl; - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < nnei; jj++) { - fout1 << "nlist[" << ii << "][" << jj << "]:\t" << nlist[ii * nnei + jj] << std::endl; - } - } - fout1.close(); - - std::fstream fout2("rij.txt", std::ios::out); - fout2 << "tensor rij, length:\t" << nloc << ",\twidth:\t" << nnei * 3 << std::endl; - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < nnei * 3; jj++) { - fout2 << "rij[" << ii << "][" << jj << "]:\t" << rij[ii * nnei * 3 + jj] << std::endl; - } - } - fout2.close(); - - std::fstream fout3("descrpt.txt", std::ios::out); - fout3 << "tensor descrpt, length:\t" << nloc << ",\twidth:\t" << ndescrpt << std::endl; - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < ndescrpt; jj++) { - fout3 << "descrpt[" << ii << "][" << jj << "]:\t" << descrpt[ii * ndescrpt + jj] << std::endl; - } - } - fout3.close(); - #endif // DEBUG } -extern void DescrptSeAGPUExecuteLauncher(const float * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const float * avg, const float * std, float * descrpt, float * descrpt_deriv, float * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); -extern void DescrptSeAGPUExecuteLauncher(const double * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const double * avg, const double * std, double * descrpt, double * descrpt_deriv, double * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); - +#if GOOGLE_CUDA template void DescrptSeAGPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeAGPUExecuteLauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } - +#endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op DescrptSeA // ****************************************************************************** @@ -317,14 +280,12 @@ void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, } } -extern void ProdForceSeAGPUExecuteLauncher(float * force, const float * net_derive, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); -extern void ProdForceSeAGPUExecuteLauncher(double * force, const double * net_derive, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); - - +#if GOOGLE_CUDA template void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdForceSeAGPUExecuteLauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } +#endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op ProdForceSeA @@ -356,14 +317,56 @@ void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, } } -extern void ProdVirialSeAGPUExecuteLauncher(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); -extern void ProdVirialSeAGPUExecuteLauncher(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); - +#if GOOGLE_CUDA template void ProdVirialSeAGPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdVirialSeAGPUExecuteLauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } - +#endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op ProdVirialSeA // ****************************************************************************** + +template +void GeluCPULauncher(const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); + } +} + +template +void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } +} + +template +void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } +} + +#if GOOGLE_CUDA +template +void GeluGPULauncher(const T * in, T * out, int const size) { + GeluGPUExecuteFunctor()(in, out, size); +} + +template +void GeluGradGPULauncher(const T * dy, const T * in, T * out, int const size) { + GeluGradGPUExecuteFunctor()(dy, in, out, size); +} + +template +void GeluGradGradGPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op Gelu +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h new file mode 100644 index 0000000000..f02ef55651 --- /dev/null +++ b/source/lib/include/DeviceFunctor.h @@ -0,0 +1,47 @@ +#pragma once +#include +#include +#include +#include +#include + +typedef unsigned long long int_64; +#define SQRT_2_PI 0.7978845608028654 + +#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +template +struct DescrptSeAGPUExecuteFunctor { + void operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template +struct ProdForceSeAGPUExecuteFunctor { + void operator()(T * force, const T * net_derive, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdVirialSeAGPUExecuteFunctor { + void operator()(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct GeluGPUExecuteFunctor { + void operator()(const T * in, T * out, const int size); +}; + +template +struct GeluGradGPUExecuteFunctor { + void operator()(const T * dy, const T * in, T * out, const int size); +}; + +template +struct GeluGradGradGPUExecuteFunctor { + void operator()(const T * dy, const T * dy_, const T * in, T * out, const int size); +}; \ No newline at end of file diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 993a1b6fd4..21f675c845 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -5,7 +5,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 7fa84335e5..afed0dbe83 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -1,22 +1,7 @@ -#include -#include -#include #include #include #include -#include - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -413,30 +398,8 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -void DescrptSeAGPUExecuteLauncher(const float * coord, - const int * type, - const int * ilist, - const int * jrange, - const int * jlist, - int * array_int, - unsigned long long * array_longlong, - const float * avg, - const float * std, - float * descript, - float * descript_deriv, - float * rij, - int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const float rcut_r, - const float rcut_r_smth, - const std::vector sec_a, - const bool fill_nei_a, - const int MAGIC_NUMBER -) -{ +template +void DescrptSeAGPUExecuteFunctor::operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -448,8 +411,8 @@ void DescrptSeAGPUExecuteLauncher(const float * coord, res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(float) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(float) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(T) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(T) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ @@ -547,140 +510,7 @@ void DescrptSeAGPUExecuteLauncher(const float * coord, rcut_r, sec_a.back() ); -} - -void DescrptSeAGPUExecuteLauncher(const double * coord, - const int * type, - const int * ilist, - const int * jrange, - const int * jlist, - int * array_int, - unsigned long long * array_longlong, - const double * avg, - const double * std, - double * descript, - double * descript_deriv, - double * rij, - int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const float rcut_r, - const float rcut_r_smth, - const std::vector sec_a, - const bool fill_nei_a, - const int MAGIC_NUMBER -) -{ - const int LEN = 256; - int nblock = (nloc + LEN -1) / LEN; - int * sec_a_dev = array_int; - int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; - int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); - int_64 * key = array_longlong; - - cudaError_t res = cudaSuccess; - res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); - res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); - res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(double) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(double) * nloc * ndescrpt * 3); cudaErrcheck(res); - - if (fill_nei_a) { - // ~~~ - // cudaProfilerStart(); - get_i_idx_se_a<<>> (nloc, ilist, i_idx); - - if (nnei <= 256) { - format_nbor_list_256 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 512) { - format_nbor_list_512 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 1024) { - format_nbor_list_1024 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 2048) { - format_nbor_list_2048 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 4096) { - format_nbor_list_4096 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } - - format_nlist_fill_b_se_a<<>> ( - nlist, - nnei, - nloc, - jrange, - jlist, - key, - sec_a_dev, - sec_a.size(), - nei_iter, - MAGIC_NUMBER - ); - } +} - const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nloc, nblock_); - dim3 thread_grid(1, LEN); - compute_descriptor_se_a<<>> ( - descript, - ndescrpt, - descript_deriv, - ndescrpt * 3, - rij, - nnei * 3, - type, - avg, - std, - nlist, - nnei, - coord, - rcut_r_smth, - rcut_r, - sec_a.back() - ); -} \ No newline at end of file +template struct DescrptSeAGPUExecuteFunctor; +template struct DescrptSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index e16a285034..078a31da33 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -1,7 +1,4 @@ -#include -#include - -#define SQRT_2_PI 0.7978845608028654 +#include "DeviceFunctor.h" template __global__ void gelu(const T * in, T * out, int const size) { @@ -33,45 +30,75 @@ __global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * ou out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); } - -void GeluGPULauncher(const float * in, float * out, int const size) { +void GeluLauncher(const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluGPULauncher(const double * in, double * out, int const size) { +void GeluLauncher(const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size) { +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size) { +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(dy, dy_, in, out, size); } -void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(dy, dy_, in, out, size); } + +template +void GeluGPUExecuteFunctor::operator()(const T * in, T * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +template +void GeluGradGPUExecuteFunctor::operator()(const T * dy, const T * in, T * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +template +void GeluGradGradGPUExecuteFunctor::operator()(const T * dy, const T * dy_, const T * in, T * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad_grad<<>>(dy, dy_, in, out, size); +} + +template struct GeluGPUExecuteFunctor; +template struct GeluGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 926f30604b..ee826449af 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -1,16 +1,4 @@ -#include -#include -#include - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -69,9 +57,10 @@ __global__ void deriv_wrt_neighbors_se_a(T * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -void ProdForceSeAGPUExecuteLauncher(float * force, - const float * net_deriv, - const float * in_deriv, +template +void ProdForceSeAGPUExecuteFunctor::operator()(T * force, + const T * net_deriv, + const T * in_deriv, const int * nlist, const int nloc, const int nall, @@ -81,7 +70,7 @@ void ProdForceSeAGPUExecuteLauncher(float * force, const int n_a_shift) { // std::cout << "I'm here!" << std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(float) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(T) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; dim3 grid(nloc, nblock1); @@ -95,28 +84,5 @@ void ProdForceSeAGPUExecuteLauncher(float * force, deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); } -void ProdForceSeAGPUExecuteLauncher(double * force, - const double * net_deriv, - const double * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) -{ - // std::cout << "I'm here!" << std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(double) * nall * 3)); - const int LEN1 = 256; - const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nloc, nblock1); - dim3 thread(3, LEN1); - deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); - - const int LEN = 64; - int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 3, 4); - deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file +template struct ProdForceSeAGPUExecuteFunctor; +template struct ProdForceSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index d2524a3c81..5825120970 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -1,13 +1,4 @@ -#include -#include - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -57,11 +48,12 @@ __global__ void deriv_wrt_neighbors_se_a(T * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -void ProdVirialSeAGPUExecuteLauncher(float * virial, - float * atom_virial, - const float * net_deriv, - const float * in_deriv, - const float * rij, +template +void ProdVirialSeAGPUExecuteFunctor::operator()(T * virial, + T * atom_virial, + const T * net_deriv, + const T * in_deriv, + const T * rij, const int * nlist, const int nloc, const int nall, @@ -70,8 +62,8 @@ void ProdVirialSeAGPUExecuteLauncher(float * virial, const int n_a_sel, const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(float) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(float) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(T) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(T) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; @@ -93,38 +85,5 @@ void ProdVirialSeAGPUExecuteLauncher(float * virial, ); } -void ProdVirialSeAGPUExecuteLauncher(double * virial, - double * atom_virial, - const double * net_deriv, - const double * in_deriv, - const double * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) -{ - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(double) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(double) * 9 * nall)); - - const int LEN = 16; - int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 9, 4); - // compute virial of a frame - deriv_wrt_neighbors_se_a<<>>( - virial, - atom_virial, - net_deriv, - in_deriv, - rij, - nlist, - nloc, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); -} +template struct ProdVirialSeAGPUExecuteFunctor; +template struct ProdVirialSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 75c7640b2b..c59ba817b1 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -18,48 +18,30 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif -#ifdef HIGH_PREC -REGISTER_OP("Descrpt") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("axis_rule: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32") -.Output("axis: int32") -.Output("rot_mat: double"); -#else +using CPUDevice = Eigen::ThreadPoolDevice; + REGISTER_OP("Descrpt") -.Input("coord: float") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") .Attr("axis_rule: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32") .Output("axis: int32") -.Output("rot_mat: float"); -#endif +.Output("rot_mat: T"); +template class DescrptOp : public OpKernel { public: explicit DescrptOp(OpKernelConstruction* context) : OpKernel(context) { @@ -182,18 +164,18 @@ class DescrptOp : public OpKernel { Tensor* rot_mat_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(5, rot_mat_shape, &rot_mat_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); auto axis = axis_tensor ->matrix(); - auto rot_mat = rot_mat_tensor ->matrix(); + auto rot_mat = rot_mat_tensor ->matrix(); // // check the types // int max_type_v = 0; @@ -624,5 +606,9 @@ class DescrptOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("Descrpt").Device(DEVICE_CPU), DescrptOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index b970f6dbf0..3dca8040aa 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -18,44 +18,29 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeA") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + REGISTER_OP("DescrptSeA") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -180,15 +165,15 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -369,5 +354,9 @@ class DescrptSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_CPU), DescrptSeAOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_multi_device.cc similarity index 89% rename from source/op/descrpt_se_a_gpu.cc rename to source/op/descrpt_se_a_multi_device.cc index 9c52a62474..0069bb5375 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_multi_device.cc @@ -1,44 +1,25 @@ #include "common.h" #include "CustomeOperation.h" -#ifdef HIGH_PREC REGISTER_OP("DescrptSeA") - .Input("coord: double") //atomic coordinates + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates .Input("type: int32") //atomic type .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers - .Input("box : double") + .Input("box : T") .Input("mesh : int32") - .Input("davg: double") //average value of data - .Input("dstd: double") //standard deviation + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation .Attr("rcut_a: float") //no use .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") //all zero - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") .Output("nlist: int32"); // only sel_a and rcut_r uesd. -#else -REGISTER_OP("DescrptSeA") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box : float") - .Input("mesh : int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut_a: float") - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif int get_magic_number(int const nnei) { if (nnei <= 256) { @@ -58,7 +39,6 @@ int get_magic_number(int const nnei) { } } -template struct DeviceFunctor { void operator()(const CPUDevice& d, std::string& device) { device = "CPU"; @@ -70,7 +50,7 @@ struct DeviceFunctor { #endif // GOOGLE_CUDA }; -template +template struct DescrptSeAFunctor { void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); @@ -80,10 +60,10 @@ struct DescrptSeAFunctor { void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA }; -template +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -130,7 +110,7 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - DeviceFunctor() ( + DeviceFunctor() ( context->eigen_device(), device ); @@ -211,9 +191,9 @@ class DescrptSeAOp : public OpKernel { memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); } - DescrptSeAFunctor()( + DescrptSeAFunctor()( context->eigen_device(), // define actually graph execution device - coord_tensor.matrix().data(), // related to the kk argument + coord_tensor.matrix().data(), // related to the kk argument type_tensor.matrix().data(), // also related to the kk argument mesh_tensor.flat().data(), ilist, @@ -221,11 +201,11 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data(), - descrpt_deriv_tensor->matrix().data(), - rij_tensor->matrix().data(), + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), nlist_tensor->matrix().data(), nloc, nall, @@ -310,17 +290,18 @@ class DescrptSeAOp : public OpKernel { }; // Register the CPU kernels. -#define REGISTER_CPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("DescrptSeA").Device(DEVICE_CPU), \ - DescrptSeAOp); -REGISTER_CPU(); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. #if GOOGLE_CUDA -#define REGISTER_GPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("DescrptSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ - DescrptSeAOp); -REGISTER_GPU(); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 6798df503c..7a8dbb1541 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -18,39 +18,26 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif +using CPUDevice = Eigen::ThreadPoolDevice; + REGISTER_OP("DescrptSeR") -#ifdef HIGH_PREC -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut: float") -.Attr("rcut_smth: float") -.Attr("sel: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else -.Input("coord: float") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut: float") .Attr("rcut_smth: float") .Attr("sel: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#endif +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -169,15 +156,15 @@ class DescrptSeROp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); @@ -351,5 +338,9 @@ class DescrptSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_CPU), DescrptSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc index 65e2682ef0..bf2f5b0b12 100644 --- a/source/op/descrpt_se_r_gpu.cc +++ b/source/op/descrpt_se_r_gpu.cc @@ -31,8 +31,8 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") .Input("coord: double") .Input("type: int32") .Input("natoms: int32") @@ -47,23 +47,7 @@ REGISTER_OP("DescrptSeR") .Output("descrpt_deriv: double") .Output("rij: double") .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeR") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: float") - .Input("mesh: int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif + void DescrptSeRLauncher(const VALUETYPE* coord, const int* type, @@ -90,6 +74,7 @@ void DescrptSeRLauncher(const VALUETYPE* coord, const bool& fill_nei_a ); +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -193,7 +178,7 @@ class DescrptSeROp : public OpKernel { // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); // Launch computation for (int II = 0; II < nsamples; II++) { - DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), + DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), type_tensor.matrix().data() + II * nall, ilist, jrange, @@ -201,11 +186,11 @@ class DescrptSeROp : public OpKernel { array_int, array_longlong, array_double, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), + descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), + rij_tensor->matrix().data() + II * (nloc * nnei * 3), nlist_tensor->matrix().data() + II * (nloc * nnei), ntypes, nloc, @@ -244,4 +229,8 @@ class DescrptSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_GPU), DescrptSeROp); \ No newline at end of file +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc index 29daaa53c8..3833de1f7b 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -16,30 +16,21 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif -#ifdef HIGH_PREC -REGISTER_OP("EwaldRecp") -.Input("coord: double") -.Input("charge: double") -.Input("natoms: int32") -.Input("box: double") -.Attr("ewald_beta: float") -.Attr("ewald_h: float") -.Output("energy: double") -.Output("force: double") -.Output("virial: double"); -#else +using CPUDevice = Eigen::ThreadPoolDevice; + REGISTER_OP("EwaldRecp") -.Input("coord: float") -.Input("charge: float") +.Attr("T: {float, double}") +.Input("coord: T") +.Input("charge: T") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Attr("ewald_beta: float") .Attr("ewald_h: float") -.Output("energy: float") -.Output("force: float") -.Output("virial: float"); -#endif +.Output("energy: T") +.Output("force: T") +.Output("virial: T"); +template class EwaldRecpOp : public OpKernel { public: explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { @@ -90,12 +81,12 @@ class EwaldRecpOp : public OpKernel { Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); - auto coord = coord_tensor .flat(); - auto charge = charge_tensor .flat(); - auto box = box_tensor .flat(); - auto energy = energy_tensor ->flat(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); for (int kk = 0; kk < nsamples; ++kk){ int box_iter = kk * 9; @@ -122,19 +113,19 @@ class EwaldRecpOp : public OpKernel { else if (inter[dd] >= 1) inter[dd] -= 1.; } } - vector d_coord3 (nloc*3); + vector d_coord3 (nloc*3); for (int ii = 0; ii < nloc * 3; ++ii) { d_coord3[ii] = d_coord3_[ii]; } // set charge - vector d_charge (nloc); + vector d_charge (nloc); for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); // prepare outputs vectors - VALUETYPE d_ener; - vector d_force(nloc*3); - vector d_virial(9); + T d_ener; + vector d_force(nloc*3); + vector d_virial(9); // compute EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); @@ -150,8 +141,11 @@ class EwaldRecpOp : public OpKernel { } } private: - EwaldParameters ep; + EwaldParameters ep; }; -REGISTER_KERNEL_BUILDER(Name("EwaldRecp").Device(DEVICE_CPU), EwaldRecpOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("EwaldRecp").Device(DEVICE_CPU).TypeConstraint("T"), \ + EwaldRecpOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc index 8aece7e384..34d4183f98 100644 --- a/source/op/gelu_gpu.cc +++ b/source/op/gelu_gpu.cc @@ -1,4 +1,3 @@ -#include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -8,7 +7,6 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") .Input("x: T") @@ -27,76 +25,35 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -#if GOOGLE_CUDA // maybe instead use cudnn activation forward -void GeluGPULauncher(const float * in, float * out, int const size); -void GeluGPULauncher(const double * in, double * out, int const size); +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); -void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); -void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); -#endif // GOOGLE_CUDA - -template -void GeluCPULauncher(const T * in, T * out, int const size) { - for (int ii = 0; ii < size; ii++) { - out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); - } -} - -template -void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { - for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); - } -} - -template -void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { - for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); - out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); - } -} +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); template struct GeluFunctor { - void operator()(const CPUDevice& d, const T * in, T * out, int const size) { - GeluCPULauncher(in, out, size); + void operator()(const Device& d, const T * in, T * out, int const size) { + GeluLauncher(in, out, size); } - #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * in, T * out, int const size) { - GeluGPULauncher(in, out, size); - } - #endif }; template struct GeluGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { - GeluGradCPULauncher(dy, in, out, size); + void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + GeluGradLauncher(dy, in, out, size); } - #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { - GeluGradGPULauncher(dy, in, out, size); - } - #endif }; template struct GeluGradGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradCPULauncher(dy, dy_, in, out, size); - } - #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradGPULauncher(dy, dy_, in, out, size); + void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); } - #endif }; // OpKernel definition. @@ -121,6 +78,7 @@ class GeluOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -149,6 +107,7 @@ class GeluGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -179,27 +138,11 @@ class GeluGradGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); + // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; -#define REGISTER_CPU(T) \ -/* Declare explicit instantiations in kernel_example.cu.cc. */ \ -REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluOp); \ -/* Declare explicit instantiations in kernel_example.cu.cc. */ \ -REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradOp); \ -/* Declare explicit instantiations in kernel_example.cu.cc. */ \ -REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradGradOp); -REGISTER_CPU(float); -REGISTER_CPU(double); - -#if GOOGLE_CUDA -#define REGISTER_GPU(T) \ +#define REGISTER_GPU(T) \ /* Declare explicit instantiations in kernel_example.cu.cc. */ \ REGISTER_KERNEL_BUILDER( \ Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -214,4 +157,3 @@ REGISTER_CPU(double); GeluGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc new file mode 100644 index 0000000000..2a5656e6d7 --- /dev/null +++ b/source/op/gelu_multi_device.cc @@ -0,0 +1,167 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +template +struct GeluFunctor { + void operator()(const CPUDevice& d, const T * in, T * out, int const size) { + GeluCPULauncher(in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * in, T * out, int const size) { + GeluGPULauncher(in, out, size); + } + #endif +}; + +template +struct GeluGradFunctor { + void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradCPULauncher(dy, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradGPULauncher(dy, in, out, size); + } + #endif +}; + +template +struct GeluGradGradFunctor { + void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradCPULauncher(dy, dy_, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradGPULauncher(dy, dy_, in, out, size); + } + #endif +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index e41b908d41..e1c6c362c8 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -12,30 +12,23 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForce") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForce") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceOp : public OpKernel { public: explicit ProdForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -86,11 +79,11 @@ class ProdForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); // loop over samples #pragma omp parallel for @@ -176,7 +169,10 @@ class ProdForceOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForce").Device(DEVICE_CPU), ProdForceOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc index 14e884c5c9..48f55a977b 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForceGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdForceGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceGradOp : public OpKernel { public: @@ -97,12 +88,12 @@ class ProdForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -190,4 +181,8 @@ class ProdForceGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceGrad").Device(DEVICE_CPU), ProdForceGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc index af0e712492..c762bf71fd 100644 --- a/source/op/prod_force_se_a.cc +++ b/source/op/prod_force_se_a.cc @@ -12,28 +12,23 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForceSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -83,10 +78,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -155,7 +150,10 @@ class ProdForceSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_CPU), ProdForceSeAOp); - - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index eda965974a..d884782f17 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -12,28 +12,20 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeAGradOp : public OpKernel { public: @@ -158,4 +150,8 @@ class ProdForceSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeAGrad").Device(DEVICE_CPU), ProdForceSeAGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_multi_device.cc similarity index 80% rename from source/op/prod_force_se_a_gpu.cc rename to source/op/prod_force_se_a_multi_device.cc index c5c6c5c6b7..a864617208 100644 --- a/source/op/prod_force_se_a_gpu.cc +++ b/source/op/prod_force_se_a_multi_device.cc @@ -1,27 +1,17 @@ #include "common.h" #include "CustomeOperation.h" -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeA") - .Input("net_deriv: double") - .Input("in_deriv: double") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("force: double"); -#else -REGISTER_OP("ProdForceSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("force: float"); -#endif + .Output("force: T"); -template +template struct ProdForceSeAFunctor { void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); @@ -33,7 +23,7 @@ struct ProdForceSeAFunctor { #endif // GOOGLE_CUDA }; -template +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -82,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -97,11 +87,11 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - ProdForceSeAFunctor()( + ProdForceSeAFunctor()( context->eigen_device(), - force_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, @@ -116,17 +106,18 @@ class ProdForceSeAOp : public OpKernel { }; // Register the CPU kernels. -#define REGISTER_CPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdForceSeA").Device(DEVICE_CPU), \ - ProdForceSeAOp); -REGISTER_CPU(); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. #if GOOGLE_CUDA -#define REGISTER_GPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdForceSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ - ProdForceSeAOp); -REGISTER_GPU(); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc index b4933c5b4a..a8e5b69162 100644 --- a/source/op/prod_force_se_r.cc +++ b/source/op/prod_force_se_r.cc @@ -13,22 +13,18 @@ typedef float VALUETYPE; #endif REGISTER_OP("ProdForceSeR") -#ifdef HIGH_PREC -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("force: double"); -#else -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("force: float"); -#endif +.Output("force: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -73,10 +69,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -126,7 +122,11 @@ class ProdForceSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_CPU), ProdForceSeROp); - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_gpu.cc index 3dc0bc7853..8f6ee24910 100644 --- a/source/op/prod_force_se_r_gpu.cc +++ b/source/op/prod_force_se_r_gpu.cc @@ -22,21 +22,16 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: double"); -#else REGISTER_OP("ProdForceSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: float"); -#endif +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("nlist: int32") +.Input("natoms: int32") +.Output("force: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; void ProdForceSeRLauncher(VALUETYPE * force, const VALUETYPE * net_deriv, @@ -49,6 +44,7 @@ void ProdForceSeRLauncher(VALUETYPE * force, const int n_a_sel, const int n_a_shift); +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -130,4 +126,9 @@ class ProdForceSeROp : public OpKernel { int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_GPU), ProdForceSeROp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 3866ef9b86..488492f699 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -12,24 +12,18 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeRGradOp : public OpKernel { public: @@ -83,11 +77,11 @@ class ProdForceSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -131,4 +125,8 @@ class ProdForceSeRGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeRGrad").Device(DEVICE_CPU), ProdForceSeRGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeRGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index 55b0b4b60d..8f607e0cf0 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -12,36 +12,25 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdVirial") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("ProdVirial") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialOp : public OpKernel { public: explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -100,13 +89,13 @@ class ProdVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -144,10 +133,10 @@ class ProdVirialOp : public OpKernel { if (j_idx < 0) continue; if (jj == axis_0) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -156,10 +145,10 @@ class ProdVirialOp : public OpKernel { } else if (jj == axis_1) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -170,10 +159,10 @@ class ProdVirialOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -201,7 +190,10 @@ class ProdVirialOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirial").Device(DEVICE_CPU), ProdVirialOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index 5257467029..3d8c7e4639 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -12,32 +12,22 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdVirialGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialGradOp : public OpKernel { public: @@ -200,4 +190,8 @@ class ProdVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialGrad").Device(DEVICE_CPU), ProdVirialGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index 2f71d37505..d975913d88 100644 --- a/source/op/prod_virial_se_a.cc +++ b/source/op/prod_virial_se_a.cc @@ -12,34 +12,24 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -95,12 +85,12 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -131,10 +121,10 @@ class ProdVirialSeAOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -161,7 +151,12 @@ class ProdVirialSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_CPU), ProdVirialSeAOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 660f652566..0f506fc51a 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeAGradOp : public OpKernel { public: @@ -98,12 +89,12 @@ class ProdVirialSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -162,4 +153,9 @@ class ProdVirialSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeAGrad").Device(DEVICE_CPU), ProdVirialSeAGradOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_multi_device.cc similarity index 77% rename from source/op/prod_virial_se_a_gpu.cc rename to source/op/prod_virial_se_a_multi_device.cc index 34a7a03574..21cd78c83c 100644 --- a/source/op/prod_virial_se_a_gpu.cc +++ b/source/op/prod_virial_se_a_multi_device.cc @@ -1,31 +1,19 @@ #include "common.h" #include "CustomeOperation.h" -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("virial: double") - .Output("atom_virial: double"); -#else -REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("virial: float") - .Output("atom_virial: float"); -#endif + .Output("virial: T") + .Output("atom_virial: T"); -template +template struct ProdVirialSeAFunctor { void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); @@ -37,7 +25,7 @@ struct ProdVirialSeAFunctor { #endif // GOOGLE_CUDA }; -template +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -92,20 +80,20 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); - ProdVirialSeAFunctor()( + ProdVirialSeAFunctor()( context->eigen_device(), - virial_tensor->flat().data(), - atom_virial_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), - rij_tensor.flat().data(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, @@ -120,17 +108,18 @@ class ProdVirialSeAOp : public OpKernel { }; // Register the CPU kernels. -#define REGISTER_CPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdVirialSeA").Device(DEVICE_CPU), \ - ProdVirialSeAOp); -REGISTER_CPU(); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. #if GOOGLE_CUDA -#define REGISTER_GPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdVirialSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ - ProdVirialSeAOp); -REGISTER_GPU(); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index 1d21234421..63b74398ea 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -87,12 +78,12 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -119,10 +110,10 @@ class ProdVirialSeROp : public OpKernel { for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (nlist_iter + i_idx * nnei + jj); if (j_idx < 0) continue; - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -133,7 +124,11 @@ class ProdVirialSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_CPU), ProdVirialSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_gpu.cc index 91f965b72c..6324bcf88d 100644 --- a/source/op/prod_virial_se_r_gpu.cc +++ b/source/op/prod_virial_se_r_gpu.cc @@ -10,28 +10,21 @@ typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") - .Output("virial: double") - .Output("atom_virial: double"); -#else -REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("virial: float") - .Output("atom_virial: float"); -#endif + .Output("virial: T") + .Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { @@ -55,6 +48,7 @@ void ProdVirialSeRLauncher(VALUETYPE * virial, const int n_a_sel, const int n_a_shift); +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -134,4 +128,9 @@ class ProdVirialSeROp : public OpKernel { int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_GPU), ProdVirialSeROp); \ No newline at end of file +// Register the GPU kernels. +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index 20b53cf3c9..b125c9d783 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -12,26 +12,19 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeRGradOp : public OpKernel { public: @@ -135,4 +128,9 @@ class ProdVirialSeRGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeRGrad").Device(DEVICE_CPU), ProdVirialSeRGradOp); +// Register the GPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeRGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index 6b23305d3e..17ab6bc1f5 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -14,23 +14,10 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("SoftMinSwitch") -.Input("type: int32") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("alpha: float") -.Attr("rmin: float") -.Attr("rmax: float") -.Output("sw_value: double") -.Output("sw_deriv: double"); -#else REGISTER_OP("SoftMinSwitch") +.Attr("T: {float, double}") .Input("type: int32") -.Input("rij: float") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("sel_a: list(int)") @@ -38,12 +25,14 @@ REGISTER_OP("SoftMinSwitch") .Attr("alpha: float") .Attr("rmin: float") .Attr("rmax: float") -.Output("sw_value: float") -.Output("sw_deriv: float"); -#endif +.Output("sw_value: T") +.Output("sw_deriv: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinSwitchOp : public OpKernel { public: explicit SoftMinSwitchOp(OpKernelConstruction* context) : OpKernel(context) { @@ -106,10 +95,10 @@ class SoftMinSwitchOp : public OpKernel { // flat the tensors auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto sw_value = sw_value_tensor ->matrix(); - auto sw_deriv = sw_deriv_tensor ->matrix(); + auto sw_value = sw_value_tensor ->matrix(); + auto sw_deriv = sw_deriv_tensor ->matrix(); // loop over samples #pragma omp parallel for @@ -126,26 +115,26 @@ class SoftMinSwitchOp : public OpKernel { // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; - VALUETYPE aa = 0; - VALUETYPE bb = 0; + T aa = 0; + T bb = 0; for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (i_idx * nnei + jj) * 3; - VALUETYPE dr[3] = { + T dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); + T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + T rr = sqrt(rr2); + T ee = exp(-rr / alpha); aa += ee; bb += rr * ee; } - VALUETYPE smin = bb / aa; - VALUETYPE vv, dd; - spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); + T smin = bb / aa; + T vv, dd; + spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); // value of switch sw_value(kk, i_idx) = vv; // deriv of switch distributed as force @@ -153,17 +142,17 @@ class SoftMinSwitchOp : public OpKernel { int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (ii * nnei + jj) * 3; - VALUETYPE dr[3] = { + T dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); - VALUETYPE pref_c = (1./rr - 1./alpha) * ee ; - VALUETYPE pref_d = 1./(rr * alpha) * ee; - VALUETYPE ts; + T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + T rr = sqrt(rr2); + T ee = exp(-rr / alpha); + T pref_c = (1./rr - 1./alpha) * ee ; + T pref_d = 1./(rr * alpha) * ee; + T ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv(kk, rij_idx_shift + 0) += ts * dr[0]; sw_deriv(kk, rij_idx_shift + 1) += ts * dr[1]; @@ -196,7 +185,11 @@ class SoftMinSwitchOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinSwitch").Device(DEVICE_CPU), SoftMinSwitchOp); - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinSwitch").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinSwitchOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index e51aadbc79..43344e7352 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -12,28 +12,22 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("SoftMinForce") -.Input("du: double") -.Input("sw_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("SoftMinForce") -.Input("du: float") -.Input("sw_deriv: float") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinForceOp : public OpKernel { public: explicit SoftMinForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,10 +72,10 @@ class SoftMinForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto force = force_tensor->matrix(); + auto force = force_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -118,4 +112,9 @@ class SoftMinForceOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForce").Device(DEVICE_CPU), SoftMinForceOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index 4c8a4b21ff..bfaa8aca76 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -12,28 +12,20 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinForceGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinForceGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinForceGradOp : public OpKernel { public: @@ -89,11 +81,11 @@ class SoftMinForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -125,4 +117,9 @@ class SoftMinForceGradOp : public OpKernel int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForceGrad").Device(DEVICE_CPU), SoftMinForceGradOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 193c34f981..7829c4b7ed 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -12,34 +12,23 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("SoftMinVirial") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("SoftMinVirial") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinVirialOp : public OpKernel { public: explicit SoftMinVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -135,7 +124,11 @@ class SoftMinVirialOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirial").Device(DEVICE_CPU), SoftMinVirialOp); - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index 6f8703bdee..b454612895 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinVirialGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinVirialGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinVirialGradOp : public OpKernel { public: @@ -97,12 +88,12 @@ class SoftMinVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -148,4 +139,9 @@ class SoftMinVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirialGrad").Device(DEVICE_CPU), SoftMinVirialGradOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/tab_inter.cc b/source/op/tab_inter.cc index 242e52a6e7..5e86608e9a 100644 --- a/source/op/tab_inter.cc +++ b/source/op/tab_inter.cc @@ -12,38 +12,25 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("TabInter") -.Input("table_info: double") -.Input("table_data: double") -.Input("type: int32") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Input("scale: double") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("atom_energy: double") -.Output("force: double") -.Output("atom_virial: double"); -#else REGISTER_OP("TabInter") +.Attr("T: {float, double}") .Input("table_info: double") .Input("table_data: double") .Input("type: int32") -.Input("rij: float") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Input("scale: float") +.Input("scale: T") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("atom_energy: float") -.Output("force: float") -.Output("atom_virial: float"); -#endif +.Output("atom_energy: T") +.Output("force: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + inline void tabulated_inter (double & ener, double & fscale, @@ -86,6 +73,7 @@ void tabulated_inter (double & ener, fscale *= -hi; } +template class TabInterOp : public OpKernel { public: explicit TabInterOp(OpKernelConstruction* context) : OpKernel(context) { @@ -156,15 +144,15 @@ class TabInterOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(tmp_idx++, virial_shape, &virial_tensor)); // flat the tensors - auto table_info = table_info_tensor.flat(); - auto table_data = table_data_tensor.flat(); + auto table_info = table_info_tensor.flat(); + auto table_data = table_data_tensor.flat(); auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto scale = scale_tensor .matrix(); - auto energy = energy_tensor ->matrix(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto scale = scale_tensor .matrix(); + auto energy = energy_tensor ->matrix(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(table_info(3)+0.1)), errors::InvalidArgument ("ntypes provided in table does not match deeppot")); int nspline = table_info(2)+0.1; @@ -203,7 +191,7 @@ class TabInterOp : public OpKernel { for (int tt = 0; tt < ntypes; ++tt) { for (int ii = 0; ii < natoms(2+tt); ++ii){ int i_type = type(kk, i_idx); - VALUETYPE i_scale = scale(kk, i_idx); + T i_scale = scale(kk, i_idx); assert(i_type == tt) ; int jiter = 0; // a neighbor @@ -318,7 +306,12 @@ class TabInterOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("TabInter").Device(DEVICE_CPU), TabInterOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabInter").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabInterOp); +REGISTER_CPU(VALUETYPE); From 4dd842c9e7b0cccf59221fd497da623cb18f5024 Mon Sep 17 00:00:00 2001 From: Lu Date: Sun, 13 Sep 2020 16:14:09 +0800 Subject: [PATCH 3/3] use standard styles for custome operators --- source/lib/include/CustomeOperation.h | 334 ++++++++++--- source/lib/include/DeviceFunctor.h | 39 +- source/lib/src/NNPInter.cc | 4 +- source/op/CMakeLists.txt | 5 +- source/op/cuda/descrpt_se_a.cu | 88 ++-- source/op/cuda/descrpt_se_r.cu | 455 ++++++++++++------ source/op/cuda/gelu.cu | 30 +- source/op/cuda/prod_force_se_a.cu | 26 +- source/op/cuda/prod_force_se_r.cu | 76 ++- source/op/cuda/prod_virial_se_a.cu | 28 +- source/op/cuda/prod_virial_se_r.cu | 67 +-- source/op/descrpt.cc | 27 +- source/op/descrpt_se_a.cc | 25 +- source/op/descrpt_se_a_multi_device.cc | 58 +-- source/op/descrpt_se_r.cc | 25 +- source/op/descrpt_se_r_gpu.cc | 236 --------- source/op/descrpt_se_r_multi_device.cc | 297 ++++++++++++ source/op/ewald_recp.cc | 35 +- source/op/gelu.cc | 132 +++-- source/op/gelu_gpu.cc | 159 ------ source/op/gelu_multi_device.cc | 54 +-- source/op/prod_force.cc | 17 +- source/op/prod_force_grad.cc | 19 +- source/op/prod_force_se_a.cc | 17 +- source/op/prod_force_se_a_grad.cc | 19 +- source/op/prod_force_se_a_multi_device.cc | 22 +- source/op/prod_force_se_r.cc | 17 +- source/op/prod_force_se_r_grad.cc | 19 +- ...gpu.cc => prod_force_se_r_multi_device.cc} | 109 ++--- source/op/prod_virial.cc | 33 +- source/op/prod_virial_grad.cc | 21 +- source/op/prod_virial_se_a.cc | 24 +- source/op/prod_virial_se_a_grad.cc | 21 +- source/op/prod_virial_se_a_multi_device.cc | 30 +- source/op/prod_virial_se_r.cc | 24 +- source/op/prod_virial_se_r_grad.cc | 21 +- ...pu.cc => prod_virial_se_r_multi_device.cc} | 124 ++--- source/op/soft_min.cc | 48 +- source/op/soft_min_force.cc | 18 +- source/op/soft_min_force_grad.cc | 19 +- source/op/soft_min_virial.cc | 23 +- source/op/soft_min_virial_grad.cc | 20 +- source/op/tab_inter.cc | 26 +- 43 files changed, 1513 insertions(+), 1378 deletions(-) delete mode 100644 source/op/descrpt_se_r_gpu.cc create mode 100644 source/op/descrpt_se_r_multi_device.cc delete mode 100644 source/op/gelu_gpu.cc rename source/op/{prod_force_se_r_gpu.cc => prod_force_se_r_multi_device.cc} (55%) rename source/op/{prod_virial_se_r_gpu.cc => prod_virial_se_r_multi_device.cc} (53%) diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index c0bd8fdbd5..c446db8130 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -23,11 +23,11 @@ struct NeighborInfo { } }; -template +template inline void spline5_switch ( - T & vv, - T & dd, - const T & xx, + FPTYPE & vv, + FPTYPE & dd, + const FPTYPE & xx, const float & rmin, const float & rmax) { @@ -36,8 +36,8 @@ inline void spline5_switch ( vv = 1; } else if (xx < rmax) { - T uu = (xx - rmin) / (rmax - rmin) ; - T du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -47,10 +47,10 @@ inline void spline5_switch ( } } -template +template int format_nlist_fill_se_a_cpu ( vector & fmt_nei_idx_a, - const vector & posi, + const vector & posi, const int & ntypes, const vector & type, const int & i_idx, @@ -67,12 +67,12 @@ int format_nlist_fill_se_a_cpu ( vector sel_nei; sel_nei.reserve (nei_idx_a.size()); for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { - T diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[kk]; for (int dd = 0; dd < 3; ++dd) { diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; } - T rr = sqrt(MathUtilities::dot (diff, diff)); + FPTYPE rr = sqrt(MathUtilities::dot (diff, diff)); if (rr <= rcut) { sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx)); } @@ -90,12 +90,12 @@ int format_nlist_fill_se_a_cpu ( return overflowed; } -template +template void compute_descriptor_se_a_cpu ( - vector & descrpt_a, - vector & descrpt_a_deriv, - vector & rij_a, - const vector & posi, + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, const int & ntypes, const vector & type, const int & i_idx, @@ -127,14 +127,14 @@ void compute_descriptor_se_a_cpu ( for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { if (fmt_nlist_a[nei_iter] < 0) break; - const T * rr = &rij_a[nei_iter * 3]; - T nr2 = MathUtilities::dot(rr, rr); - T inr = 1./sqrt(nr2); - T nr = nr2 * inr; - T inr2 = inr * inr; - T inr4 = inr2 * inr2; - T inr3 = inr4 * nr; - T sw, dsw; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions int idx_value = nei_iter * 4; // 4 components @@ -168,10 +168,10 @@ void compute_descriptor_se_a_cpu ( } } -template -void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +template +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { // set & normalize coord - std::vector d_coord3(nall * 3); + std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { for (int dd = 0; dd < 3; ++dd) { d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; @@ -205,11 +205,11 @@ void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, if (fill_nei_a) { format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); } - std::vector d_descrpt_a; - std::vector d_descrpt_a_deriv; - std::vector d_descrpt_r; - std::vector d_descrpt_r_deriv; - std::vector d_rij_a; + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); // check sizes @@ -234,9 +234,9 @@ void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, } #if GOOGLE_CUDA -template -void DescrptSeAGPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +template +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #endif // GOOGLE_CUDA // ****************************************************************************** @@ -254,9 +254,9 @@ inline void make_descript_range (int & idx_start, int & idx_end, const int & nei } } -template -void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - memset(force, 0.0, sizeof(T) * nall * 3); +template +void ProdForceSeACPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); // compute force of a frame for (int i_idx = 0; i_idx < nloc; ++i_idx) { // deriv wrt center atom @@ -281,9 +281,9 @@ void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, } #if GOOGLE_CUDA -template -void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +template +void ProdForceSeAGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA @@ -291,10 +291,10 @@ void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, // end of custome op ProdForceSeA // ****************************************************************************** -template -void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - memset(virial, 0.0, sizeof(T) * 9); - memset(atom_virial, 0.0, sizeof(T) * nall * 9); +template +void ProdVirialSeACPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); // compute virial of a frame for (int i_idx = 0; i_idx < nloc; ++i_idx) { @@ -305,10 +305,10 @@ void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); for (int aa = aa_start; aa < aa_end; ++aa) { - T pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; for (int dd0 = 0; dd0 < 3; ++dd0) for (int dd1 = 0; dd1 < 3; ++dd1) { - T tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; virial[dd0 * 3 + dd1] -= tmp_v; atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; } @@ -318,55 +318,255 @@ void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, } #if GOOGLE_CUDA -template -void ProdVirialSeAGPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +template +void ProdVirialSeAGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op ProdVirialSeA // ****************************************************************************** -template -void GeluCPULauncher(const T * in, T * out, int const size) { +template +void GeluCPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { for (int ii = 0; ii < size; ii++) { out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); } } -template -void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { +template +void GeluGradCPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); } } -template -void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +void GeluGradGradCPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); } } #if GOOGLE_CUDA -template -void GeluGPULauncher(const T * in, T * out, int const size) { - GeluGPUExecuteFunctor()(in, out, size); +template +void GeluGPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { + GeluGPUExecuteFunctor()(in, out, size); } -template -void GeluGradGPULauncher(const T * dy, const T * in, T * out, int const size) { - GeluGradGPUExecuteFunctor()(dy, in, out, size); +template +void GeluGradGPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGPUExecuteFunctor()(dy, in, out, size); } -template -void GeluGradGradGPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); +template +void GeluGradGradGPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); } #endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op Gelu // ****************************************************************************** + +template +void compute_descriptor_se_r_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize (sec_a.back()); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 3; // 1 components time 3 directions + int idx_value = nei_iter; // 1 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + } + } +} + +template +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_r_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +#if GOOGLE_CUDA +template +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op DescrptSeR +// ****************************************************************************** + +template +void ProdForceSeRCPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2]; + } + } +} + +#if GOOGLE_CUDA +template +void ProdForceSeRGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA + +// ****************************************************************************** +// end of custome op ProdForceSeR +// ****************************************************************************** + +template +void ProdVirialSeRCPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + jj]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op ProdVirialSeR +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h index f02ef55651..d51d617f84 100644 --- a/source/lib/include/DeviceFunctor.h +++ b/source/lib/include/DeviceFunctor.h @@ -16,32 +16,47 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } } -template +template struct DescrptSeAGPUExecuteFunctor { - void operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); }; -template +template +struct DescrptSeRGPUExecuteFunctor { + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template struct ProdForceSeAGPUExecuteFunctor { - void operator()(T * force, const T * net_derive, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdForceSeRGPUExecuteFunctor { + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); }; -template +template struct ProdVirialSeAGPUExecuteFunctor { - void operator()(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdVirialSeRGPUExecuteFunctor { + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); }; -template +template struct GeluGPUExecuteFunctor { - void operator()(const T * in, T * out, const int size); + void operator()(const FPTYPE * in, FPTYPE * out, const int size); }; -template +template struct GeluGradGPUExecuteFunctor { - void operator()(const T * dy, const T * in, T * out, const int size); + void operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, const int size); }; -template +template struct GeluGradGradGPUExecuteFunctor { - void operator()(const T * dy, const T * dy_, const T * in, T * out, const int size); + void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const int size); }; \ No newline at end of file diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index b262851450..879c36d5c0 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -203,7 +203,7 @@ init (const string & model, const int & gpu_rank) options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank)); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); @@ -518,7 +518,7 @@ init (const vector & models, const int & gpu_rank) options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank)); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); } #endif // GOOGLE_CUDA diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 21f675c845..c4d97cd85a 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,8 +4,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) -file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) @@ -27,7 +26,7 @@ if (BUILD_PY_IF) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) if (USE_CUDA_TOOLKIT) - add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB}) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) add_subdirectory(cuda) find_package(CUDA REQUIRED) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index afed0dbe83..5965254111 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -37,15 +37,15 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -template -__device__ inline void spline5_switch(T & vv, - T & dd, - T & xx, +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, const float & rmin, const float & rmax) { @@ -54,8 +54,8 @@ __device__ inline void spline5_switch(T & vv, vv = 1; } else if (xx < rmax) { - T uu = (xx - rmin) / (rmax - rmin) ; - T du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -76,8 +76,8 @@ __global__ void get_i_idx_se_a(const int nloc, i_idx[ilist[idy]] = idy; } -template -__global__ void format_nlist_fill_a_se_a(const T * coord, +template +__global__ void format_nlist_fill_a_se_a(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, @@ -100,12 +100,12 @@ __global__ void format_nlist_fill_a_se_a(const T * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - T diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - T rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -147,19 +147,19 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! -template -__global__ void compute_descriptor_se_a (T* descript, +template +__global__ void compute_descriptor_se_a (FPTYPE* descript, const int ndescrpt, - T* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - T* rij, + FPTYPE* rij, const int rij_size, const int* type, - const T* avg, - const T* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const T* coord, + const FPTYPE* coord, const float rmin, const float rmax, const int sec_a_size) @@ -172,9 +172,9 @@ __global__ void compute_descriptor_se_a (T* descript, if (idy >= sec_a_size) {return;} // else {return;} - T * row_descript = descript + idx * ndescrpt; - T * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - T * row_rij = rij + idx * rij_size; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { @@ -182,14 +182,14 @@ __global__ void compute_descriptor_se_a (T* descript, for (int kk = 0; kk < 3; kk++) { row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const T * rr = &row_rij[idy * 3 + 0]; - T nr2 = dev_dot(rr, rr); - T inr = 1./sqrt(nr2); - T nr = nr2 * inr; - T inr2 = inr * inr; - T inr4 = inr2 * inr2; - T inr3 = inr4 * nr; - T sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; @@ -228,9 +228,9 @@ __global__ void compute_descriptor_se_a (T* descript, } } -template +template void format_nbor_list_256 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -262,9 +262,9 @@ void format_nbor_list_256 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_512 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -296,9 +296,9 @@ void format_nbor_list_512 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_1024 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -330,9 +330,9 @@ void format_nbor_list_1024 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_2048 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -364,9 +364,9 @@ void format_nbor_list_2048 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_4096 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -398,8 +398,8 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template -void DescrptSeAGPUExecuteFunctor::operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { +template +void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -411,8 +411,8 @@ void DescrptSeAGPUExecuteFunctor::operator()(const T * coord, const int * typ res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(T) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(T) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index fa9678be34..a65ba5887a 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -1,45 +1,7 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU -#include -#include -#include #include #include #include -#include -#include - -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -48,7 +10,7 @@ template < __launch_bounds__ (BLOCK_THREADS) __global__ void BlockSortKernel( Key * d_in, - Key * d_out) // Tile of output + Key * d_out) // Tile of output { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) @@ -75,24 +37,25 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -104,7 +67,7 @@ __device__ inline void spline5_switch(compute_t & vv, __global__ void get_i_idx_se_r(const int nloc, const int * ilist, - int * i_idx) + int * i_idx) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; if(idy >= nloc) { @@ -113,17 +76,19 @@ __global__ void get_i_idx_se_r(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_r(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -135,12 +100,12 @@ __global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -153,9 +118,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, const int * jrange, const int * jlist, int_64 * key, - const int * sec, + const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -169,137 +135,329 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; for (int ii = 0; ii < sec_a_size; ii++) { - nei_iter[ii] = sec[ii]; + nei_iter[ii] = sec_a[ii]; } for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { const int & nei_type = key_out[kk] / 1E15; - if (nei_iter[nei_type] < sec[nei_type + 1]) { + if (nei_iter[nei_type] < sec_a[nei_type + 1]) { row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; } } } //it's ok! -__global__ void compute_descriptor_se_r (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_r (FPTYPE* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + FPTYPE* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, - compute_t* sel_diff_dev, - const int sec_size) + const FPTYPE* coord, + const float rmin, + const float rmax, + const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const int idx_deriv = idy * 3; // 1 components time 3 directions - const int idx_value = idy; // 1 components - if (idy >= sec_size) {return;} - + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const int idx_deriv = idy * 3; // 4 components time 3 directions + const int idx_value = idy; // 4 components + if (idy >= sec_a_size) {return;} + // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; - + if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_diff[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // 1 value components + // 4 value components row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; } for (int ii = 0; ii < 1; ii++) { row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; } + // idy nloc, idx ndescrpt * 3 + // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; for (int ii = 0; ii < 3; ii++) { row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -) +template +void format_nbor_list_256 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_512 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_1024 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) { + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_2048 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_4096 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; - int * sec_dev = array_int; - int * nei_iter = array_int + sec.size(); // = new int[sec_a_size]; - int * i_idx = array_int + sec.size() + nloc * sec.size(); + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei cudaError_t res = cudaSuccess; - res = cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { + // ~~~ // cudaProfilerStart(); get_i_idx_se_r<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_r<<>> ( - coord, - type, - jrange, - jlist, - rcut, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = 64; - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_r<<>> ( nlist, nnei, @@ -307,14 +465,17 @@ void DescrptSeRLauncher(const VALUETYPE* coord, jrange, jlist, key, - sec_dev, - sec.size(), - nei_iter + sec_a_dev, + sec_a.size(), + nei_iter, + MAGIC_NUMBER ); } - const int nblock_ = (sec.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - compute_descriptor_se_r<<>> ( + + const int nblock_ = (sec_a.back() + LEN -1) / LEN; + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_r<<>> ( descript, ndescrpt, descript_deriv, @@ -327,9 +488,11 @@ void DescrptSeRLauncher(const VALUETYPE* coord, nlist, nnei, coord, - rcut_smth, - rcut, - sel_diff, - sec.back() + rcut_r_smth, + rcut_r, + sec_a.back() ); -} \ No newline at end of file +} + +template struct DescrptSeRGPUExecuteFunctor; +template struct DescrptSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index 078a31da33..6329c8f085 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -1,31 +1,31 @@ #include "DeviceFunctor.h" -template -__global__ void gelu(const T * in, T * out, int const size) { +template +__global__ void gelu(const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); } -template -__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) { +template +__global__ void gelu_grad(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5); } -template -__global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +__global__ void gelu_grad_grad(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); } @@ -72,24 +72,24 @@ void GeluGradGradLauncher(const double * dy, const double * dy_, const double * gelu_grad_grad<<>>(dy, dy_, in, out, size); } -template -void GeluGPUExecuteFunctor::operator()(const T * in, T * out, int const size) { +template +void GeluGPUExecuteFunctor::operator()(const FPTYPE * in, FPTYPE * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -template -void GeluGradGPUExecuteFunctor::operator()(const T * dy, const T * in, T * out, int const size) { +template +void GeluGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -template -void GeluGradGradGPUExecuteFunctor::operator()(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +void GeluGradGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index ee826449af..1667c15f90 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -14,10 +14,10 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_center_atom_se_a(T * force, - const T * net_deriv, - const T * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { const unsigned int idx = blockIdx.x; @@ -29,10 +29,10 @@ __global__ void deriv_wrt_center_atom_se_a(T * force, atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -template -__global__ void deriv_wrt_neighbors_se_a(T * force, - const T * net_deriv, - const T * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, @@ -57,10 +57,10 @@ __global__ void deriv_wrt_neighbors_se_a(T * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -template -void ProdForceSeAGPUExecuteFunctor::operator()(T * force, - const T * net_deriv, - const T * in_deriv, +template +void ProdForceSeAGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, @@ -70,7 +70,7 @@ void ProdForceSeAGPUExecuteFunctor::operator()(T * force, const int n_a_shift) { // std::cout << "I'm here!" << std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(T) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; dim3 grid(nloc, nblock1); diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu index 765842d9c3..5a4b582dd0 100644 --- a/source/op/cuda/prod_force_se_r.cu +++ b/source/op/cuda/prod_force_se_r.cu @@ -1,21 +1,4 @@ -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -31,31 +14,29 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - - if (idy >= ndescrpt) { - return; - } + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; + if (idy >= ndescrpt) {return;} + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idy -> nnei const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -73,27 +54,30 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +void ProdForceSeRGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, - const int ndescrpt, const int nnei, - const int n_a_sel, - const int n_a_shift) -{ - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + const int ndescrpt) +{ + // std::cout << "I'm here!" << std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3); - deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file + deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); +} + +template struct ProdForceSeRGPUExecuteFunctor; +template struct ProdForceSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index 5825120970..e084720c6d 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -14,12 +14,12 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_neighbors_se_a(T * virial, - T * atom_virial, - const T * net_deriv, - const T * in_deriv, - const T * rij, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, @@ -48,12 +48,12 @@ __global__ void deriv_wrt_neighbors_se_a(T * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -template -void ProdVirialSeAGPUExecuteFunctor::operator()(T * virial, - T * atom_virial, - const T * net_deriv, - const T * in_deriv, - const T * rij, +template +void ProdVirialSeAGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, @@ -62,8 +62,8 @@ void ProdVirialSeAGPUExecuteFunctor::operator()(T * virial, const int n_a_sel, const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(T) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(T) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; diff --git a/source/op/cuda/prod_virial_se_r.cu b/source/op/cuda/prod_virial_se_r.cu index a2c02007fc..9b8f43543f 100644 --- a/source/op/cuda/prod_virial_se_r.cu +++ b/source/op/cuda/prod_virial_se_r.cu @@ -1,24 +1,5 @@ -#include -#include -#include +#include "DeviceFunctor.h" -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// currently, double precision atomicAdd only support arch number larger than 6.0 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -33,17 +14,16 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idx -> nloc // idy -> nnei @@ -61,26 +41,26 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, if (j_idx < 0) { return; } + // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); } -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +void ProdVirialSeRGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); - const int LEN = 16; + const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 9); @@ -94,8 +74,9 @@ void ProdVirialSeRLauncher(VALUETYPE * virial, nlist, nloc, nnei, - ndescrpt, - n_a_sel, - n_a_shift + ndescrpt ); -} \ No newline at end of file +} + +template struct ProdVirialSeRGPUExecuteFunctor; +template struct ProdVirialSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index c59ba817b1..147ca687cf 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -12,12 +12,6 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("Descrpt") @@ -41,7 +35,7 @@ REGISTER_OP("Descrpt") .Output("axis: int32") .Output("rot_mat: T"); -template +template class DescrptOp : public OpKernel { public: explicit DescrptOp(OpKernelConstruction* context) : OpKernel(context) { @@ -164,18 +158,18 @@ class DescrptOp : public OpKernel { Tensor* rot_mat_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(5, rot_mat_shape, &rot_mat_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); auto axis = axis_tensor ->matrix(); - auto rot_mat = rot_mat_tensor ->matrix(); + auto rot_mat = rot_mat_tensor ->matrix(); // // check the types // int max_type_v = 0; @@ -610,5 +604,6 @@ class DescrptOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index 3dca8040aa..88f866944d 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -12,12 +12,6 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; @@ -40,7 +34,7 @@ REGISTER_OP("DescrptSeA") .Output("rij: T") .Output("nlist: int32"); -template +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -165,15 +159,15 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -358,5 +352,6 @@ class DescrptSeAOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptSeAOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc index 0069bb5375..ae5e623171 100644 --- a/source/op/descrpt_se_a_multi_device.cc +++ b/source/op/descrpt_se_a_multi_device.cc @@ -21,24 +21,6 @@ REGISTER_OP("DescrptSeA") .Output("nlist: int32"); // only sel_a and rcut_r uesd. -int get_magic_number(int const nnei) { - if (nnei <= 256) { - return 256; - } - else if (nnei <= 512) { - return 512; - } - else if (nnei <= 1024) { - return 1024; - } - else if (nnei <= 2048) { - return 2048; - } - else if (nnei <= 4096) { - return 4096; - } -} - struct DeviceFunctor { void operator()(const CPUDevice& d, std::string& device) { device = "CPU"; @@ -50,20 +32,20 @@ struct DeviceFunctor { #endif // GOOGLE_CUDA }; -template +template struct DescrptSeAFunctor { - void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #endif // GOOGLE_CUDA }; -template +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -191,9 +173,9 @@ class DescrptSeAOp : public OpKernel { memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); } - DescrptSeAFunctor()( + DescrptSeAFunctor()( context->eigen_device(), // define actually graph execution device - coord_tensor.matrix().data(), // related to the kk argument + coord_tensor.matrix().data(), // related to the kk argument type_tensor.matrix().data(), // also related to the kk argument mesh_tensor.flat().data(), ilist, @@ -201,11 +183,11 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data(), - descrpt_deriv_tensor->matrix().data(), - rij_tensor->matrix().data(), + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), nlist_tensor->matrix().data(), nloc, nall, @@ -287,6 +269,24 @@ class DescrptSeAOp : public OpKernel { } delete [] mesh_host; } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } }; // Register the CPU kernels. diff --git a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 7a8dbb1541..a4bfe341ac 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -12,12 +12,6 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("DescrptSeR") @@ -37,7 +31,7 @@ REGISTER_OP("DescrptSeR") .Output("rij: T") .Output("nlist: int32"); -template +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -156,15 +150,15 @@ class DescrptSeROp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); @@ -342,5 +336,6 @@ class DescrptSeROp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptSeROp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc deleted file mode 100644 index bf2f5b0b12..0000000000 --- a/source/op/descrpt_se_r_gpu.cc +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// sec_a kao,sec_r, - -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("DescrptSeR") - .Attr("T: {float, double}") - .Input("coord: double") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: double") - .Input("mesh: int32") - .Input("davg: double") - .Input("dstd: double") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") - .Output("nlist: int32"); - - -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -); - -template -class DescrptSeROp : public OpKernel { -public: - explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { - float nloc_f, nall_f; - OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); - cum_sum (sec, sel); - sel_null.resize(3, 0); - cum_sum (sec_null, sel_null); - ndescrpt = sec.back() * 1; - nnei = sec.back(); - fill_nei_a = true; - // count_nei_idx_overflow = 0; - // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = context->input(context_input_index++); - const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); - // set size of the sample. assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. - int nsamples = coord_tensor.shape().dim_size(0); - // - //// check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); - - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); - - // Create output tensors - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); - - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int *array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); - - // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), - type_tensor.matrix().data() + II * nall, - ilist, - jrange, - jlist, - array_int, - array_longlong, - array_double, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, - nloc, - nall, - nnei, - rcut, - rcut_smth, - ndescrpt, - sec, - fill_nei_a - ); - } - // std::cout << "done" << std::endl; - delete[] natoms; - } - -///////////////////////////////////////////////////////////////////////////////////////////// - -private: - float rcut; - float rcut_smth; - std::vector sel; - std::vector sel_null; - std::vector sec; - std::vector sec_null; - int nnei, ndescrpt, nloc, nall; - bool fill_nei_a; - - //private func - void cum_sum (std::vector & sec, const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -#define REGISTER_GPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ - DescrptSeROp); -REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc new file mode 100644 index 0000000000..c5eaff616c --- /dev/null +++ b/source/op/descrpt_se_r_multi_device.cc @@ -0,0 +1,297 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: T") + .Input("mesh: int32") + .Input("davg: T") + .Input("dstd: T") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeRFunctor { + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; + +template +class DescrptSeROp : public OpKernel { +public: + explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + sel_null.resize(3, 0); + cum_sum (sec_null, sel_null); + ndescrpt = sec.back() * 1; + nnei = sec.back(); + fill_nei_a = true; + magic_number = get_magic_number(nnei); + // count_nei_idx_overflow = 0; + // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + + // Create an output tensor + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + nloc * sec.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); + } + + DescrptSeRFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut, + rcut_smth, + sec, + fill_nei_a, + magic_number + ); + } + +///////////////////////////////////////////////////////////////////////////////////////////// + +private: + float rcut; + float rcut_smth; + std::vector sel; + std::vector sel_null; + std::vector sec; + std::vector sec_null; + int nnei, ndescrpt, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } + + int magic_number; + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + } + cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc index 3833de1f7b..22c61b7429 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -10,12 +10,6 @@ typedef double boxtensor_t ; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("EwaldRecp") @@ -30,7 +24,7 @@ REGISTER_OP("EwaldRecp") .Output("force: T") .Output("virial: T"); -template +template class EwaldRecpOp : public OpKernel { public: explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { @@ -81,12 +75,12 @@ class EwaldRecpOp : public OpKernel { Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); - auto coord = coord_tensor .flat(); - auto charge = charge_tensor .flat(); - auto box = box_tensor .flat(); - auto energy = energy_tensor ->flat(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); for (int kk = 0; kk < nsamples; ++kk){ int box_iter = kk * 9; @@ -113,19 +107,19 @@ class EwaldRecpOp : public OpKernel { else if (inter[dd] >= 1) inter[dd] -= 1.; } } - vector d_coord3 (nloc*3); + vector d_coord3 (nloc*3); for (int ii = 0; ii < nloc * 3; ++ii) { d_coord3[ii] = d_coord3_[ii]; } // set charge - vector d_charge (nloc); + vector d_charge (nloc); for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); // prepare outputs vectors - T d_ener; - vector d_force(nloc*3); - vector d_virial(9); + FPTYPE d_ener; + vector d_force(nloc*3); + vector d_virial(9); // compute EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); @@ -141,11 +135,12 @@ class EwaldRecpOp : public OpKernel { } } private: - EwaldParameters ep; + EwaldParameters ep; }; #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("EwaldRecp").Device(DEVICE_CPU).TypeConstraint("T"), \ EwaldRecpOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/gelu.cc b/source/op/gelu.cc index 26c53c8511..7012438db9 100644 --- a/source/op/gelu.cc +++ b/source/op/gelu.cc @@ -2,11 +2,11 @@ #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/shape_inference.h" -#define SQRT_2_PI 0.7978845608028654 using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; +#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") @@ -26,43 +26,68 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -template +#if GOOGLE_CUDA +// maybe instead use cudnn activation forward +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +#endif // GOOGLE_CUDa + +template struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]))); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluLauncher(in, out, size); + } + #endif }; -template +template struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradLauncher(dy, in, out, size); + } + #endif }; -template +template struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); + } + #endif }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -70,26 +95,25 @@ class GeluOp : public OpKernel { void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& x = context->input(0); - Tensor * output = NULL; int context_output_index = 0; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, x.shape(), &output)); - GeluFunctor()( + GeluFunctor()( context->eigen_device(), - x.flat().data(), - output->flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -105,20 +129,20 @@ class GeluGradOp : public OpKernel { x.shape(), &output)); - GeluGradFunctor()( + GeluGradFunctor()( context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -135,12 +159,12 @@ class GeluGradGradOp : public OpKernel { x.shape(), &output)); - GeluGradGradFunctor()( + GeluGradGradFunctor()( context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); @@ -148,17 +172,35 @@ class GeluGradGradOp : public OpKernel { }; #define REGISTER_CPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_CPU(float); - REGISTER_CPU(double); \ No newline at end of file +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc deleted file mode 100644 index 34d4183f98..0000000000 --- a/source/op/gelu_gpu.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("Gelu") - .Attr("T: {float, double}") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGradGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("dy_: T") - .Input("x: T") - .Output("output: T"); - -// maybe instead use cudnn activation forward -void GeluLauncher(const float * in, float * out, int const size); -void GeluLauncher(const double * in, double * out, int const size); - -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); - -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); - -template -struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { - GeluLauncher(in, out, size); - } -}; - -template -struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { - GeluGradLauncher(dy, in, out, size); - } -}; - -template -struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradLauncher(dy, dy_, in, out, size); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluOp : public OpKernel { - public : - explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& x = context->input(0); - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluFunctor()( - context->eigen_device(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradOp : public OpKernel { - public : - explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& x = context->input(1); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradFunctor()( - context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradGradOp : public OpKernel { - public : - explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& dy_ = context->input(1); - const Tensor& x = context->input(2); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradGradFunctor()( - context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -#define REGISTER_GPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_GPU(float); - REGISTER_GPU(double); diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index 2a5656e6d7..f84c9c0f9f 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -19,45 +19,45 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -template +template struct GeluFunctor { - void operator()(const CPUDevice& d, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { GeluCPULauncher(in, out, size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * in, T * out, int const size) { + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { GeluGPULauncher(in, out, size); } #endif }; -template +template struct GeluGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradCPULauncher(dy, in, out, size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradGPULauncher(dy, in, out, size); } #endif }; -template +template struct GeluGradGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradGradCPULauncher(dy, dy_, in, out, size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradGradGPULauncher(dy, dy_, in, out, size); } #endif }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -71,18 +71,18 @@ class GeluOp : public OpKernel { x.shape(), &output)); - GeluFunctor()( + GeluFunctor()( context->eigen_device(), - x.flat().data(), - output->flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -98,19 +98,19 @@ class GeluGradOp : public OpKernel { x.shape(), &output)); - GeluGradFunctor()( + GeluGradFunctor()( context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -127,12 +127,12 @@ class GeluGradGradOp : public OpKernel { x.shape(), &output)); - GeluGradGradFunctor()( + GeluGradGradFunctor()( context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); } diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index e1c6c362c8..7ba99b6e81 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForce") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -28,7 +22,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceOp : public OpKernel { public: explicit ProdForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -79,11 +73,11 @@ class ProdForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); // loop over samples #pragma omp parallel for @@ -173,6 +167,7 @@ class ProdForceOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdForce").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc index 48f55a977b..2c8e62550c 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -26,7 +20,7 @@ REGISTER_OP("ProdForceGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceGradOp : public OpKernel { public: @@ -88,12 +82,12 @@ class ProdForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -185,4 +179,5 @@ class ProdForceGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc index c762bf71fd..1b5053377c 100644 --- a/source/op/prod_force_se_a.cc +++ b/source/op/prod_force_se_a.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeA") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -28,7 +22,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -template +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -155,5 +149,6 @@ class ProdForceSeAOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeAOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index d884782f17..884e46f9a8 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeAGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -25,7 +19,7 @@ REGISTER_OP("ProdForceSeAGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceSeAGradOp : public OpKernel { public: @@ -83,11 +77,11 @@ class ProdForceSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -154,4 +148,5 @@ class ProdForceSeAGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeAGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a_multi_device.cc b/source/op/prod_force_se_a_multi_device.cc index a864617208..87a3ae3ecc 100644 --- a/source/op/prod_force_se_a_multi_device.cc +++ b/source/op/prod_force_se_a_multi_device.cc @@ -11,19 +11,19 @@ REGISTER_OP("ProdForceSeA") .Attr("n_r_sel: int") .Output("force: T"); -template +template struct ProdForceSeAFunctor { - void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const CPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const GPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdForceSeAGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA }; -template +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -72,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -87,11 +87,11 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - ProdForceSeAFunctor()( + ProdForceSeAFunctor()( context->eigen_device(), - force_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc index a8e5b69162..4347442167 100644 --- a/source/op/prod_force_se_r.cc +++ b/source/op/prod_force_se_r.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeR") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -24,7 +18,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -69,10 +63,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -127,6 +121,7 @@ class ProdForceSeROp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeROp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 488492f699..dfe9a5ff98 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeRGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -23,7 +17,7 @@ REGISTER_OP("ProdForceSeRGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceSeRGradOp : public OpKernel { public: @@ -77,11 +71,11 @@ class ProdForceSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -129,4 +123,5 @@ class ProdForceSeRGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeRGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_multi_device.cc similarity index 55% rename from source/op/prod_force_se_r_gpu.cc rename to source/op/prod_force_se_r_multi_device.cc index 8f6ee24910..0a97d17742 100644 --- a/source/op/prod_force_se_r_gpu.cc +++ b/source/op/prod_force_se_r_multi_device.cc @@ -1,26 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" REGISTER_OP("ProdForceSeR") .Attr("T: {float, double}") @@ -30,26 +9,22 @@ REGISTER_OP("ProdForceSeR") .Input("natoms: int32") .Output("force: T"); -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); +template +struct ProdForceSeRFunctor { + void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRCPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; template class ProdForceSeROp : public OpKernel { public: - explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_force_se_r_gpu.cc" << std::endl; - } + explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { // Grab the input tensor @@ -66,8 +41,7 @@ class ProdForceSeROp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape().dim_size(0); @@ -79,11 +53,9 @@ class ProdForceSeROp : public OpKernel { OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - // OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); - // OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); // Create an output tensor - TensorShape force_shape; + TensorShape force_shape ; force_shape.AddDim (nframes); force_shape.AddDim (3 * nall); Tensor* force_tensor = NULL; @@ -92,10 +64,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -107,28 +79,33 @@ class ProdForceSeROp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - for (int II = 0; II < nframes; II++) { - ProdForceSeRLauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdForceSeRFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); } -private: - int n_r_sel, n_a_sel, n_a_shift; }; // Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ - ProdForceSeROp); -REGISTER_GPU(VALUETYPE); \ No newline at end of file + Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index 8f607e0cf0..f42a5055c2 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirial") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -30,7 +24,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialOp : public OpKernel { public: explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -89,13 +83,13 @@ class ProdVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -133,10 +127,10 @@ class ProdVirialOp : public OpKernel { if (j_idx < 0) continue; if (jj == axis_0) { for (int aa = 0; aa < ndescrpt; ++aa){ - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -145,10 +139,10 @@ class ProdVirialOp : public OpKernel { } else if (jj == axis_1) { for (int aa = 0; aa < ndescrpt; ++aa){ - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -159,10 +153,10 @@ class ProdVirialOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -194,6 +188,7 @@ class ProdVirialOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index 3d8c7e4639..5d75f5f649 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirialGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -27,7 +21,7 @@ REGISTER_OP("ProdVirialGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialGradOp : public OpKernel { public: @@ -94,13 +88,13 @@ class ProdVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -194,4 +188,5 @@ class ProdVirialGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index d975913d88..ba934fa54e 100644 --- a/source/op/prod_virial_se_a.cc +++ b/source/op/prod_virial_se_a.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("ProdVirialSeA") .Attr("T: {float, double}") @@ -29,7 +24,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -template +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -85,12 +80,12 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -121,10 +116,10 @@ class ProdVirialSeAOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -156,7 +151,8 @@ class ProdVirialSeAOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeAOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 0f506fc51a..ebbd857bf0 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirialSeAGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -26,7 +20,7 @@ REGISTER_OP("ProdVirialSeAGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialSeAGradOp : public OpKernel { public: @@ -89,12 +83,12 @@ class ProdVirialSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -158,4 +152,5 @@ class ProdVirialSeAGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeAGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_multi_device.cc b/source/op/prod_virial_se_a_multi_device.cc index 21cd78c83c..7929fbd588 100644 --- a/source/op/prod_virial_se_a_multi_device.cc +++ b/source/op/prod_virial_se_a_multi_device.cc @@ -13,19 +13,19 @@ REGISTER_OP("ProdVirialSeA") .Output("virial: T") .Output("atom_virial: T"); -template +template struct ProdVirialSeAFunctor { - void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const CPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const GPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdVirialSeAGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA }; -template +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -80,20 +80,20 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); - ProdVirialSeAFunctor()( + ProdVirialSeAFunctor()( context->eigen_device(), - virial_tensor->flat().data(), - atom_virial_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), - rij_tensor.flat().data(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index 63b74398ea..5337246c21 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("ProdVirialSeR") .Attr("T: {float, double}") @@ -26,7 +21,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,12 +73,12 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -110,10 +105,10 @@ class ProdVirialSeROp : public OpKernel { for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (nlist_iter + i_idx * nnei + jj); if (j_idx < 0) continue; - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -128,7 +123,8 @@ class ProdVirialSeROp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeROp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index b125c9d783..61b8970e71 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirialSeRGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -24,7 +18,7 @@ REGISTER_OP("ProdVirialSeRGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialSeRGradOp : public OpKernel { public: @@ -83,12 +77,12 @@ class ProdVirialSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -133,4 +127,5 @@ class ProdVirialSeRGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeRGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_multi_device.cc similarity index 53% rename from source/op/prod_virial_se_r_gpu.cc rename to source/op/prod_virial_se_r_multi_device.cc index 6324bcf88d..61e37eb215 100644 --- a/source/op/prod_virial_se_r_gpu.cc +++ b/source/op/prod_virial_se_r_multi_device.cc @@ -1,14 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" REGISTER_OP("ProdVirialSeR") .Attr("T: {float, double}") @@ -20,40 +11,22 @@ REGISTER_OP("ProdVirialSeR") .Output("virial: T") .Output("atom_virial: T"); -using namespace tensorflow; - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); +template +struct ProdVirialSeRFunctor { + void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRCPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); } -} - -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; template class ProdVirialSeROp : public OpKernel { public: - explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_virial_se_r_gpu.cc" << std::endl; - } + explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { // Grab the input tensor @@ -63,16 +36,16 @@ class ProdVirialSeROp : public OpKernel { const Tensor& rij_tensor = context->input(context_input_index++); const Tensor& nlist_tensor = context->input(context_input_index++); const Tensor& natoms_tensor = context->input(context_input_index++); + // set size of the sample OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - cudaErrcheck(cudaDeviceSynchronize()); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nnei = nlist_tensor.shape().dim_size(1) / nloc; @@ -85,52 +58,57 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); // Create an output tensor - TensorShape virial_shape; + TensorShape virial_shape ; virial_shape.AddDim (nframes); virial_shape.AddDim (9); Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(0, virial_shape, &virial_tensor)); - TensorShape atom_virial_shape ; + TensorShape atom_virial_shape; atom_virial_shape.AddDim (nframes); atom_virial_shape.AddDim (9 * nall); Tensor* atom_virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); - - for (int II = 0; II < nframes; II++) { - ProdVirialSeRLauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + ProdVirialSeRFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); } -private: - int n_r_sel, n_a_sel, n_a_shift; }; +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. +#if GOOGLE_CUDA #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ - ProdVirialSeROp); -REGISTER_GPU(VALUETYPE); \ No newline at end of file + Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index 17ab6bc1f5..6f3bc58932 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -8,11 +8,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("SoftMinSwitch") .Attr("T: {float, double}") @@ -32,7 +27,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinSwitchOp : public OpKernel { public: explicit SoftMinSwitchOp(OpKernelConstruction* context) : OpKernel(context) { @@ -95,10 +90,10 @@ class SoftMinSwitchOp : public OpKernel { // flat the tensors auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto sw_value = sw_value_tensor ->matrix(); - auto sw_deriv = sw_deriv_tensor ->matrix(); + auto sw_value = sw_value_tensor ->matrix(); + auto sw_deriv = sw_deriv_tensor ->matrix(); // loop over samples #pragma omp parallel for @@ -115,26 +110,26 @@ class SoftMinSwitchOp : public OpKernel { // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; - T aa = 0; - T bb = 0; + FPTYPE aa = 0; + FPTYPE bb = 0; for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (i_idx * nnei + jj) * 3; - T dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - T rr = sqrt(rr2); - T ee = exp(-rr / alpha); + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); aa += ee; bb += rr * ee; } - T smin = bb / aa; - T vv, dd; - spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); + FPTYPE smin = bb / aa; + FPTYPE vv, dd; + spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); // value of switch sw_value(kk, i_idx) = vv; // deriv of switch distributed as force @@ -142,17 +137,17 @@ class SoftMinSwitchOp : public OpKernel { int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (ii * nnei + jj) * 3; - T dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - T rr = sqrt(rr2); - T ee = exp(-rr / alpha); - T pref_c = (1./rr - 1./alpha) * ee ; - T pref_d = 1./(rr * alpha) * ee; - T ts; + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); + FPTYPE pref_c = (1./rr - 1./alpha) * ee ; + FPTYPE pref_d = 1./(rr * alpha) * ee; + FPTYPE ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv(kk, rij_idx_shift + 0) += ts * dr[0]; sw_deriv(kk, rij_idx_shift + 1) += ts * dr[1]; @@ -190,6 +185,7 @@ class SoftMinSwitchOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("SoftMinSwitch").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinSwitchOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index 43344e7352..ffd6442a1a 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("SoftMinForce") .Attr("T: {float, double}") .Input("du: T") @@ -22,12 +16,11 @@ REGISTER_OP("SoftMinForce") .Attr("n_r_sel: int") .Output("force: T"); - using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinForceOp : public OpKernel { public: explicit SoftMinForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -72,10 +65,10 @@ class SoftMinForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto force = force_tensor->matrix(); + auto force = force_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -117,4 +110,5 @@ class SoftMinForceOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("SoftMinForce").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinForceOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index bfaa8aca76..2461828853 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("SoftMinForceGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -25,7 +19,7 @@ REGISTER_OP("SoftMinForceGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinForceGradOp : public OpKernel { public: @@ -81,11 +75,11 @@ class SoftMinForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -122,4 +116,5 @@ class SoftMinForceGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("SoftMinForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinForceGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 7829c4b7ed..f369808685 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("SoftMinVirial") .Attr("T: {float, double}") .Input("du: T") @@ -28,7 +22,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinVirialOp : public OpKernel { public: explicit SoftMinVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -83,12 +77,12 @@ class SoftMinVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); - auto rij = rij_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); + auto rij = rij_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto virial = virial_tensor->matrix(); - auto atom_virial = atom_virial_tensor->matrix(); + auto virial = virial_tensor->matrix(); + auto atom_virial = atom_virial_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -111,7 +105,7 @@ class SoftMinVirialOp : public OpKernel { int rij_idx_shift = (ii * nnei + jj) * 3; for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); + FPTYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); virial(kk, dd0 * 3 + dd1) -= tmp_v; atom_virial(kk, j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -129,6 +123,7 @@ class SoftMinVirialOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("SoftMinVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinVirialOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index b454612895..0bafaa5bfa 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("SoftMinVirialGrad") .Attr("T: {float, double}") @@ -26,7 +21,7 @@ REGISTER_OP("SoftMinVirialGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinVirialGradOp : public OpKernel { public: @@ -88,12 +83,12 @@ class SoftMinVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -144,4 +139,5 @@ class SoftMinVirialGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("SoftMinVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinVirialGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tab_inter.cc b/source/op/tab_inter.cc index 5e86608e9a..63c7af210c 100644 --- a/source/op/tab_inter.cc +++ b/source/op/tab_inter.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("TabInter") .Attr("T: {float, double}") @@ -73,7 +68,7 @@ void tabulated_inter (double & ener, fscale *= -hi; } -template +template class TabInterOp : public OpKernel { public: explicit TabInterOp(OpKernelConstruction* context) : OpKernel(context) { @@ -144,15 +139,15 @@ class TabInterOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(tmp_idx++, virial_shape, &virial_tensor)); // flat the tensors - auto table_info = table_info_tensor.flat(); - auto table_data = table_data_tensor.flat(); + auto table_info = table_info_tensor.flat(); + auto table_data = table_data_tensor.flat(); auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto scale = scale_tensor .matrix(); - auto energy = energy_tensor ->matrix(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto scale = scale_tensor .matrix(); + auto energy = energy_tensor ->matrix(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(table_info(3)+0.1)), errors::InvalidArgument ("ntypes provided in table does not match deeppot")); int nspline = table_info(2)+0.1; @@ -191,7 +186,7 @@ class TabInterOp : public OpKernel { for (int tt = 0; tt < ntypes; ++tt) { for (int ii = 0; ii < natoms(2+tt); ++ii){ int i_type = type(kk, i_idx); - T i_scale = scale(kk, i_idx); + FPTYPE i_scale = scale(kk, i_idx); assert(i_type == tt) ; int jiter = 0; // a neighbor @@ -311,7 +306,8 @@ class TabInterOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("TabInter").Device(DEVICE_CPU).TypeConstraint("T"), \ TabInterOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double);