From 01315f6467fd80ce36f116d1d26ec5451df7bef8 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <1138669+ohearnk@users.noreply.github.com> Date: Tue, 20 Aug 2024 14:29:27 -0400 Subject: [PATCH] Remove improper legacy atomic support for double precision arithmetic and replace it with emulation at full double precision for pre-Pascal NVIDIA GPUs (previously toggled via USE_LEGACY_ATOMICS). Note that the old code led to slow and possibly failing SCF convergence, which was exposed only during testing with tighter density matrix convergence thresholds and integral cutoffs. This is likely due to the truncation used for energy and gradient calculations (1e-6 and 1e-12, respectively). --- quick-cmake/QUICKCudaConfig.cmake | 21 - src/gpu/cuda/gpu.cu | 85 --- src/gpu/cuda/gpu.h | 10 - src/gpu/cuda/gpu_MP2.cu | 16 +- src/gpu/cuda/gpu_cew_quad.h | 317 ++++----- src/gpu/cuda/gpu_get2e.cu | 4 - src/gpu/cuda/gpu_get2e_getxc_drivers.h | 681 ++++-------------- src/gpu/cuda/gpu_get2e_grad_ffff.cu | 570 ++++++++------- src/gpu/cuda/gpu_get2e_grad_ffff.cuh | 23 - src/gpu/cuda/gpu_getxc.cu | 84 +-- src/gpu/cuda/gpu_getxc.h | 919 ++++++++++++------------- src/gpu/cuda/gpu_oei.h | 8 - src/gpu/cuda/gpu_type.h | 112 ++- src/gpu/gpu_common.h | 39 +- src/gpu/gpu_get2e_subs.h | 184 +---- src/gpu/gpu_get2e_subs_grad.h | 115 +--- src/gpu/gpu_lri_subs.h | 8 +- src/gpu/gpu_lri_subs_grad.h | 51 -- src/gpu/gpu_oei_grad.h | 45 +- src/gpu/hip/gpu.cu | 86 --- src/gpu/hip/gpu.h | 10 - src/gpu/hip/gpu_MP2.cu | 16 +- src/gpu/hip/gpu_cew_quad.h | 317 ++++----- src/gpu/hip/gpu_get2e.cu | 4 - src/gpu/hip/gpu_get2e_getxc_drivers.h | 681 ++++-------------- src/gpu/hip/gpu_get2e_grad_ffff.cu | 570 ++++++++------- src/gpu/hip/gpu_get2e_grad_ffff.cuh | 23 - src/gpu/hip/gpu_getxc.cu | 84 +-- src/gpu/hip/gpu_getxc.h | 919 ++++++++++++------------- src/gpu/hip/gpu_oei.h | 8 - src/gpu/hip/gpu_type.h | 112 ++- 31 files changed, 2112 insertions(+), 4010 deletions(-) diff --git a/quick-cmake/QUICKCudaConfig.cmake b/quick-cmake/QUICKCudaConfig.cmake index 89061d1f4..4dae1cb11 100644 --- a/quick-cmake/QUICKCudaConfig.cmake +++ b/quick-cmake/QUICKCudaConfig.cmake @@ -61,50 +61,42 @@ if(CUDA) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2 and SM5.3") message(STATUS "BE AWARE: CUDA 7.5 does not support GTX-1080, Titan-XP, DGX-1, V100 or other Pascal/Volta based GPUs.") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif(${CUDA_VERSION} VERSION_EQUAL 8.0) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0 and SM6.1,") message(STATUS "BE AWARE: CUDA 8.0 does not support V100, GV100, Titan-V or later GPUs") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 9.0) AND (${CUDA_VERSION} VERSION_LESS 10.0)) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1 and SM7.0") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 10.0) AND (${CUDA_VERSION} VERSION_LESS
11.0)) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0 and SM7.5") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_EQUAL 11.0)) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5 and SM8.0") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 11.1) AND (${CUDA_VERSION} VERSION_LESS_EQUAL 11.7)) message(STATUS "Configuring QUICK for SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0 and SM8.6") list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_EQUAL 11.8)) message(STATUS "Configuring QUICK for SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0, SM8.6, SM8.9 and SM9.0") list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS} ${SM89FLAGS} ${SM90FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 12.0) AND (${CUDA_VERSION} VERSION_LESS 12.5)) message(STATUS "Configuring QUICK for SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0, SM8.6, SM8.9 and SM9.0") list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS} ${SM89FLAGS} ${SM90FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) else() @@ -118,7 +110,6 @@ if(CUDA) if("${QUICK_USER_ARCH}" MATCHES "kepler") message(STATUS "Configuring QUICK for SM3.5") list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) set(FOUND "TRUE") endif() @@ -126,7 +117,6 @@ if(CUDA) if("${QUICK_USER_ARCH}" MATCHES "maxwell") message(STATUS "Configuring QUICK for SM5.0") list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS}) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) set(FOUND "TRUE") endif() @@ -280,10 +270,6 @@ if(CUDA) if(DISABLE_OPTIMIZER_CONSTANTS) set(CUDA_DEVICE_CODE_FLAGS -Xptxas --disable-optimizer-constants) endif() - - if(USE_LEGACY_ATOMICS) - list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) - endif() if(NOT INSIDE_AMBER) # -------------------------------------------------------------------- @@ -328,16 +314,10 @@ if(HIP) # add_compile_definitions(QUICK_PLATFORM_AMD_WARP64) # endif() - # HIP codes currently do not support f-functions with -DUSE_LEGACY_ATOMICS targets (gfx906 and gfx908) - if(ENABLEF AND (("${QUICK_USER_ARCH}" STREQUAL "") OR ("${QUICK_USER_ARCH}" MATCHES "gfx906") OR ("${QUICK_USER_ARCH}" MATCHES "gfx908"))) - message(FATAL_ERROR "Error: Unsupported HIP options (ENABLEF with -DUSE_LEGACY_ATOMICS). 
${PROJECT_NAME} support for f-functions requires newer HIP architecture targets not using LEGACY_ATOMICS. Please specify architectures with QUICK_USER_ARCH not needing LEGACY_ATOMICS (post-gfx908) or disable f-function support.") - endif() - if( NOT "${QUICK_USER_ARCH}" STREQUAL "") set(FOUND "FALSE") if("${QUICK_USER_ARCH}" MATCHES "gfx908") message(STATUS "Configuring QUICK for gfx908") - list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS) set(FOUND "TRUE") endif() @@ -351,7 +331,6 @@ if(HIP) message(FATAL_ERROR "Invalid value for QUICK_USER_ARCH. Possible values are gfx908, gfx90a.") endif() else() - list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS) set(QUICK_USER_ARCH "gfx908") message(STATUS "AMD GPU architecture not specified. Code will be optimized for gfx908.") endif() diff --git a/src/gpu/cuda/gpu.cu b/src/gpu/cuda/gpu.cu index 015f08554..19e99a513 100644 --- a/src/gpu/cuda/gpu.cu +++ b/src/gpu/cuda/gpu.cu @@ -1378,36 +1378,8 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou gpu->gpu_calculated->o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); gpu->gpu_calculated->dense = new gpu_buffer_type(dense, gpu->nbasis, gpu->nbasis); -#ifdef USE_LEGACY_ATOMICS - gpu->gpu_calculated->o->DeleteGPU(); - gpu->gpu_calculated->oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - gpu->gpu_calculated->oULL->Upload(); - gpu->gpu_sim.oULL = gpu->gpu_calculated->oULL->_devData; -#else gpu->gpu_calculated->o->Upload(); gpu->gpu_sim.o = gpu->gpu_calculated->o->_devData; -#endif - - /* - oULL is the unsigned long long int type of O matrix. The reason to do so is because - Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type, - an comprimise way is to multiple a very large number (OSCALE), first and divided it - after atomic operator. - */ - /* - for (int i = 0; inbasis; i++) { - for (int j = 0; jnbasis; j++) { - QUICKULL valUII = (QUICKULL) (fabs ( LOC2( gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis)*OSCALE + (QUICKDouble)0.5)); - - if (LOC2( gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis)<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - - LOC2( gpu->gpu_calculated->oULL->_hostData, i, j, gpu->nbasis, gpu->nbasis) = valUII; - } - } - */ gpu->gpu_calculated->dense->Upload(); gpu->gpu_sim.dense = gpu->gpu_calculated->dense->_devData; @@ -1443,34 +1415,8 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb gpu->gpu_calculated->ob = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -#ifdef USE_LEGACY_ATOMICS - gpu->gpu_calculated->ob->DeleteGPU(); - gpu->gpu_calculated->obULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - gpu->gpu_calculated->obULL->Upload(); - gpu->gpu_sim.obULL = gpu->gpu_calculated->obULL->_devData; -#else gpu->gpu_calculated->ob->Upload(); gpu->gpu_sim.ob = gpu->gpu_calculated->ob->_devData; -#endif - - /* - obULL is the unsigned long long int type of Ob matrix. The reason to do so is because - Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type, - an comprimise way is to multiple a very large number (OSCALE), first and divided it - after atomic operator. 
- */ - /*for (int i = 0; inbasis; i++) { - for (int j = 0; jnbasis; j++) { - QUICKULL valUII = (QUICKULL) (fabs ( LOC2( gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis)*OSCALE + (QUICKDouble)0.5)); - - if (LOC2( gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis)<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - - LOC2( gpu->gpu_calculated->obULL->_hostData, i, j, gpu->nbasis, gpu->nbasis) = valUII; - } - }*/ gpu_upload_beta_density_matrix_(denseb); @@ -1883,12 +1829,6 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff) gpu->grad = new gpu_buffer_type(3 * gpu->natom); -#ifdef USE_LEGACY_ATOMICS - gpu->gradULL = new gpu_buffer_type(3 * gpu->natom); - gpu->gpu_sim.gradULL = gpu->gradULL->_devData; - gpu->gradULL->Upload(); -#endif - //gpu->grad->DeleteGPU(); gpu->gpu_sim.grad = gpu->grad->_devData; gpu->grad->Upload(); @@ -2910,26 +2850,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) PRINTDEBUG("COMPLETE KERNEL") -#ifdef USE_LEGACY_ATOMICS - gpu->gpu_calculated->oULL->Download(); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } -#else gpu->gpu_calculated->o->Download(); for (int i = 0; i< gpu->nbasis; i++) { @@ -2938,7 +2858,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } -#endif gpu->gpu_calculated->o->Download(o); #ifdef DEBUG @@ -2960,10 +2879,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) delete gpu->gpu_cutoff->YCutoff; delete gpu->gpu_cutoff->cutPrim; -#ifdef USE_LEGACY_ATOMICS - delete gpu->gpu_calculated->oULL; -#endif - PRINTDEBUG("COMPLETE RUNNING ADDINT") } diff --git a/src/gpu/cuda/gpu.h b/src/gpu/cuda/gpu.h index dcccde5ef..6525f175e 100644 --- a/src/gpu/cuda/gpu.h +++ b/src/gpu/cuda/gpu.h @@ -293,13 +293,8 @@ void bind_eri_texture(_gpu_type gpu); void unbind_eri_texture(); //__device__ void gpu_shell(unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL); -#ifdef USE_LEGACY_ATOMICS -__device__ void addint(QUICKULL* oULL, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); -__device__ __forceinline__ void addint_oshell(QUICKULL* oULL, QUICKULL* obULL,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis); -#else __device__ void addint(QUICKDouble* o, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); __device__ __forceinline__ void addint_oshell(QUICKDouble* o, QUICKDouble* ob,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis); -#endif __device__ __forceinline__ void addint_lri(QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); __device__ void FmT_sp(const int MaxM, const QUICKDouble X, QUICKDouble* 
vals); __device__ void FmT_spd(const int MaxM, const QUICKDouble X, QUICKDouble* vals); @@ -621,13 +616,8 @@ __device__ int lefthrr_lri23(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz, int KLMNAx, int KLMNAy, int KLMNAz, int KLMNBx, int KLMNBy, int KLMNBz, int IJTYPE,QUICKDouble* coefAngularL, unsigned char* angularL); -#ifdef USE_LEGACY_ATOMICS -__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKULL* smemGrad, int iparent, int gid); -__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKULL* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom); -#else __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid); __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom); -#endif __device__ QUICKDouble get_unnormalized_weight(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int iatm); __device__ QUICKDouble SSW( QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int atm); diff --git a/src/gpu/cuda/gpu_MP2.cu b/src/gpu/cuda/gpu_MP2.cu index ea0ac6d5d..e98c63485 100644 --- a/src/gpu/cuda/gpu_MP2.cu +++ b/src/gpu/cuda/gpu_MP2.cu @@ -378,7 +378,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5); if (val1d < (QUICKDouble) 0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); + atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); // } // ATOMIC ADD VALUE 2 @@ -393,7 +393,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5); if (val2d < (QUICKDouble) 0.0) val2 = 0ull - val2; - QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); + atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); // } } @@ -406,7 +406,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned } if (DENSELJ * Y < (QUICKDouble) 0.0) val3 = 0ull - val3; - QUICKADD(LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3); + atomicAdd(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3); //} // ATOMIC ADD VALUE 4 @@ -415,7 +415,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned // if (abs(val4d) > devSim_MP2.integralCutoff) { QUICKULL val4 = (QUICKULL) (fabs(val4d * OSCALE) + (QUICKDouble) 0.5); if (val4d < (QUICKDouble) 0.0) val4 = 0ull - val4; - QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4); + atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4); //} } @@ -428,13 +428,13 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned if ((III != JJJ && III < KKK) || (III == JJJ && III == KKK && III < LLL) || (III == KKK && III < JJJ && JJJ < LLL)) { - QUICKADD(LOC2(devSim_MP2.oULL, 
MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); } // ATOMIC ADD VALUE 5 - 2 if (III != JJJ && JJJ == KKK) { - QUICKADD(LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); } //} @@ -449,12 +449,12 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned if (val6d < (QUICKDouble) 0.0) val6 = 0ull - val6; - QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); // ATOMIC ADD VALUE 6 - 2 if (JJJ == LLL && III != KKK) { - QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); } } diff --git a/src/gpu/cuda/gpu_cew_quad.h b/src/gpu/cuda/gpu_cew_quad.h index c1c9af579..a0e1276c7 100644 --- a/src/gpu/cuda/gpu_cew_quad.h +++ b/src/gpu/cuda/gpu_cew_quad.h @@ -1,52 +1,47 @@ /* - !---------------------------------------------------------------------! - ! Written by Madu Manathunga on 09/29/2021 ! - ! ! - ! Copyright (C) 2020-2021 Merz lab ! - ! Copyright (C) 2020-2021 Götz lab ! - ! ! - ! This Source Code Form is subject to the terms of the Mozilla Public ! - ! License, v. 2.0. If a copy of the MPL was not distributed with this ! - ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! - !_____________________________________________________________________! - - !---------------------------------------------------------------------! - ! This source file contains preprocessable functions required for ! - ! QUICK GPU version. ! - !---------------------------------------------------------------------! -*/ + !---------------------------------------------------------------------! + ! Written by Madu Manathunga on 09/29/2021 ! + ! ! + ! Copyright (C) 2020-2021 Merz lab ! + ! Copyright (C) 2020-2021 Götz lab ! + ! ! + ! This Source Code Form is subject to the terms of the Mozilla Public ! + ! License, v. 2.0. If a copy of the MPL was not distributed with this ! + ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! + !_____________________________________________________________________! + + !---------------------------------------------------------------------! + ! This source file contains preprocessable functions required for ! + ! QUICK GPU version. ! + !---------------------------------------------------------------------! 
+ */ #ifdef CEW #include "iface.hpp" -#ifndef OSHELL -void getcew_quad(_gpu_type gpu){ +#ifndef OSHELL +void getcew_quad(_gpu_type gpu) { QUICK_SAFE_CALL((getcew_quad_kernel<<< gpu -> blocks, gpu -> xc_threadsPerBlock>>>())); cudaDeviceSynchronize(); } -void getcew_quad_grad(_gpu_type gpu){ - - if(gpu -> gpu_sim.is_oshell == true){ - +void getcew_quad_grad(_gpu_type gpu) { + if(gpu -> gpu_sim.is_oshell == true) { QUICK_SAFE_CALL((get_oshell_density_kernel<<blocks, gpu->xc_threadsPerBlock>>>())); cudaDeviceSynchronize(); QUICK_SAFE_CALL((oshell_getcew_quad_grad_kernel<<< gpu -> blocks, gpu -> xc_threadsPerBlock, gpu -> gpu_xcq -> smem_size>>>())); - - }else{ - + } else { QUICK_SAFE_CALL((get_cshell_density_kernel<<blocks, gpu->xc_threadsPerBlock>>>())); cudaDeviceSynchronize(); QUICK_SAFE_CALL((cshell_getcew_quad_grad_kernel<<< gpu -> blocks, gpu -> xc_threadsPerBlock, gpu -> gpu_xcq -> smem_size>>>())); //QUICK_SAFE_CALL((cshell_getcew_quad_grad_kernel<<< 1,1, gpu -> gpu_xcq -> smem_size>>>())); - } cudaDeviceSynchronize(); @@ -62,27 +57,25 @@ void getcew_quad_grad(_gpu_type gpu){ cudaDeviceSynchronize(); gpu_delete_sswgrad_vars(); - } -void get_cew_accdens(_gpu_type gpu){ - - QUICKDouble *gridpt = new QUICKDouble[3]; - QUICKDouble *cewGrad= new QUICKDouble[3]; +void get_cew_accdens(_gpu_type gpu) { + QUICKDouble *gridpt = new QUICKDouble[3]; + QUICKDouble *cewGrad= new QUICKDouble[3]; gpu -> gpu_xcq -> densa -> Download(); - if(gpu -> gpu_sim.is_oshell == true) gpu -> gpu_xcq -> densb -> Download(); - + if(gpu -> gpu_sim.is_oshell == true) + gpu -> gpu_xcq -> densb -> Download(); - for(int i=0; i< gpu -> gpu_xcq -> npoints;i++){ - + for(int i=0; i< gpu -> gpu_xcq -> npoints;i++) { QUICKDouble weight = gpu -> gpu_xcq -> weight -> _hostData[i]; QUICKDouble densea = gpu -> gpu_xcq -> densa -> _hostData[i]; QUICKDouble denseb = densea; - if(gpu -> gpu_sim.is_oshell == true) denseb = gpu -> gpu_xcq -> densb -> _hostData[i]; + if(gpu -> gpu_sim.is_oshell == true) + denseb = gpu -> gpu_xcq -> densb -> _hostData[i]; gridpt[0] = gpu -> gpu_xcq -> gridx -> _hostData[i]; gridpt[1] = gpu -> gpu_xcq -> gridy -> _hostData[i]; @@ -90,228 +83,178 @@ void get_cew_accdens(_gpu_type gpu){ const QUICKDouble charge_density = -weight * (densea+denseb); - for(int j=0; j<3; j++) cewGrad[j]=0.0; + for(int j=0; j<3; j++) + cewGrad[j]=0.0; QUICKDouble const *cnst_gridpt = gridpt; - // this function comes from cew library in amber + // this function comes from cew library in amber cew_accdensatpt_(cnst_gridpt, &charge_density, cewGrad); -//printf("cew_accdensatpt %f %f %f %f %f %f %f \n", gridpt[0], gridpt[1], gridpt[2], charge_density\ -,cewGrad[0], cewGrad[1], cewGrad[2]); + //printf("cew_accdensatpt %f %f %f %f %f %f %f \n", gridpt[0], gridpt[1], gridpt[2], charge_density\ + ,cewGrad[0], cewGrad[1], cewGrad[2]); int Istart = (gpu -> gpu_xcq -> gatm -> _hostData[i]-1) * 3; for(int j=0; j<3; j++) -#ifdef USE_LEGACY_ATOMICS - gpu->grad->_hostData[Istart+j] += cewGrad[j]; -#else gpu -> cew_grad->_hostData[Istart+j] += cewGrad[j]; -#endif - } delete gridpt; delete cewGrad; - } - - __global__ void getcew_quad_kernel() { - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { - - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int 
totalThreads = blockDim.x*gridDim.x; - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; - for (int i = bfloc_st; i< bfloc_end; ++i) { + QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; + for (int i = bfloc_st; i< bfloc_end; ++i) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { - for (int j = bfloc_st; j < bfloc_end; j++) { + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - QUICKDouble _tmp = phi * phi2 * dfdr * weight; + QUICKDouble _tmp = phi * phi2 * dfdr * weight; -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), val1); -#else - atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); -#endif + atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); + } + } } - } } - } - + } #endif + #ifdef OSHELL __global__ void oshell_getcew_quad_grad_kernel() #else __global__ void cshell_getcew_quad_grad_kernel() #endif { + //declare smem grad vector + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; -#ifdef USE_LEGACY_ATOMICS - //declare smem grad vector - extern __shared__ QUICKULL smem_buffer[]; - QUICKULL* smemGrad=(QUICKULL*)smem_buffer; + // initialize smem grad + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + smemGrad[i]=0.0; - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0ull; -#else - //declare smem grad vector - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + __syncthreads(); - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; -#endif + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - __syncthreads(); + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = 
devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { - - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - - - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; #ifdef OSHELL - QUICKDouble densitysum = devSim_dft.densa[gid]+devSim_dft.densb[gid]; + QUICKDouble densitysum = devSim_dft.densa[gid]+devSim_dft.densb[gid]; #else - QUICKDouble densitysum = 2*devSim_dft.densa[gid]; + QUICKDouble densitysum = 2*devSim_dft.densa[gid]; #endif - QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; - - if(densitysum >devSim_dft.DMCutoff){ - - QUICKDouble _tmp = ((QUICKDouble) (dfdr * densitysum)); + QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; - devSim_dft.exc[gid] = _tmp; + if(densitysum >devSim_dft.DMCutoff) { + QUICKDouble _tmp = ((QUICKDouble) (dfdr * densitysum)); - QUICKDouble sumGradx = 0.0; - QUICKDouble sumGrady = 0.0; - QUICKDouble sumGradz = 0.0; + devSim_dft.exc[gid] = _tmp; - for (int i = bfloc_st; i< bfloc_end; i++) { - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + QUICKDouble sumGradx = 0.0; + QUICKDouble sumGrady = 0.0; + QUICKDouble sumGradz = 0.0; - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { + for (int i = bfloc_st; i< bfloc_end; i++) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - //QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { + //QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; - //pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + //pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - int Istart = (devSim_dft.ncenter[ibas]-1) * 3; + int Istart = (devSim_dft.ncenter[ibas]-1) * 3; - for (int j = bfloc_st; j < bfloc_end; j++) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - - QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #ifdef OSHELL - denseij += (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, 
devSim_dft.nbasis); -#endif - - QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2); - QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2); - QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2); -//printf("test quad grad %f %f %f %f %f %f %f %f %f %f\n", gridx, gridy, gridz, denseij, weight, dfdr, dphidx, dphidy, dphidz, phi2); - -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], Gradx); - GRADADD(smemGrad[Istart+1], Grady); - GRADADD(smemGrad[Istart+2], Gradz); -#else - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); + denseij += (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #endif - sumGradx += Gradx; - sumGrady += Grady; - sumGradz += Gradz; - } + QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2); + QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2); + QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2); + //printf("test quad grad %f %f %f %f %f %f %f %f %f %f\n", gridx, gridy, gridz, denseij, weight, dfdr, dphidx, dphidy, dphidz, phi2); + + atomicAdd(&smemGrad[Istart], Gradx); + atomicAdd(&smemGrad[Istart+1], Grady); + atomicAdd(&smemGrad[Istart+2], Gradz); + sumGradx += Gradx; + sumGrady += Grady; + sumGradz += Gradz; + } + } + } + + int Istart = (devSim_dft.gatm[gid]-1)*3; + + atomicAdd(&smemGrad[Istart], -sumGradx); + atomicAdd(&smemGrad[Istart+1], -sumGrady); + atomicAdd(&smemGrad[Istart+2], -sumGradz); } - } - - int Istart = (devSim_dft.gatm[gid]-1)*3; -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], -sumGradx); - GRADADD(smemGrad[Istart+1], -sumGrady); - GRADADD(smemGrad[Istart+2], -sumGradz); -#else - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); -#endif - - } - - //Set weights for sswder calculation - if(densitysum < devSim_dft.DMCutoff){ + //Set weights for sswder calculation + if(densitysum < devSim_dft.DMCutoff) { devSim_dft.dweight_ssd[gid] = 0; - } + } - if(devSim_dft.sswt[gid] == 1){ + if(devSim_dft.sswt[gid] == 1) { devSim_dft.dweight_ssd[gid] = 0; + } } - - } - - __syncthreads(); - // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) -#ifdef USE_LEGACY_ATOMICS - atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]); -#else - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); -#endif + __syncthreads(); - __syncthreads(); + // update gmem grad vector + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + __syncthreads(); } #endif diff --git a/src/gpu/cuda/gpu_get2e.cu b/src/gpu/cuda/gpu_get2e.cu index 990e4ae91..12c1b59cb 100644 --- a/src/gpu/cuda/gpu_get2e.cu +++ b/src/gpu/cuda/gpu_get2e.cu @@ -802,11 +802,7 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) getAddInt_kerne // } else if( devSim.method == LIBXC) { // hybrid_coeff = devSim.hyb_coeff; // } -#ifdef USE_LEGACY_ATOMICS - addint(devSim.oULL, a[k].value, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); -#else addint(devSim.o, a[k].value, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); -#endif } } diff --git a/src/gpu/cuda/gpu_get2e_getxc_drivers.h b/src/gpu/cuda/gpu_get2e_getxc_drivers.h index 777762111..35ce1678c 100644 --- a/src/gpu/cuda/gpu_get2e_getxc_drivers.h +++ b/src/gpu/cuda/gpu_get2e_getxc_drivers.h @@ -1,22 +1,22 @@ /* - 
!---------------------------------------------------------------------! - ! Created by Madu Manathunga on 04/07/2021 ! - ! ! - ! Previous contributors: Yipu Miao ! - ! ! - ! Copyright (C) 2020-2021 Merz lab ! - ! Copyright (C) 2020-2021 Götz lab ! - ! ! - ! This Source Code Form is subject to the terms of the Mozilla Public ! - ! License, v. 2.0. If a copy of the MPL was not distributed with this ! - ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! - !_____________________________________________________________________! - - !---------------------------------------------------------------------! - ! This source file contains preprocessable get2e and getxc C functions! - ! that can be called from f90 subroutines. ! - !---------------------------------------------------------------------! -*/ + !---------------------------------------------------------------------! + ! Created by Madu Manathunga on 04/07/2021 ! + ! ! + ! Previous contributors: Yipu Miao ! + ! ! + ! Copyright (C) 2020-2021 Merz lab ! + ! Copyright (C) 2020-2021 Götz lab ! + ! ! + ! This Source Code Form is subject to the terms of the Mozilla Public ! + ! License, v. 2.0. If a copy of the MPL was not distributed with this ! + ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! + !_____________________________________________________________________! + + !---------------------------------------------------------------------! + ! This source file contains preprocessable get2e and getxc C functions! + ! that can be called from f90 subroutines. ! + !---------------------------------------------------------------------! + */ //----------------------------------------------- // core part, compute 2-e integrals @@ -28,9 +28,7 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) #endif { PRINTDEBUG("BEGIN TO RUN GET ERI") - upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") #ifdef OSHELL @@ -41,56 +39,12 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) PRINTDEBUG("COMPLETE KERNEL") -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - cudaMemsetAsync(gpu -> gpu_calculated -> oULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } - -#ifdef OSHELL - gpu -> gpu_calculated -> obULL -> Download(); - cudaMemsetAsync(gpu -> gpu_calculated -> obULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->ob->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } 
-#endif - -#else gpu -> gpu_calculated -> o -> Download(); cudaMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); for (int i = 0; i< gpu->nbasis; i++) { for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); + LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } #ifdef OSHELL @@ -102,8 +56,6 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } -#endif - #endif gpu -> gpu_calculated -> o -> DownloadSum(o); @@ -115,25 +67,18 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) PRINTDEBUG("DELETE TEMP VARIABLES") if(gpu -> gpu_sim.method == HF){ - delete gpu->gpu_calculated->o; - delete gpu->gpu_calculated->dense; - -#ifdef USE_LEGACY_ATOMICS - delete gpu->gpu_calculated->oULL; -#ifdef OSHELL - delete gpu->gpu_calculated->obULL; -#endif -#endif + delete gpu->gpu_calculated->o; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->ob; - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->ob; + delete gpu->gpu_calculated->denseb; #endif }else if(*deltaO != 0){ - delete gpu->gpu_calculated->dense; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->denseb; #endif } @@ -142,6 +87,7 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) PRINTDEBUG("COMPLETE RUNNING GET2E") } + #ifdef OSHELL extern "C" void gpu_get_oshell_eri_grad_(QUICKDouble* grad) #else @@ -149,9 +95,7 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) #endif { PRINTDEBUG("BEGIN TO RUN GRAD") - upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") if(gpu -> gpu_sim.is_oshell == true){ @@ -163,7 +107,7 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) #ifdef GPU_SPDF if (gpu->maxL >= 3) { upload_sim_to_constant_ffff(gpu); - + if(gpu -> gpu_sim.is_oshell == true){ get_oshell_eri_grad_ffff(gpu); }else{ @@ -175,52 +119,24 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) PRINTDEBUG("COMPLETE KERNEL") if(gpu -> gpu_sim.method == HF){ - -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu -> grad -> Download(); - -#endif + gpu -> grad -> Download(); } if(gpu -> gpu_sim.method == HF){ + gpu -> grad -> DownloadSum(grad); - gpu -> grad -> DownloadSum(grad); - - delete gpu -> grad; -#ifdef USE_LEGACY_ATOMICS - delete gpu -> gradULL; -#endif - delete gpu->gpu_calculated->dense; + delete gpu -> grad; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->denseb; #endif - } - PRINTDEBUG("COMPLETE RUNNING GRAD") } - #ifdef OSHELL extern "C" void gpu_get_oshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICKDouble* belec, QUICKDouble *o, QUICKDouble *ob) #else @@ -229,123 +145,19 @@ extern "C" void 
gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK { PRINTDEBUG("BEGIN TO RUN GETXC") - gpu -> DFT_calculated = new gpu_buffer_type(1, 1); - -#ifdef USE_LEGACY_ATOMICS - QUICKULL valUII = (QUICKULL) (fabs ( *Eelxc * OSCALE + (QUICKDouble)0.5)); - - if (*Eelxc<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - - gpu -> DFT_calculated -> _hostData[0].Eelxc = valUII; - - valUII = (QUICKULL) (fabs ( *aelec * OSCALE + (QUICKDouble)0.5)); - - if (*aelec<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - gpu -> DFT_calculated -> _hostData[0].aelec = valUII; - - valUII = (QUICKULL) (fabs ( *belec * OSCALE + (QUICKDouble)0.5)); - - if (*belec<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - - gpu -> DFT_calculated -> _hostData[0].belec = valUII; -#else - gpu -> DFT_calculated -> _hostData[0].Eelxc = 0.0; - gpu -> DFT_calculated -> _hostData[0].aelec = 0.0; - gpu -> DFT_calculated -> _hostData[0].belec = 0.0; -#endif - gpu -> DFT_calculated -> Upload(); - gpu -> gpu_sim.DFT_calculated= gpu -> DFT_calculated->_devData; + gpu->DFT_calculated = new gpu_buffer_type(1, 1); + gpu->DFT_calculated->_hostData[0].Eelxc = 0.0; + gpu->DFT_calculated->_hostData[0].aelec = 0.0; + gpu->DFT_calculated->_hostData[0].belec = 0.0; + gpu->DFT_calculated->Upload(); + gpu->gpu_sim.DFT_calculated = gpu->DFT_calculated->_devData; upload_sim_to_constant_dft(gpu); PRINTDEBUG("BEGIN TO RUN KERNEL") getxc(gpu); - gpu -> DFT_calculated -> Download(); - -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } - -#ifdef OSHELL - gpu -> gpu_calculated -> obULL -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->ob->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } - -#endif - - QUICKULL valULL = gpu->DFT_calculated -> _hostData[0].Eelxc; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - *Eelxc = (QUICKDouble)valDB*ONEOVEROSCALE; - - valULL = gpu->DFT_calculated -> _hostData[0].aelec; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - *aelec = (QUICKDouble)valDB*ONEOVEROSCALE; - - valULL = gpu->DFT_calculated -> _hostData[0].belec; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - *belec = (QUICKDouble)valDB*ONEOVEROSCALE; 
-#else + gpu->DFT_calculated->Download(); gpu -> gpu_calculated -> o -> Download(); for (int i = 0; i< gpu->nbasis; i++) { @@ -361,15 +173,12 @@ extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } - #endif - *Eelxc = gpu->DFT_calculated -> _hostData[0].Eelxc; + *Eelxc = gpu->DFT_calculated -> _hostData[0].Eelxc; *aelec = gpu->DFT_calculated -> _hostData[0].aelec; *belec = gpu->DFT_calculated -> _hostData[0].belec; -#endif - gpu -> gpu_calculated -> o -> DownloadSum(o); #ifdef OSHELL gpu -> gpu_calculated -> ob -> DownloadSum(ob); @@ -380,125 +189,70 @@ extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; -#ifdef USE_LEGACY_ATOMICS - delete gpu->gpu_calculated->oULL; -#ifdef OSHELL - delete gpu->gpu_calculated->obULL; -#endif -#endif - #ifdef OSHELL delete gpu->gpu_calculated->ob; delete gpu->gpu_calculated->denseb; #endif - } + #ifdef OSHELL extern "C" void gpu_get_oshell_xcgrad_(QUICKDouble *grad) #else extern "C" void gpu_get_cshell_xcgrad_(QUICKDouble *grad) #endif { - -#if (defined CEW) && !(defined USE_LEGACY_ATOMICS) +#if defined(CEW) gpu -> cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); #endif - // calculate smem size - gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); - - upload_sim_to_constant_dft(gpu); - - memset(gpu->grad->_hostData, 0, gpu -> gpu_xcq -> smem_size); - - getxc_grad(gpu); - -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); + // calculate smem size + gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; + upload_sim_to_constant_dft(gpu); - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } + memset(gpu->grad->_hostData, 0, gpu -> gpu_xcq -> smem_size); - gpu->grad->_hostData[i] += (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu -> grad -> Download(); -#endif + getxc_grad(gpu); + gpu -> grad -> Download(); - gpu -> grad -> DownloadSum(grad); + gpu -> grad -> DownloadSum(grad); -#if (defined CEW) && !(defined USE_LEGACY_ATOMICS) - gpu -> cew_grad->DownloadSum(grad); - delete gpu -> cew_grad; +#if defined(CEW) + gpu -> cew_grad->DownloadSum(grad); + delete gpu -> cew_grad; #endif - delete gpu -> grad; -#ifdef USE_LEGACY_ATOMICS - delete gpu -> gradULL; -#endif - delete gpu->gpu_calculated->dense; + delete gpu -> grad; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->denseb; #endif } - #ifndef OSHELL extern "C" void gpu_get_oei_(QUICKDouble* o) { + // gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -// gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - -//#ifdef LEGACY_ATOMIC_ADD -// gpu -> gpu_calculated -> o -> DeleteGPU(); -// gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -// gpu -> gpu_calculated -> oULL -> Upload(); -// gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; -/*#else - gpu -> gpu_calculated -> o -> Upload(); - gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; + //#ifdef LEGACY_ATOMIC_ADD + // gpu -> gpu_calculated -> o -> DeleteGPU(); + // gpu -> 
gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + // gpu -> gpu_calculated -> oULL -> Upload(); + // gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; + /*#else + gpu -> gpu_calculated -> o -> Upload(); + gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; #endif -*/ +*/ upload_sim_to_constant_oei(gpu); - + upload_para_to_const_oei(); getOEI(gpu); -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - - cudaMemsetAsync(gpu -> gpu_calculated -> oULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } -#else gpu -> gpu_calculated -> o -> Download(); cudaMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); @@ -508,42 +262,30 @@ extern "C" void gpu_get_oei_(QUICKDouble* o) } } -#endif - -/* - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); - } - } -*/ + /* + for (int i = 0; i< gpu->nbasis; i++) { + for (int j = i; j< gpu->nbasis; j++) { + printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); + } + } + */ gpu -> gpu_calculated -> o -> DownloadSum(o); -// SAFE_DELETE(gpu -> gpu_calculated -> o); - -//#ifdef LEGACY_ATOMIC_ADD -// SAFE_DELETE(gpu -> gpu_calculated -> oULL); -//#endif + // SAFE_DELETE(gpu -> gpu_calculated -> o); + //#ifdef LEGACY_ATOMIC_ADD + // SAFE_DELETE(gpu -> gpu_calculated -> oULL); + //#endif } + extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) { - // upload point charge grad vector if(gpu -> nextatom > 0) { gpu -> ptchg_grad = new gpu_buffer_type(3 * gpu -> nextatom); - -#ifdef USE_LEGACY_ATOMICS - gpu -> ptchg_gradULL = new gpu_buffer_type(3 * gpu -> nextatom); - gpu -> ptchg_gradULL -> Upload(); - gpu -> gpu_sim.ptchg_gradULL = gpu -> ptchg_gradULL -> _devData; - gpu -> ptchg_grad -> DeleteGPU(); -#else gpu -> ptchg_grad -> Upload(); gpu -> gpu_sim.ptchg_grad = gpu -> ptchg_grad -> _devData; -#endif - } upload_sim_to_constant_oei(gpu); @@ -551,98 +293,48 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_oei_grad(gpu); // download gradients -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - cudaMemsetAsync(gpu -> gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->natom); - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu->grad->Download(); cudaMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); -#endif - gpu->grad->DownloadSum(grad); -/* for(int i=0; i<3*gpu->natom; ++i){ - 
printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); - - } -*/ + /* for(int i=0; i<3*gpu->natom; ++i){ + printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); + + } + */ // download point charge gradients if(gpu -> nextatom > 0) { + gpu->ptchg_grad->Download(); + cudaMemsetAsync(gpu -> ptchg_grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->nextatom); -#ifdef USE_LEGACY_ATOMICS - gpu -> ptchg_gradULL -> Download(); - - cudaMemsetAsync(gpu -> ptchg_gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->nextatom); - - for (int i = 0; i< 3 * gpu->nextatom; i++) { - QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->ptchg_grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - - gpu->ptchg_grad->Download(); - cudaMemsetAsync(gpu -> ptchg_grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->nextatom); - -#endif - -/* for(int i=0; i<3*gpu->nextatom; ++i){ - printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); - } -*/ - gpu->ptchg_grad->DownloadSum(ptchg_grad); - + /* for(int i=0; i<3*gpu->nextatom; ++i){ + printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); + } + */ + gpu->ptchg_grad->DownloadSum(ptchg_grad); } - // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0 && !gpu->gpu_sim.use_cew) { -#ifdef USE_LEGACY_ATOMICS - SAFE_DELETE(gpu -> ptchg_gradULL); -#endif - SAFE_DELETE(gpu -> ptchg_grad); - } + // ptchg_grad is no longer needed. reclaim the memory. + if(gpu -> nextatom > 0 && !gpu->gpu_sim.use_cew) { + SAFE_DELETE(gpu -> ptchg_grad); + } } -#ifdef CEW +#if defined(CEW) extern "C" void gpu_get_lri_(QUICKDouble* o) { - -// gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - -//#ifdef LEGACY_ATOMIC_ADD -// gpu -> gpu_calculated -> o -> DeleteGPU(); -// gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -// gpu -> gpu_calculated -> oULL -> Upload(); -// gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; -/*#else - gpu -> gpu_calculated -> o -> Upload(); - gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; + // gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + + //#ifdef LEGACY_ATOMIC_ADD + // gpu -> gpu_calculated -> o -> DeleteGPU(); + // gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + // gpu -> gpu_calculated -> oULL -> Upload(); + // gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; + /*#else + gpu -> gpu_calculated -> o -> Upload(); + gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; #endif */ @@ -650,34 +342,12 @@ extern "C" void gpu_get_lri_(QUICKDouble* o) upload_para_to_const_lri(); - get_lri(gpu); + get_lri(gpu); //compute xc quad potential upload_sim_to_constant_dft(gpu); getcew_quad(gpu); -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - - cudaMemsetAsync(gpu -> gpu_calculated -> oULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - 
LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } -#else gpu -> gpu_calculated -> o -> Download(); cudaMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); @@ -687,30 +357,25 @@ extern "C" void gpu_get_lri_(QUICKDouble* o) } } -#endif - - -/* int idxf90=0; - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); - } - } -*/ + /* int idxf90=0; + for (int i = 0; i< gpu->nbasis; i++) { + for (int j = i; j< gpu->nbasis; j++) { + printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); + } + } + */ gpu -> gpu_calculated -> o -> DownloadSum(o); -// SAFE_DELETE(gpu -> gpu_calculated -> o); - -//#ifdef LEGACY_ATOMIC_ADD -// SAFE_DELETE(gpu -> gpu_calculated -> oULL); -//#endif + // SAFE_DELETE(gpu -> gpu_calculated -> o); + //#ifdef LEGACY_ATOMIC_ADD + // SAFE_DELETE(gpu -> gpu_calculated -> oULL); + //#endif } extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) { - upload_sim_to_constant_lri(gpu); upload_para_to_const_lri(); @@ -718,90 +383,37 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_lri_grad(gpu); // download gradients -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - cudaMemsetAsync(gpu -> gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->natom); - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu->grad->Download(); cudaMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); -#endif - gpu->grad->DownloadSum(grad); -/* for(int i=0; i<3*gpu->natom; ++i){ - printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); - - } -*/ + /* for(int i=0; i<3*gpu->natom; ++i){ + printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); + + } + */ // download point charge gradients if(gpu -> nextatom > 0) { + gpu->ptchg_grad->Download(); -#ifdef USE_LEGACY_ATOMICS - gpu -> ptchg_gradULL -> Download(); - - for (int i = 0; i< 3 * gpu->nextatom; i++) { - QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->ptchg_grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } - -#else - - gpu->ptchg_grad->Download(); - -#endif - -/* for(int i=0; i<3*gpu->nextatom; ++i){ - printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); - } -*/ - gpu->ptchg_grad->DownloadSum(ptchg_grad); - + /* for(int i=0; i<3*gpu->nextatom; ++i){ + printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); + } + */ + gpu->ptchg_grad->DownloadSum(ptchg_grad); } - // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0) { -#ifdef USE_LEGACY_ATOMICS - SAFE_DELETE(gpu -> ptchg_gradULL); -#endif - SAFE_DELETE(gpu -> ptchg_grad); - } - + // ptchg_grad is no longer needed. reclaim the memory. 
+    if(gpu -> nextatom > 0) {
+        SAFE_DELETE(gpu -> ptchg_grad);
+    }
 }
 
+
 extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad)
 {
-
-#ifndef USE_LEGACY_ATOMICS
-    gpu -> cew_grad = new gpu_buffer_type<QUICKDouble>(3 * gpu -> nextatom);
-#else
-    memset(gpu -> grad -> _hostData, 0, sizeof(QUICKDouble)*3*gpu->natom);
-#endif
+    gpu->cew_grad = new gpu_buffer_type<QUICKDouble>(3 * gpu -> nextatom);
 
     // calculate smem size
     gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL);
 
@@ -812,38 +424,13 @@ extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad)
     getcew_quad_grad(gpu);
 
     // download gradients
-#ifdef USE_LEGACY_ATOMICS
-    gpu -> gradULL -> Download();
-    cudaMemsetAsync(gpu -> gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->natom);
-    for (int i = 0; i< 3 * gpu->natom; i++) {
-        QUICKULL valULL = gpu->gradULL->_hostData[i];
-        QUICKDouble valDB;
-
-        if (valULL >= 0x8000000000000000ull) {
-            valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull);
-        }
-        else
-        {
-            valDB = (QUICKDouble) valULL;
-        }
-
-        // make sure to add rather than assign. we already computed one part of the cew
-        // gradients on host asynchronously.
-        gpu->grad->_hostData[i] += (QUICKDouble)valDB*ONEOVERGRADSCALE;
-    }
-#else
     gpu->grad->Download();
     cudaMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom);
-#endif
-
     gpu->grad->DownloadSum(grad);
 
-#ifndef USE_LEGACY_ATOMICS
     gpu -> cew_grad ->DownloadSum(grad);
     SAFE_DELETE(gpu -> cew_grad);
-#endif
-
 }
 #endif
 #endif
diff --git a/src/gpu/cuda/gpu_get2e_grad_ffff.cu b/src/gpu/cuda/gpu_get2e_grad_ffff.cu
index 80bfc4306..b2ada4e27 100644
--- a/src/gpu/cuda/gpu_get2e_grad_ffff.cu
+++ b/src/gpu/cuda/gpu_get2e_grad_ffff.cu
@@ -4,8 +4,8 @@
 *
 *  Created by Yipu Miao on 6/17/11.
 *  Copyright 2011 University of Florida. All rights reserved.
- *
- *  Yipu Miao 9/15/11: the first draft is released. And the GPUGP QM compuation can
+ *
+ *  Yipu Miao 9/15/11: the first draft is released, and the GPUGP QM computation can
 *  achieve as much as 15x faster at double precision level compared with CPU.
 */
 
@@ -23,14 +23,14 @@
 
 /*
-   Constant Memory in GPU is fast but quite limited and hard to operate, usually not allocatable and
-   readonly. So we put the following variables into constant memory:
-   devSim: a gpu simluation type variable. which is to store to location of basic information about molecule and basis
-   set. Note it only store the location, so it's mostly a set of pointer to GPU memory. and with some non-pointer
-   value like the number of basis set. See gpu_type.h for details.
-   devTrans : arrays to save the mapping index, will be elimited by hand writing unrolling code.
-   Sumindex: a array to store refect how many temp variable needed in VRR. can be elimited by hand writing code.
-   */
+   Constant memory on the GPU is fast but quite limited and hard to operate on: it is usually not allocatable, and
+   read-only. So we put the following variables into constant memory:
+devSim: a gpu simulation type variable, which stores the location of basic information about the molecule and basis
+set. Note it only stores locations, so it is mostly a set of pointers to GPU memory, plus some non-pointer
+values such as the number of basis functions. See gpu_type.h for details.
+devTrans: arrays to save the mapping index; will be eliminated by hand-written unrolling code.
+Sumindex: an array recording how many temp variables are needed in VRR; can be eliminated by hand-written code.
+*/
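To make the comment above concrete: devSim, declared just below, is refreshed from the host-side gpu_sim struct before kernel launches with cudaMemcpyToSymbol, exactly as upload_sim_to_constant_ffff() does near the end of this file:

    cudaError_t status = cudaMemcpyToSymbol(devSim, &gpu->gpu_sim, sizeof(gpu_simulation_type));
    PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed")

Because only the struct itself (mostly device pointers plus a few scalars) lives in constant memory, the upload is cheap, and uniform reads of it inside a warp are served by the broadcast-friendly constant cache.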
 static __constant__ gpu_simulation_type devSim;
 static __constant__ unsigned char devTrans[TRANSDIM*TRANSDIM*TRANSDIM];
 static __constant__ int Sumindex[10]={0,0,1,4,10,20,35,56,84,120};
@@ -58,7 +58,7 @@ texture tex_Xcoeff;
 #ifdef USE_ERI_GRAD_STOREADD
 #define STORE_OPERATOR +=
 #else
-#define STORE_OPERATOR = 
+#define STORE_OPERATOR =
 #endif
 */
@@ -164,22 +164,22 @@ struct Partial_ERI{
 };
 
 bool ComparePrimNum(Partial_ERI p1, Partial_ERI p2){
-    return p1.kprim_score > p2.kprim_score; 
+    return p1.kprim_score > p2.kprim_score;
 }
 
 void ResortERIs(_gpu_type gpu){
     int2 eri_type_order[]={{0,0},{0,1},{1,0},{1,1},{0,2},{2,0},{1,2},{2,1},{0,3},{3,0},{2,2},{1,3},{3,1},
-    {2,3},{3,2},{3,3}};
+        {2,3},{3,2},{3,3}};
     unsigned char eri_type_order_map[]={0,1,3,6,10,13,15,16};
     int eri_type_block_map[17];
     int2 *resorted_YCutoffIJ=(int2*) malloc(sizeof(int2)*gpu->gpu_cutoff->sqrQshell);
     bool ffset= false;
 
-    // Step 1: sort according sum of angular momentum of a partial ERI. (ie. i+j of gpu_cutoff->sqrQshell; i++){
+        int2 lbl_t=eri_type_order[ieto];
+        eri_type_block_map[idx2]=idx1;
+        for(int i=0; i<gpu->gpu_cutoff->sqrQshell; i++){
             if(gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
-->_hostData[i].x] == lbl_t.x && gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y] ==
-lbl_t.y){
-                resorted_YCutoffIJ[idx1].x = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x;
-                resorted_YCutoffIJ[idx1].y = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y;
-                idx1++;
+            ->_hostData[i].x] == lbl_t.x && gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y] ==
+            lbl_t.y){
+                resorted_YCutoffIJ[idx1].x = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x;
+                resorted_YCutoffIJ[idx1].y = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y;
+                idx1++;
             }
         }
 
@@ -208,10 +208,10 @@ lbl_t.y){
     for(int i=0; i<gpu->gpu_cutoff->sqrQshell; i++){
         gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x=resorted_YCutoffIJ[i].x;
         gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y=resorted_YCutoffIJ[i].y;
-
+
         if(ffset == false && (gpu->gpu_basis->sorted_Qnumber->_hostData[resorted_YCutoffIJ[i].x]+gpu->gpu_basis->sorted_Qnumber->_hostData[resorted_YCutoffIJ[i].y]) == 6){
-            ffStart = i;
-            ffset = true;
+            ffStart = i;
+            ffset = true;
         }
     }
 
@@ -221,20 +221,20 @@ lbl_t.y){
     for(int i=0; i<gpu->gpu_cutoff->sqrQshell; i++){
         int kprim1 = gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
-->_hostData[i].x]];
+            ->_hostData[i].x]];
         int kprim2 = gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
-->_hostData[i].y]];
+            ->_hostData[i].y]];
         int kprim_score = 10*std::max(kprim1,kprim2)+std::min(kprim1,kprim2)+(kprim1+kprim2);
 
         partial_eris[i] = {gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x,
                            gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y,
-            gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \
-            gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y], \
-            gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
-->_hostData[i].x]], \
-            gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
-->_hostData[i].y]], \
-            gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \
-            gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y],
-
kprim_score}; + gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \ + gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y], \ + gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ + ->_hostData[i].x]], \ + gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ + ->_hostData[i].y]], \ + gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \ + gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y], + kprim_score}; } @@ -247,7 +247,7 @@ lbl_t.y){ gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y = partial_eris[i].YCutoffIJ_y; } - gpu -> gpu_cutoff -> sorted_YCutoffIJ -> Upload(); + gpu -> gpu_cutoff -> sorted_YCutoffIJ -> Upload(); gpu -> gpu_sim.sorted_YCutoffIJ = gpu -> gpu_cutoff -> sorted_YCutoffIJ -> _devData; gpu -> gpu_sim.ffStart = ffStart; @@ -255,268 +255,255 @@ lbl_t.y){ void getGrad_ffff(_gpu_type gpu) { + ResortERIs(gpu); + + int *int_buffer = (int*) malloc(ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); + int **int_ptr_buffer = (int**) malloc(ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*)); + QUICKDouble *dbl_buffer = (QUICKDouble*) malloc(ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble)); + QUICKDouble **dbl_ptr_buffer = (QUICKDouble**) malloc(ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); + int2 **int2_ptr_buffer = (int2**) malloc(ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); + unsigned char **char_ptr_buffer = (unsigned char**) malloc(ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); + QUICKAtomicType **grad_ptr_buffer = (QUICKAtomicType**) malloc(ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*)); + unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM]; + for(int i=0; igpu_sim.natom; + int_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.nbasis; + int_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.nshell; + int_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.jbasis; + int_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.sqrQshell; + int_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.prim_total; + int_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.ffStart; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.katom; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.kprim; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.kstart; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.Ksumtype; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.prim_start; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.Qfbasis; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.Qsbasis; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.Qstart; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.sorted_Q; + int_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.sorted_Qnumber; + dbl_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.primLimit; + dbl_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.gradCutoff; + dbl_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.hyb_coeff; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.cons; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.cutMatrix; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.cutPrim; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.dense; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.denseb; + 
dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.expoSum; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.gcexpo; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.store; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.store2; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.storeAA; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*10+i] = gpu->gpu_sim.storeBB; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*11+i] = gpu->gpu_sim.storeCC; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*12+i] = gpu->gpu_sim.weightedCenterX; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*13+i] = gpu->gpu_sim.weightedCenterY; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*14+i] = gpu->gpu_sim.weightedCenterZ; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*15+i] = gpu->gpu_sim.Xcoeff; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*16+i] = gpu->gpu_sim.xyz; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*17+i] = gpu->gpu_sim.YCutoff; + dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*18+i] = gpu->gpu_sim.YVerticalTemp; + int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ; + char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute; + char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN; + grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.grad; + } - ResortERIs(gpu); - - int *int_buffer = (int*) malloc(ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); - int **int_ptr_buffer = (int**) malloc(ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*)); - QUICKDouble *dbl_buffer = (QUICKDouble*) malloc(ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble)); - QUICKDouble **dbl_ptr_buffer = (QUICKDouble**) malloc(ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); - int2 **int2_ptr_buffer = (int2**) malloc(ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); - unsigned char **char_ptr_buffer = (unsigned char**) malloc(ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); - QUICKAtomicType **grad_ptr_buffer = (QUICKAtomicType**) malloc(ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*)); - unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM]; - - - for(int i=0; igpu_sim.natom; - int_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.nbasis; - int_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.nshell; - int_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.jbasis; - int_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.sqrQshell; - int_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.prim_total; - int_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.ffStart; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.katom; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.kprim; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.kstart; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.Ksumtype; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.prim_start; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.Qfbasis; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.Qsbasis; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.Qstart; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.sorted_Q; - int_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.sorted_Qnumber; - dbl_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.primLimit; - dbl_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.gradCutoff; - dbl_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.hyb_coeff; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.cons; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.cutMatrix; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.cutPrim; - 
dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.dense; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.denseb; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.expoSum; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.gcexpo; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.store; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.store2; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.storeAA; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*10+i] = gpu->gpu_sim.storeBB; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*11+i] = gpu->gpu_sim.storeCC; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*12+i] = gpu->gpu_sim.weightedCenterX; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*13+i] = gpu->gpu_sim.weightedCenterY; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*14+i] = gpu->gpu_sim.weightedCenterZ; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*15+i] = gpu->gpu_sim.Xcoeff; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*16+i] = gpu->gpu_sim.xyz; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*17+i] = gpu->gpu_sim.YCutoff; - dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*18+i] = gpu->gpu_sim.YVerticalTemp; - int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ; - char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute; - char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN; -#ifdef USE_LEGACY_ATOMICS - grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.gradULL; -#else - grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.grad; -#endif - } - - - LOC3(trans, 0, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 1; - LOC3(trans, 0, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 4; - LOC3(trans, 0, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 10; - LOC3(trans, 0, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 20; - LOC3(trans, 0, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 35; - LOC3(trans, 0, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 56; - LOC3(trans, 0, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 84; - LOC3(trans, 0, 0, 7, TRANSDIM, TRANSDIM, TRANSDIM) = 120; - LOC3(trans, 0, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 3; - LOC3(trans, 0, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 6; - LOC3(trans, 0, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 17; - LOC3(trans, 0, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 32; - LOC3(trans, 0, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 48; - LOC3(trans, 0, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 67; - LOC3(trans, 0, 1, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 100; - LOC3(trans, 0, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 9; - LOC3(trans, 0, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 16; - LOC3(trans, 0, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 23; - LOC3(trans, 0, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 42; - LOC3(trans, 0, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 73; - LOC3(trans, 0, 2, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 106; - LOC3(trans, 0, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 19; - LOC3(trans, 0, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 31; - LOC3(trans, 0, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 43; - LOC3(trans, 0, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 79; - LOC3(trans, 0, 3, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 112; - LOC3(trans, 0, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 34; - LOC3(trans, 0, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 49; - LOC3(trans, 0, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 74; - LOC3(trans, 0, 4, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 113; - LOC3(trans, 0, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 55; - LOC3(trans, 0, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 68; - LOC3(trans, 0, 5, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 107; - LOC3(trans, 0, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 83; - LOC3(trans, 0, 6, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 101; - LOC3(trans, 0, 7, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 
119; - LOC3(trans, 1, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 2; - LOC3(trans, 1, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 7; - LOC3(trans, 1, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 15; - LOC3(trans, 1, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 28; - LOC3(trans, 1, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 50; - LOC3(trans, 1, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 69; - LOC3(trans, 1, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 102; - LOC3(trans, 1, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 5; - LOC3(trans, 1, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 11; - LOC3(trans, 1, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 26; - LOC3(trans, 1, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 41; - LOC3(trans, 1, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 59; - LOC3(trans, 1, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 87; - LOC3(trans, 1, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 13; - LOC3(trans, 1, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 25; - LOC3(trans, 1, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 36; - LOC3(trans, 1, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 60; - LOC3(trans, 1, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 88; - LOC3(trans, 1, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 30; - LOC3(trans, 1, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 40; - LOC3(trans, 1, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 61; - LOC3(trans, 1, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 94; - LOC3(trans, 1, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 52; - LOC3(trans, 1, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 58; - LOC3(trans, 1, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 89; - LOC3(trans, 1, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 71; - LOC3(trans, 1, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 86; - LOC3(trans, 1, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 104; - LOC3(trans, 2, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 8; - LOC3(trans, 2, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 14; - LOC3(trans, 2, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 22; - LOC3(trans, 2, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 44; - LOC3(trans, 2, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 75; - LOC3(trans, 2, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 108; - LOC3(trans, 2, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 12; - LOC3(trans, 2, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 24; - LOC3(trans, 2, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 37; - LOC3(trans, 2, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 62; - LOC3(trans, 2, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 90; - LOC3(trans, 2, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 21; - LOC3(trans, 2, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 38; - LOC3(trans, 2, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 66; - LOC3(trans, 2, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 99; - LOC3(trans, 2, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 46; - LOC3(trans, 2, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 64; - LOC3(trans, 2, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 98; - LOC3(trans, 2, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 77; - LOC3(trans, 2, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 92; - LOC3(trans, 2, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 110; - LOC3(trans, 3, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 18; - LOC3(trans, 3, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 27; - LOC3(trans, 3, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 45; - LOC3(trans, 3, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 80; - LOC3(trans, 3, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 114; - LOC3(trans, 3, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 29; - LOC3(trans, 3, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 39; - LOC3(trans, 3, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 63; - LOC3(trans, 3, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 95; - LOC3(trans, 3, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 47; - LOC3(trans, 3, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 65; - 
LOC3(trans, 3, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 97; - LOC3(trans, 3, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 81; - LOC3(trans, 3, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 96; - LOC3(trans, 3, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 116; - LOC3(trans, 4, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 33; - LOC3(trans, 4, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 51; - LOC3(trans, 4, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 76; - LOC3(trans, 4, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 115; - LOC3(trans, 4, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 53; - LOC3(trans, 4, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 57; - LOC3(trans, 4, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 91; - LOC3(trans, 4, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 78; - LOC3(trans, 4, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 93; - LOC3(trans, 4, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 117; - LOC3(trans, 5, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 54; - LOC3(trans, 5, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 70; - LOC3(trans, 5, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 109; - LOC3(trans, 5, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 72; - LOC3(trans, 5, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 85; - LOC3(trans, 5, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 111; - LOC3(trans, 6, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 82; - LOC3(trans, 6, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 103; - LOC3(trans, 6, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 105; - LOC3(trans, 7, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 118; - + LOC3(trans, 0, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 1; + LOC3(trans, 0, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 4; + LOC3(trans, 0, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 10; + LOC3(trans, 0, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 20; + LOC3(trans, 0, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 35; + LOC3(trans, 0, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 56; + LOC3(trans, 0, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 84; + LOC3(trans, 0, 0, 7, TRANSDIM, TRANSDIM, TRANSDIM) = 120; + LOC3(trans, 0, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 3; + LOC3(trans, 0, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 6; + LOC3(trans, 0, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 17; + LOC3(trans, 0, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 32; + LOC3(trans, 0, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 48; + LOC3(trans, 0, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 67; + LOC3(trans, 0, 1, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 100; + LOC3(trans, 0, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 9; + LOC3(trans, 0, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 16; + LOC3(trans, 0, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 23; + LOC3(trans, 0, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 42; + LOC3(trans, 0, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 73; + LOC3(trans, 0, 2, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 106; + LOC3(trans, 0, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 19; + LOC3(trans, 0, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 31; + LOC3(trans, 0, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 43; + LOC3(trans, 0, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 79; + LOC3(trans, 0, 3, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 112; + LOC3(trans, 0, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 34; + LOC3(trans, 0, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 49; + LOC3(trans, 0, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 74; + LOC3(trans, 0, 4, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 113; + LOC3(trans, 0, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 55; + LOC3(trans, 0, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 68; + LOC3(trans, 0, 5, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 107; + LOC3(trans, 0, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 83; + LOC3(trans, 0, 6, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 101; + LOC3(trans, 0, 7, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 119; + 
LOC3(trans, 1, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 2; + LOC3(trans, 1, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 7; + LOC3(trans, 1, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 15; + LOC3(trans, 1, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 28; + LOC3(trans, 1, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 50; + LOC3(trans, 1, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 69; + LOC3(trans, 1, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 102; + LOC3(trans, 1, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 5; + LOC3(trans, 1, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 11; + LOC3(trans, 1, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 26; + LOC3(trans, 1, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 41; + LOC3(trans, 1, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 59; + LOC3(trans, 1, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 87; + LOC3(trans, 1, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 13; + LOC3(trans, 1, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 25; + LOC3(trans, 1, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 36; + LOC3(trans, 1, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 60; + LOC3(trans, 1, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 88; + LOC3(trans, 1, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 30; + LOC3(trans, 1, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 40; + LOC3(trans, 1, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 61; + LOC3(trans, 1, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 94; + LOC3(trans, 1, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 52; + LOC3(trans, 1, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 58; + LOC3(trans, 1, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 89; + LOC3(trans, 1, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 71; + LOC3(trans, 1, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 86; + LOC3(trans, 1, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 104; + LOC3(trans, 2, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 8; + LOC3(trans, 2, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 14; + LOC3(trans, 2, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 22; + LOC3(trans, 2, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 44; + LOC3(trans, 2, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 75; + LOC3(trans, 2, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 108; + LOC3(trans, 2, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 12; + LOC3(trans, 2, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 24; + LOC3(trans, 2, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 37; + LOC3(trans, 2, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 62; + LOC3(trans, 2, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 90; + LOC3(trans, 2, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 21; + LOC3(trans, 2, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 38; + LOC3(trans, 2, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 66; + LOC3(trans, 2, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 99; + LOC3(trans, 2, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 46; + LOC3(trans, 2, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 64; + LOC3(trans, 2, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 98; + LOC3(trans, 2, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 77; + LOC3(trans, 2, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 92; + LOC3(trans, 2, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 110; + LOC3(trans, 3, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 18; + LOC3(trans, 3, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 27; + LOC3(trans, 3, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 45; + LOC3(trans, 3, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 80; + LOC3(trans, 3, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 114; + LOC3(trans, 3, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 29; + LOC3(trans, 3, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 39; + LOC3(trans, 3, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 63; + LOC3(trans, 3, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 95; + LOC3(trans, 3, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 47; + LOC3(trans, 3, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 65; + LOC3(trans, 
3, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 97; + LOC3(trans, 3, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 81; + LOC3(trans, 3, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 96; + LOC3(trans, 3, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 116; + LOC3(trans, 4, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 33; + LOC3(trans, 4, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 51; + LOC3(trans, 4, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 76; + LOC3(trans, 4, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 115; + LOC3(trans, 4, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 53; + LOC3(trans, 4, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 57; + LOC3(trans, 4, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 91; + LOC3(trans, 4, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 78; + LOC3(trans, 4, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 93; + LOC3(trans, 4, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 117; + LOC3(trans, 5, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 54; + LOC3(trans, 5, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 70; + LOC3(trans, 5, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 109; + LOC3(trans, 5, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 72; + LOC3(trans, 5, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 85; + LOC3(trans, 5, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 111; + LOC3(trans, 6, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 82; + LOC3(trans, 6, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 103; + LOC3(trans, 6, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 105; + LOC3(trans, 7, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 118; + + int *dev_int_buffer; + int **dev_int_ptr_buffer; + QUICKDouble *dev_dbl_buffer; + QUICKDouble **dev_dbl_ptr_buffer; + int2 **dev_int2_ptr_buffer; + unsigned char **dev_char_ptr_buffer; + unsigned char *dev_char_buffer; + QUICKAtomicType **dev_grad_ptr_buffer; + + cudaMalloc((void **)&dev_int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); + cudaMalloc((void **)&dev_int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*)); + cudaMalloc((void **)&dev_dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble)); + cudaMalloc((void **)&dev_dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); + cudaMalloc((void **)&dev_int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); + cudaMalloc((void **)&dev_char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); + cudaMalloc((void **)&dev_char_buffer, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char)); + cudaMalloc((void **)&dev_grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*)); + + + cudaMemcpy(dev_int_buffer, int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_int_ptr_buffer, int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_dbl_buffer, dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble), cudaMemcpyHostToDevice); + cudaMemcpy(dev_dbl_ptr_buffer, dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_int2_ptr_buffer, int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_char_ptr_buffer, char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned + char*), cudaMemcpyHostToDevice); + cudaMemcpy(dev_char_buffer, &trans, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char), cudaMemcpyHostToDevice); + cudaMemcpy(dev_grad_ptr_buffer, grad_ptr_buffer, 
ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*), + cudaMemcpyHostToDevice); - int *dev_int_buffer; - int **dev_int_ptr_buffer; - QUICKDouble *dev_dbl_buffer; - QUICKDouble **dev_dbl_ptr_buffer; - int2 **dev_int2_ptr_buffer; - unsigned char **dev_char_ptr_buffer; - unsigned char *dev_char_buffer; - QUICKAtomicType **dev_grad_ptr_buffer; - - cudaMalloc((void **)&dev_int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); - cudaMalloc((void **)&dev_int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*)); - cudaMalloc((void **)&dev_dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble)); - cudaMalloc((void **)&dev_dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); - cudaMalloc((void **)&dev_int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); - cudaMalloc((void **)&dev_char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); - cudaMalloc((void **)&dev_char_buffer, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char)); - cudaMalloc((void **)&dev_grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*)); - - - cudaMemcpy(dev_int_buffer, int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dev_int_ptr_buffer, int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_dbl_buffer, dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble), cudaMemcpyHostToDevice); - cudaMemcpy(dev_dbl_ptr_buffer, dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_int2_ptr_buffer, int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_char_ptr_buffer, char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned -char*), cudaMemcpyHostToDevice); - cudaMemcpy(dev_char_buffer, &trans, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char), cudaMemcpyHostToDevice); - cudaMemcpy(dev_grad_ptr_buffer, grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*), -cudaMemcpyHostToDevice); - - if (gpu->maxL >= 3) { + if (gpu->maxL >= 3) { // Part f-3 #ifdef GPU_SPDF - QUICK_SAFE_CALL((getGrad_kernel_ffff<<blocks*ERI_GRAD_FFFF_BPSM, ERI_GRAD_FFFF_TPB, -sizeof(int)*ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB+ - sizeof(QUICKDouble)*ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB+sizeof(QUICKDouble*)*ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(int*)*ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB+ - sizeof(int2*)*ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned -char*)*ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char)*ERI_GRAD_FFFF_SMEM_CHAR_SIZE+ - sizeof(QUICKAtomicType*)*ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB>>>(dev_int_buffer, -dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer, dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer, -dev_grad_ptr_buffer,gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell))) - -#endif - } - - cudaDeviceSynchronize(); - + QUICK_SAFE_CALL((getGrad_kernel_ffff<<blocks*ERI_GRAD_FFFF_BPSM, ERI_GRAD_FFFF_TPB, + sizeof(int)*ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB+ + 
sizeof(QUICKDouble)*ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB+sizeof(QUICKDouble*)*ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(int*)*ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB+ + sizeof(int2*)*ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned + char*)*ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char)*ERI_GRAD_FFFF_SMEM_CHAR_SIZE+ + sizeof(QUICKAtomicType*)*ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB>>>(dev_int_buffer, + dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer, dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer, + dev_grad_ptr_buffer,gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell))) - free(int_buffer); - free(int_ptr_buffer); - free(dbl_buffer); - free(dbl_ptr_buffer); - free(int2_ptr_buffer); - free(char_ptr_buffer); - free(grad_ptr_buffer); +#endif + } - cudaFree(dev_int_buffer); - cudaFree(dev_int_ptr_buffer); - cudaFree(dev_dbl_buffer); - cudaFree(dev_dbl_ptr_buffer); - cudaFree(dev_int2_ptr_buffer); - cudaFree(dev_char_ptr_buffer); - cudaFree(dev_char_buffer); - cudaFree(dev_grad_ptr_buffer); + cudaDeviceSynchronize(); + free(int_buffer); + free(int_ptr_buffer); + free(dbl_buffer); + free(dbl_ptr_buffer); + free(int2_ptr_buffer); + free(char_ptr_buffer); + free(grad_ptr_buffer); + + cudaFree(dev_int_buffer); + cudaFree(dev_int_ptr_buffer); + cudaFree(dev_dbl_buffer); + cudaFree(dev_dbl_ptr_buffer); + cudaFree(dev_int2_ptr_buffer); + cudaFree(dev_char_ptr_buffer); + cudaFree(dev_char_buffer); + cudaFree(dev_grad_ptr_buffer); } // interface to call uscf gradient Kernels void get_oshell_eri_grad_ffff(_gpu_type gpu) { - // nvtxRangePushA("Gradient 2e"); - // compute one electron gradients in the meantime //get_oneen_grad_(); @@ -528,11 +515,10 @@ void get_oshell_eri_grad_ffff(_gpu_type gpu) cudaDeviceSynchronize(); // nvtxRangePop(); - } -void upload_para_to_const_ffff(){ - + +void upload_para_to_const_ffff() { unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM]; // Data to trans { @@ -662,13 +648,13 @@ void upload_para_to_const_ffff(){ status = cudaMemcpyToSymbol(devTrans, trans, sizeof(unsigned char)*TRANSDIM*TRANSDIM*TRANSDIM); PRINTERROR(status, " cudaMemcpyToSymbol, Trans copy to constants failed") - } -void upload_sim_to_constant_ffff(_gpu_type gpu){ + +void upload_sim_to_constant_ffff(_gpu_type gpu) { cudaError_t status; - status = cudaMemcpyToSymbol(devSim, &gpu->gpu_sim, sizeof(gpu_simulation_type)); - PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed") + status = cudaMemcpyToSymbol(devSim, &gpu->gpu_sim, sizeof(gpu_simulation_type)); + PRINTERROR(status, " cudaMemcpyToSymbol, sim copy to constants failed") upload_para_to_const_ffff(); } diff --git a/src/gpu/cuda/gpu_get2e_grad_ffff.cuh b/src/gpu/cuda/gpu_get2e_grad_ffff.cuh index de685ad7d..465116f97 100644 --- a/src/gpu/cuda/gpu_get2e_grad_ffff.cuh +++ b/src/gpu/cuda/gpu_get2e_grad_ffff.cuh @@ -1615,28 +1615,6 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm //printf("FILE: %s, LINE: %d, FUNCTION: %s, DEV_SIM_DBL_HYB_COEFF \n", __FILE__, __LINE__, __func__); #endif -#ifdef USE_LEGACY_ATOMICS - - GRADADD(DEV_SIM_PTR_GRAD[AStart], AGradx); - GRADADD(DEV_SIM_PTR_GRAD[AStart + 1], AGrady); - GRADADD(DEV_SIM_PTR_GRAD[AStart + 2], AGradz); - - - GRADADD(DEV_SIM_PTR_GRAD[BStart], BGradx); - GRADADD(DEV_SIM_PTR_GRAD[BStart + 1], BGrady); - GRADADD(DEV_SIM_PTR_GRAD[BStart + 2], BGradz); - - - GRADADD(DEV_SIM_PTR_GRAD[CStart], CGradx); - GRADADD(DEV_SIM_PTR_GRAD[CStart + 1], CGrady); - 
GRADADD(DEV_SIM_PTR_GRAD[CStart + 2], CGradz); - - - GRADADD(DEV_SIM_PTR_GRAD[DStart], (-AGradx-BGradx-CGradx)); - GRADADD(DEV_SIM_PTR_GRAD[DStart + 1], (-AGrady-BGrady-CGrady)); - GRADADD(DEV_SIM_PTR_GRAD[DStart + 2], (-AGradz-BGradz-CGradz)); - -#else atomicAdd(&DEV_SIM_PTR_GRAD[AStart], AGradx); atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 1], AGrady); atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 2], AGradz); @@ -1655,7 +1633,6 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm atomicAdd(&DEV_SIM_PTR_GRAD[DStart], (-AGradx-BGradx-CGradx)); atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 1], (-AGrady-BGrady-CGrady)); atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 2], (-AGradz-BGradz-CGradz)); -#endif return; } diff --git a/src/gpu/cuda/gpu_getxc.cu b/src/gpu/cuda/gpu_getxc.cu index ebb1ece0b..b497495c6 100644 --- a/src/gpu/cuda/gpu_getxc.cu +++ b/src/gpu/cuda/gpu_getxc.cu @@ -367,21 +367,12 @@ __global__ void get_sswgrad_kernel(){ //declare smem grad vector -#ifdef USE_LEGACY_ATOMICS - extern __shared__ QUICKULL smem_buffer[]; - QUICKULL* smemGrad=(QUICKULL*)smem_buffer; - - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0ull; -#else extern __shared__ QUICKDouble smem_buffer[]; QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; // initialize smem grad for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) smemGrad[i]=0.0; -#endif __syncthreads(); @@ -404,11 +395,7 @@ __global__ void get_sswgrad_kernel(){ // update gmem grad vector for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) -#ifdef USE_LEGACY_ATOMICS - atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]); -#else - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); -#endif + atomicAdd(&devSim_dft.grad[i],smemGrad[i]); __syncthreads(); } @@ -427,15 +414,6 @@ __global__ void get_sswnumgrad_kernel(){ unsigned int natom = devSim_dft.natom; -#ifdef USE_LEGACY_ATOMICS - //declare smem grad vector - extern __shared__ QUICKULL smem_buffer[]; - QUICKULL* smemGrad=(QUICKULL*)smem_buffer; - - // initialize smem grad - for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x) - smemGrad[i]=0ull; -#else //declare smem grad vector extern __shared__ QUICKDouble smem_buffer[]; QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; @@ -443,7 +421,6 @@ __global__ void get_sswnumgrad_kernel(){ // initialize smem grad for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x) smemGrad[i]=0.0; -#endif __syncthreads(); @@ -496,8 +473,6 @@ __global__ void get_sswnumgrad_kernel(){ QUICKDouble dpx = (sswt1-sswt2) * gradfac; - // GRADADD(smemGrad[iatom*3], (sswt1-sswt2) * gradfac); - xatm += SSW_NUMGRAD_DELTA; if(iatom == gatm-1) xparent = xatm; @@ -520,8 +495,6 @@ __global__ void get_sswnumgrad_kernel(){ QUICKDouble dpy = (sswt1-sswt2) * gradfac; - //GRADADD(smemGrad[iatom*3+1], (sswt1-sswt2) * gradfac); - yatm += SSW_NUMGRAD_DELTA; if(iatom == gatm-1) yparent = yatm; @@ -544,21 +517,13 @@ __global__ void get_sswnumgrad_kernel(){ QUICKDouble dpz = (sswt1-sswt2) * gradfac; - //GRADADD(smemGrad[iatom*3+2], (sswt1-sswt2) * gradfac); - zatm += SSW_NUMGRAD_DELTA; if(iatom == gatm-1) zparent = zatm; -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[iatom*3], dpx); - GRADADD(smemGrad[iatom*3+1], dpy); - GRADADD(smemGrad[iatom*3+2], dpz); -#else atomicAdd(&smemGrad[iatom*3], dpx); atomicAdd(&smemGrad[iatom*3+1], dpy); atomicAdd(&smemGrad[iatom*3+2], dpz); -#endif /* printf("sswgrad %f %f %f %d %d %f %f %f \n", gridx, gridy, gridz, iatom, 1, dpx, devSim_dft.exc_ssd[idx], devSim_dft.quadwt[idx]); @@ -575,11 +540,7 @@ 
__global__ void get_sswnumgrad_kernel(){ // update gmem grad vector for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x) -#ifdef USE_LEGACY_ATOMICS - atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]); -#else - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); -#endif + atomicAdd(&devSim_dft.grad[i],smemGrad[i]); __syncthreads(); @@ -975,11 +936,10 @@ __device__ QUICKDouble get_uw_ssd(const QUICKDouble gridx, const QUICKDouble gri } -#ifdef USE_LEGACY_ATOMICS -__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKULL* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom) -#else -__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom) -#endif +__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, + const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble + quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const + int iparent, const int natom) { QUICKDouble sumUW= 0.0; QUICKDouble parent_uw = 0.0; @@ -1003,11 +963,7 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons for(int i=0; i devSim_dft.DMCutoff ){ -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[i*3+j], LOCUWSSD(uw_ssd,j,i,3,natom)*Exc*quadwt*uw*(-p/sumUW)); -#else atomicAdd(&smemGrad[i*3+j],LOCUWSSD(uw_ssd,j,i,3,natom)*Exc*quadwt*uw*(-p/sumUW)); -#endif } } } @@ -1019,26 +975,19 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons get_uw_ssd(gridx, gridy, gridz, uw_ssd, iparent, iparent-1, natom); - for(int i=0; i devSim_dft.DMCutoff ){ -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[i*3+j], LOCUWSSD(uw_ssd,j,i,3,natom)*(1.0/sumUW)*Exc*quadwt*parent_uw); -#else atomicAdd(&smemGrad[i*3+j],LOCUWSSD(uw_ssd,j,i,3,natom)*(1.0/sumUW)*Exc*quadwt*parent_uw); -#endif } } } } -#ifdef USE_LEGACY_ATOMICS -__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKULL* smemGrad, int iparent, int gid) -#else -__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid) -#endif +__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, + QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int + iparent, int gid) { /* This subroutine calculates the derivatives of weight found in @@ -1186,17 +1135,10 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, //printf("gridx: %f gridy: %f gridz: %f Exc: %e quadwt: %e\n",wtgradjx, wtgradjy, wtgradjz, Exc, quadwt); #endif - // We should now have the derivatives of the SS weights. Now just add it to the temporary gradient vector in shared memory. - -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[jstart], wtgradjx * Exc * quadwt); - GRADADD(smemGrad[jstart + 1], wtgradjy * Exc * quadwt); - GRADADD(smemGrad[jstart + 2], wtgradjz * Exc * quadwt); -#else + // We should now have the derivatives of the SS weights. Now just add it to the temporary gradient vector in shared memory. 
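The plain atomicAdd() calls on QUICKDouble that replace GRADADD here and throughout the patch compile to a native instruction only on compute capability 6.0 (Pascal) and newer. For the older GPUs named in the commit message, double-precision atomicAdd must be emulated in software; the canonical compare-and-swap loop from the CUDA C Programming Guide is sketched below. Where QUICK actually places this definition is outside the hunks shown here (gpu_common.h would be the plausible home), so treat this as the technique rather than the verbatim source:

    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
    /* Full double-precision emulation of atomicAdd for pre-Pascal GPUs: retry
     * with atomicCAS on the value's bit pattern until no other thread has
     * written in between. Unlike the retired fixed-point path, nothing is
     * truncated. */
    static __device__ double atomicAdd(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;

        do {
            assumed = old;
            old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);   /* exits once the CAS saw our snapshot */

        return __longlong_as_double(old);
    }
    #endif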
atomicAdd(&smemGrad[jstart], wtgradjx * Exc * quadwt); atomicAdd(&smemGrad[jstart + 1], wtgradjy * Exc * quadwt); atomicAdd(&smemGrad[jstart + 2], wtgradjz * Exc * quadwt); -#endif } } @@ -1206,15 +1148,9 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, #endif // update the temporary gradient vector -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[istart], wtgradix * Exc * quadwt); - GRADADD(smemGrad[istart + 1], wtgradiy * Exc * quadwt); - GRADADD(smemGrad[istart + 2], wtgradiz * Exc * quadwt); -#else atomicAdd(&smemGrad[istart], wtgradix * Exc * quadwt); atomicAdd(&smemGrad[istart + 1], wtgradiy * Exc * quadwt); atomicAdd(&smemGrad[istart + 2], wtgradiz * Exc * quadwt); -#endif } diff --git a/src/gpu/cuda/gpu_getxc.h b/src/gpu/cuda/gpu_getxc.h index 765686659..e002502b6 100644 --- a/src/gpu/cuda/gpu_getxc.h +++ b/src/gpu/cuda/gpu_getxc.h @@ -1,20 +1,20 @@ /* - !---------------------------------------------------------------------! - ! Written by Madu Manathunga on 12/03/2020 ! - ! ! - ! Copyright (C) 2020-2021 Merz lab ! - ! Copyright (C) 2020-2021 Götz lab ! - ! ! - ! This Source Code Form is subject to the terms of the Mozilla Public ! - ! License, v. 2.0. If a copy of the MPL was not distributed with this ! - ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! - !_____________________________________________________________________! - - !---------------------------------------------------------------------! - ! This source file contains preprocessable functions required for ! - ! QUICK GPU version. ! - !---------------------------------------------------------------------! -*/ + !---------------------------------------------------------------------! + ! Written by Madu Manathunga on 12/03/2020 ! + ! ! + ! Copyright (C) 2020-2021 Merz lab ! + ! Copyright (C) 2020-2021 Götz lab ! + ! ! + ! This Source Code Form is subject to the terms of the Mozilla Public ! + ! License, v. 2.0. If a copy of the MPL was not distributed with this ! + ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! + !_____________________________________________________________________! + + !---------------------------------------------------------------------! + ! This source file contains preprocessable functions required for ! + ! QUICK GPU version. ! + !---------------------------------------------------------------------! + */ #ifdef OSHELL #define NSPIN 2 @@ -22,6 +22,7 @@ #define NSPIN 1 #endif + //----------------------------------------------- // Calculate the density and gradients of density at // each grid point. 
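Two idioms recur in the ssw kernels above and in the density/XC kernels that follow: a grid-stride loop, so that any launch geometry covers all npoints, and a two-level gradient reduction in which each block accumulates into a dynamic shared-memory vector and flushes it to global memory with one atomicAdd per element. A stripped-down sketch with illustrative names, assuming natom * 3 doubles of dynamic shared memory as in the smem_size computation earlier in the patch:

    __global__ void grad_reduce_sketch(const QUICKDouble* contrib, const int* owner,
            QUICKDouble* grad, int npoints, int natom)
    {
        extern __shared__ QUICKDouble smemGrad[];   /* natom * 3 doubles per block */

        /* cooperatively zero the block-local accumulator */
        for (int i = threadIdx.x; i < natom * 3; i += blockDim.x)
            smemGrad[i] = 0.0;
        __syncthreads();

        /* grid-stride loop over grid points (x component only, for brevity) */
        for (int gid = blockIdx.x * blockDim.x + threadIdx.x; gid < npoints;
                gid += gridDim.x * blockDim.x)
            atomicAdd(&smemGrad[owner[gid] * 3], contrib[gid]);  /* stays on-chip */
        __syncthreads();

        /* one global atomic per element per block instead of per contribution */
        for (int i = threadIdx.x; i < natom * 3; i += blockDim.x)
            atomicAdd(&grad[i], smemGrad[i]);
    }

A launch passes the buffer size explicitly, e.g. grad_reduce_sketch<<<blocks, tpb, natom * 3 * sizeof(QUICKDouble)>>>(...). This is also why dropping the QUICKULL shared buffers leaves the shared-memory footprint unchanged: sizeof(QUICKULL) == sizeof(QUICKDouble) == 8, which is presumably why the smem_size expression earlier could stay written in terms of QUICKULL.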
@@ -32,642 +33,574 @@ __global__ void get_oshell_density_kernel() __global__ void get_cshell_density_kernel() #endif { - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble density = 0.0; - QUICKDouble gax = 0.0; - QUICKDouble gay = 0.0; - QUICKDouble gaz = 0.0; + QUICKDouble density = 0.0; + QUICKDouble gax = 0.0; + QUICKDouble gay = 0.0; + QUICKDouble gaz = 0.0; #ifdef OSHELL - QUICKDouble densityb = 0.0; - QUICKDouble gbx = 0.0; - QUICKDouble gby = 0.0; - QUICKDouble gbz = 0.0; + QUICKDouble densityb = 0.0; + QUICKDouble gbx = 0.0; + QUICKDouble gby = 0.0; + QUICKDouble gbz = 0.0; #endif - for(int i=bfloc_st; i < bfloc_end; i++){ - - int ibas = (int) devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; + for(int i=bfloc_st; i < bfloc_end; i++) { + int ibas = (int) devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz) >= devSim_dft.XCCutoff ) { + if (abs(phi+dphidx+dphidy+dphidz) >= devSim_dft.XCCutoff ) { - QUICKDouble denseii = LOC2(devSim_dft.dense, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; + QUICKDouble denseii = LOC2(devSim_dft.dense, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; #ifdef OSHELL - QUICKDouble densebii = LOC2(devSim_dft.denseb, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; + QUICKDouble densebii = LOC2(devSim_dft.denseb, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; #endif #ifdef OSHELL - density = density + denseii * phi; - densityb = densityb + densebii * phi; + density = density + denseii * phi; + densityb = densityb + densebii * phi; #else - density = density + denseii * phi / 2.0; + density = density + denseii * phi / 2.0; #endif - gax = gax + denseii * dphidx; - gay = gay + denseii * dphidy; - gaz = gaz + denseii * dphidz; + gax = gax + denseii * dphidx; + gay = gay + denseii * dphidy; + gaz = gaz + denseii * dphidz; #ifdef OSHELL - gbx = gbx + densebii * dphidx; - gby = gby + densebii * dphidy; - gbz = gbz + densebii * dphidz; + gbx = gbx + densebii * dphidx; + gby = gby + densebii * dphidy; + gbz = gbz + densebii * dphidz; #endif - for(int j=i+1; j< bfloc_end; j++){ + for(int j=i+1; j< bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, 
devSim_dft.primf_locator, jbas, j); - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - - QUICKDouble denseij = LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble denseij = LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #ifdef OSHELL - QUICKDouble densebij = LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble densebij = LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #endif #ifdef OSHELL - density = density + 2.0 * denseij * phi * phi2; - densityb = densityb + 2.0 * densebij * phi * phi2; + density = density + 2.0 * denseij * phi * phi2; + densityb = densityb + 2.0 * densebij * phi * phi2; #else - density = density + denseij * phi * phi2; + density = density + denseij * phi * phi2; #endif - gax = gax + denseij * ( phi * dphidx2 + phi2 * dphidx ); - gay = gay + denseij * ( phi * dphidy2 + phi2 * dphidy ); - gaz = gaz + denseij * ( phi * dphidz2 + phi2 * dphidz ); + gax = gax + denseij * ( phi * dphidx2 + phi2 * dphidx ); + gay = gay + denseij * ( phi * dphidy2 + phi2 * dphidy ); + gaz = gaz + denseij * ( phi * dphidz2 + phi2 * dphidz ); #ifdef OSHELL - gbx = gbx + densebij * ( phi * dphidx2 + phi2 * dphidx ); - gby = gby + densebij * ( phi * dphidy2 + phi2 * dphidy ); - gbz = gbz + densebij * ( phi * dphidz2 + phi2 * dphidz ); + gbx = gbx + densebij * ( phi * dphidx2 + phi2 * dphidx ); + gby = gby + densebij * ( phi * dphidy2 + phi2 * dphidy ); + gbz = gbz + densebij * ( phi * dphidz2 + phi2 * dphidz ); #endif - } + } + } } - } #ifdef OSHELL - devSim_dft.densa[gid] = density; - devSim_dft.densb[gid] = densityb; - devSim_dft.gax[gid] = 2.0 * gax; - devSim_dft.gbx[gid] = 2.0 * gbx; - devSim_dft.gay[gid] = 2.0 * gay; - devSim_dft.gby[gid] = 2.0 * gby; - devSim_dft.gaz[gid] = 2.0 * gaz; - devSim_dft.gbz[gid] = 2.0 * gbz; + devSim_dft.densa[gid] = density; + devSim_dft.densb[gid] = densityb; + devSim_dft.gax[gid] = 2.0 * gax; + devSim_dft.gbx[gid] = 2.0 * gbx; + devSim_dft.gay[gid] = 2.0 * gay; + devSim_dft.gby[gid] = 2.0 * gby; + devSim_dft.gaz[gid] = 2.0 * gaz; + devSim_dft.gbz[gid] = 2.0 * gbz; #else - devSim_dft.densa[gid] = density; - devSim_dft.densb[gid] = density; - devSim_dft.gax[gid] = gax; - devSim_dft.gbx[gid] = gax; - devSim_dft.gay[gid] = gay; - devSim_dft.gby[gid] = gay; - devSim_dft.gaz[gid] = gaz; - devSim_dft.gbz[gid] = gaz; + devSim_dft.densa[gid] = density; + devSim_dft.densb[gid] = density; + devSim_dft.gax[gid] = gax; + devSim_dft.gbx[gid] = gax; + devSim_dft.gay[gid] = gay; + devSim_dft.gby[gid] = gay; + devSim_dft.gaz[gid] = gaz; + devSim_dft.gbz[gid] = gaz; #endif - } + } } + #ifdef OSHELL __global__ void oshell_getxc_kernel() #else __global__ void cshell_getxc_kernel() #endif { - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + 
QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble density = devSim_dft.densa[gid]; + QUICKDouble densityb = devSim_dft.densb[gid]; + QUICKDouble gax = devSim_dft.gax[gid]; + QUICKDouble gay = devSim_dft.gay[gid]; + QUICKDouble gaz = devSim_dft.gaz[gid]; + QUICKDouble gbx = devSim_dft.gbx[gid]; + QUICKDouble gby = devSim_dft.gby[gid]; + QUICKDouble gbz = devSim_dft.gbz[gid]; - QUICKDouble weight = devSim_dft.weight[gid]; - QUICKDouble density = devSim_dft.densa[gid]; - QUICKDouble densityb = devSim_dft.densb[gid]; - QUICKDouble gax = devSim_dft.gax[gid]; - QUICKDouble gay = devSim_dft.gay[gid]; - QUICKDouble gaz = devSim_dft.gaz[gid]; - QUICKDouble gbx = devSim_dft.gbx[gid]; - QUICKDouble gby = devSim_dft.gby[gid]; - QUICKDouble gbz = devSim_dft.gbz[gid]; + if(density >devSim_dft.DMCutoff){ - if(density >devSim_dft.DMCutoff){ - - QUICKDouble dfdr; - QUICKDouble xdot, ydot, zdot; - QUICKDouble _tmp ; + QUICKDouble dfdr; + QUICKDouble xdot, ydot, zdot; + QUICKDouble _tmp ; #ifdef OSHELL - QUICKDouble dfdrb; - QUICKDouble xdotb, ydotb, zdotb; + QUICKDouble dfdrb; + QUICKDouble xdotb, ydotb, zdotb; - QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); - QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); - QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); + QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); + QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); + QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); #else - QUICKDouble dot; - QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); - - if (devSim_dft.method == B3LYP) { - _tmp = b3lyp_e(2.0*density, sigma) * weight; - }else if(devSim_dft.method == BLYP){ - _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) - + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)) * weight; - } - - - if (devSim_dft.method == B3LYP) { - dot = b3lypf(2.0*density, sigma, &dfdr); - xdot = dot * gax; - ydot = dot * gay; - zdot = dot * gaz; - }else if(devSim_dft.method == BLYP){ - QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; - QUICKDouble dfdr2; - - becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); - lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); - dfdr += dfdr2; - dfdgaa += dfdgaa2; - dfdgab += dfdgab2; - //Calculate the first term in the dot product shown above,i.e.: - //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) - xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; - ydot = 2.0 * dfdgaa * gay + dfdgab * gby; - zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; - }else if(devSim_dft.method == LIBXC){ -#endif - //Prepare in/out for libxc call - double d_rhoa = (double) density; - double d_rhob = (double) densityb; - - // array d_sigma stores gaa, gab and gbb respectively - QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0}; - // array d_vrho stores dfdra and dfdrb respectively - QUICKDouble d_vrho[2] = {0.0, 0.0}; - // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively - QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0}; - QUICKDouble d_zk = 0.0; + QUICKDouble dot; + QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); + + if (devSim_dft.method == B3LYP) { + _tmp = b3lyp_e(2.0*density, sigma) * weight; + }else if(devSim_dft.method == BLYP){ + _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) + + 
lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)) * weight; + } + + + if (devSim_dft.method == B3LYP) { + dot = b3lypf(2.0*density, sigma, &dfdr); + xdot = dot * gax; + ydot = dot * gay; + zdot = dot * gaz; + }else if(devSim_dft.method == BLYP){ + QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; + QUICKDouble dfdr2; + + becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); + lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); + dfdr += dfdr2; + dfdgaa += dfdgaa2; + dfdgab += dfdgab2; + //Calculate the first term in the dot product shown above,i.e.: + //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) + xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; + ydot = 2.0 * dfdgaa * gay + dfdgab * gby; + zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; + }else if(devSim_dft.method == LIBXC){ +#endif + //Prepare in/out for libxc call + double d_rhoa = (double) density; + double d_rhob = (double) densityb; + + // array d_sigma stores gaa, gab and gbb respectively + QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0}; + // array d_vrho stores dfdra and dfdrb respectively + QUICKDouble d_vrho[2] = {0.0, 0.0}; + // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively + QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0}; + QUICKDouble d_zk = 0.0; #ifdef OSHELL - d_sigma[0] = gaa; - d_sigma[1] = gab; - d_sigma[2] = gbb; + d_sigma[0] = gaa; + d_sigma[1] = gab; + d_sigma[2] = gbb; #else - d_sigma[0] = sigma; + d_sigma[0] = sigma; #endif - int nof_functionals = devSim_dft.nauxfunc; - gpu_libxc_info** glinfo = devSim_dft.glinfo; - - for(int i=0; igpu_worker){ - case GPU_WORK_LDA: - gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN); - break; - - case GPU_WORK_GGA_X: - - gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); - break; - - case GPU_WORK_GGA_C: - gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); - break; - } - d_zk += (tmp_d_zk*tmp_glinfo->mix_coeff); - d_vrho[0] += (tmp_d_vrho[0]*tmp_glinfo->mix_coeff); - d_vsigma[0] += (tmp_d_vsigma[0]*tmp_glinfo->mix_coeff); -#ifdef OSHELL - d_vrho[1] += (tmp_d_vrho[1] * tmp_glinfo->mix_coeff); - d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); - d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); -#endif + int nof_functionals = devSim_dft.nauxfunc; + gpu_libxc_info** glinfo = devSim_dft.glinfo; - } + for(int i=0; igpu_worker){ + case GPU_WORK_LDA: + gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN); + break; - xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; - ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; - zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; -#else - xdot = 4.0 * d_vsigma[0] * gax; - ydot = 4.0 * d_vsigma[0] * gay; - zdot = 4.0 * d_vsigma[0] * gaz; -#endif + case GPU_WORK_GGA_X: -#ifndef OSHELL - } + gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); + break; + + case GPU_WORK_GGA_C: + gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); + break; + } + d_zk += (tmp_d_zk*tmp_glinfo->mix_coeff); + d_vrho[0] += (tmp_d_vrho[0]*tmp_glinfo->mix_coeff); + d_vsigma[0] += (tmp_d_vsigma[0]*tmp_glinfo->mix_coeff); +#ifdef OSHELL + d_vrho[1] += 
(tmp_d_vrho[1] * tmp_glinfo->mix_coeff); + d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); + d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); #endif -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) - val1 = 0ull - val1; - QUICKADD(devSim_dft.DFT_calculated[0].Eelxc, val1); + } - _tmp = weight*density; - val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) - val1 = 0ull - val1; - QUICKADD(devSim_dft.DFT_calculated[0].aelec, val1); + _tmp = ((QUICKDouble) (d_zk * (d_rhoa + d_rhob)) * weight); + dfdr = (QUICKDouble) d_vrho[0]; +#ifdef OSHELL + dfdrb= (QUICKDouble) d_vrho[1]; + xdot = 2.0 * d_vsigma[0] * gax + d_vsigma[1] * gbx; + ydot = 2.0 * d_vsigma[0] * gay + d_vsigma[1] * gby; + zdot = 2.0 * d_vsigma[0] * gaz + d_vsigma[1] * gbz; - _tmp = weight*densityb; - val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) - val1 = 0ull - val1; - QUICKADD(devSim_dft.DFT_calculated[0].belec, val1); + xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; + ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; + zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; #else - atomicAdd(&devSim_dft.DFT_calculated[0].Eelxc, _tmp); - atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight*density); - atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight*densityb); + xdot = 4.0 * d_vsigma[0] * gax; + ydot = 4.0 * d_vsigma[0] * gay; + zdot = 4.0 * d_vsigma[0] * gaz; #endif - for (int i = bfloc_st; i< bfloc_end; ++i) { +#ifndef OSHELL + } +#endif - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; + atomicAdd(&devSim_dft.DFT_calculated[0].Eelxc, _tmp); + atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight*density); + atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight*densityb); - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { - for (int j = bfloc_st; j < bfloc_end; j++) { + for (int i = bfloc_st; i< bfloc_end; ++i) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - QUICKDouble _tmp = (phi * phi2 * dfdr + xdot * (phi*dphidx2 + phi2*dphidx) \ - + ydot * (phi*dphidy2 + phi2*dphidy) + zdot * (phi*dphidz2 + phi2*dphidz))*weight; + QUICKDouble _tmp = (phi * phi2 * dfdr + xdot * (phi*dphidx2 + phi2*dphidx) \ + + ydot * (phi*dphidy2 + phi2*dphidy) + zdot * (phi*dphidz2 + phi2*dphidz))*weight; -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), val1); -#else - atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, 
devSim_dft.nbasis), _tmp); -#endif + atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); #ifdef OSHELL - QUICKDouble _tmpb = (phi * phi2 * dfdrb + xdotb * (phi*dphidx2 + phi2*dphidx) - + ydotb * (phi*dphidy2 + phi2*dphidy) + zdotb * (phi*dphidz2 + phi2*dphidz))*weight; + QUICKDouble _tmpb = (phi * phi2 * dfdrb + xdotb * (phi*dphidx2 + phi2*dphidx) + + ydotb * (phi*dphidy2 + phi2*dphidy) + zdotb * (phi*dphidz2 + phi2*dphidz))*weight; -#ifdef USE_LEGACY_ATOMICS - QUICKULL val2 = (QUICKULL) (fabs( _tmpb * OSCALE) + (QUICKDouble)0.5); - if ( _tmpb * weight < (QUICKDouble)0.0) val2 = 0ull - val2; - QUICKADD(LOC2(devSim_dft.obULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), val2); -#else - atomicAdd(&LOC2(devSim_dft.ob, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb); + atomicAdd(&LOC2(devSim_dft.ob, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb); #endif -#endif - } + } + } + } } - } } - } - } + #ifdef OSHELL __global__ void oshell_getxcgrad_kernel() #else __global__ void cshell_getxcgrad_kernel() #endif { - -#ifdef USE_LEGACY_ATOMICS - //declare smem grad vector - extern __shared__ QUICKULL smem_buffer[]; - QUICKULL* smemGrad=(QUICKULL*)smem_buffer; - - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0ull; -#else - //declare smem grad vector - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; - - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; -#endif - __syncthreads(); - - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { - - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - - - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble weight = devSim_dft.weight[gid]; - QUICKDouble density = devSim_dft.densa[gid]; - QUICKDouble densityb = devSim_dft.densb[gid]; - QUICKDouble gax = devSim_dft.gax[gid]; - QUICKDouble gay = devSim_dft.gay[gid]; - QUICKDouble gaz = devSim_dft.gaz[gid]; - QUICKDouble gbx = devSim_dft.gbx[gid]; - QUICKDouble gby = devSim_dft.gby[gid]; - QUICKDouble gbz = devSim_dft.gbz[gid]; + //declare smem grad vector + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + + // initialize smem grad + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + smemGrad[i]=0.0; + + __syncthreads(); + + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; + + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble density = devSim_dft.densa[gid]; + QUICKDouble densityb = devSim_dft.densb[gid]; + QUICKDouble gax = devSim_dft.gax[gid]; + QUICKDouble gay = devSim_dft.gay[gid]; + QUICKDouble gaz = devSim_dft.gaz[gid]; + QUICKDouble gbx = devSim_dft.gbx[gid]; + QUICKDouble gby = 
devSim_dft.gby[gid]; + QUICKDouble gbz = devSim_dft.gbz[gid]; #ifdef CEW - QUICKDouble dfdr_cew = 0.0; - if(devSim_dft.use_cew) dfdr_cew = devSim_dft.cew_vrecip[gid]; + QUICKDouble dfdr_cew = 0.0; + if(devSim_dft.use_cew) dfdr_cew = devSim_dft.cew_vrecip[gid]; #endif - if(density >devSim_dft.DMCutoff){ + if(density >devSim_dft.DMCutoff){ - QUICKDouble dfdr; - QUICKDouble xdot, ydot, zdot; - QUICKDouble _tmp ; + QUICKDouble dfdr; + QUICKDouble xdot, ydot, zdot; + QUICKDouble _tmp ; #ifdef OSHELL - QUICKDouble dfdrb; - QUICKDouble xdotb, ydotb, zdotb; + QUICKDouble dfdrb; + QUICKDouble xdotb, ydotb, zdotb; - QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); - QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); - QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); + QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); + QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); + QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); #else - QUICKDouble dot; - QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); - - if (devSim_dft.method == B3LYP) { - _tmp = b3lyp_e(2.0*density, sigma); - }else if(devSim_dft.method == BLYP){ - _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) - + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)); - } - - - if (devSim_dft.method == B3LYP) { - dot = b3lypf(2.0*density, sigma, &dfdr); - xdot = dot * gax; - ydot = dot * gay; - zdot = dot * gaz; - }else if(devSim_dft.method == BLYP){ - QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; - QUICKDouble dfdr2; - - becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); - lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); - dfdr += dfdr2; - dfdgaa += dfdgaa2; - dfdgab += dfdgab2; - - //Calculate the first term in the dot product shown above,i.e.: - //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) - xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; - ydot = 2.0 * dfdgaa * gay + dfdgab * gby; - zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; - - }else if(devSim_dft.method == LIBXC){ -#endif - //Prepare in/out for libxc call - QUICKDouble d_rhoa = (QUICKDouble) density; - QUICKDouble d_rhob = (QUICKDouble) densityb; - // array d_sigma stores gaa, gab and gbb respectively - QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0}; - // array d_vrho stores dfdra and dfdrb respectively - QUICKDouble d_vrho[2] = {0.0, 0.0}; - // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively - QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0}; - QUICKDouble d_zk = 0.0; + QUICKDouble dot; + QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); + + if (devSim_dft.method == B3LYP) { + _tmp = b3lyp_e(2.0*density, sigma); + }else if(devSim_dft.method == BLYP){ + _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) + + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)); + } + + + if (devSim_dft.method == B3LYP) { + dot = b3lypf(2.0*density, sigma, &dfdr); + xdot = dot * gax; + ydot = dot * gay; + zdot = dot * gaz; + }else if(devSim_dft.method == BLYP){ + QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; + QUICKDouble dfdr2; + + becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); + lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); + dfdr += dfdr2; + dfdgaa += dfdgaa2; + dfdgab += dfdgab2; + + //Calculate the first term in the dot product shown above,i.e.: + //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) + xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; + ydot = 2.0 * dfdgaa * gay + 
dfdgab * gby; + zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; + + }else if(devSim_dft.method == LIBXC){ +#endif + //Prepare in/out for libxc call + QUICKDouble d_rhoa = (QUICKDouble) density; + QUICKDouble d_rhob = (QUICKDouble) densityb; + // array d_sigma stores gaa, gab and gbb respectively + QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0}; + // array d_vrho stores dfdra and dfdrb respectively + QUICKDouble d_vrho[2] = {0.0, 0.0}; + // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively + QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0}; + QUICKDouble d_zk = 0.0; #ifdef OSHELL - d_sigma[0] = gaa; - d_sigma[1] = gab; - d_sigma[2] = gbb; + d_sigma[0] = gaa; + d_sigma[1] = gab; + d_sigma[2] = gbb; #else - d_sigma[0] = sigma; + d_sigma[0] = sigma; #endif - int nof_functionals = devSim_dft.nauxfunc; - gpu_libxc_info** glinfo = devSim_dft.glinfo; + int nof_functionals = devSim_dft.nauxfunc; + gpu_libxc_info** glinfo = devSim_dft.glinfo; - for(int i=0; igpu_worker){ - case GPU_WORK_LDA: - gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN); - break; + switch(tmp_glinfo->gpu_worker){ + case GPU_WORK_LDA: + gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN); + break; - case GPU_WORK_GGA_X: - gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); - break; + case GPU_WORK_GGA_X: + gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); + break; - case GPU_WORK_GGA_C: - gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); - break; - } - d_zk += (tmp_d_zk * tmp_glinfo->mix_coeff); - d_vrho[0] += (tmp_d_vrho[0] * tmp_glinfo->mix_coeff); - d_vsigma[0] += (tmp_d_vsigma[0] * tmp_glinfo->mix_coeff); + case GPU_WORK_GGA_C: + gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); + break; + } + d_zk += (tmp_d_zk * tmp_glinfo->mix_coeff); + d_vrho[0] += (tmp_d_vrho[0] * tmp_glinfo->mix_coeff); + d_vsigma[0] += (tmp_d_vsigma[0] * tmp_glinfo->mix_coeff); #ifdef OSHELL - d_vrho[1] += (tmp_d_vrho[1] * tmp_glinfo->mix_coeff); - d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); - d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); + d_vrho[1] += (tmp_d_vrho[1] * tmp_glinfo->mix_coeff); + d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); + d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); #endif - } + } - _tmp = ((QUICKDouble) (d_zk * (d_rhoa + d_rhob))); - dfdr = (QUICKDouble) d_vrho[0]; + _tmp = ((QUICKDouble) (d_zk * (d_rhoa + d_rhob))); + dfdr = (QUICKDouble) d_vrho[0]; #ifdef OSHELL - dfdrb= (QUICKDouble) d_vrho[1]; + dfdrb= (QUICKDouble) d_vrho[1]; - xdot = 2.0 * d_vsigma[0] * gax + d_vsigma[1] * gbx; - ydot = 2.0 * d_vsigma[0] * gay + d_vsigma[1] * gby; - zdot = 2.0 * d_vsigma[0] * gaz + d_vsigma[1] * gbz; + xdot = 2.0 * d_vsigma[0] * gax + d_vsigma[1] * gbx; + ydot = 2.0 * d_vsigma[0] * gay + d_vsigma[1] * gby; + zdot = 2.0 * d_vsigma[0] * gaz + d_vsigma[1] * gbz; - xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; - ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; - zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; + xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; + ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; + zdotb = 2.0 * d_vsigma[2] * gbz + 
d_vsigma[1] * gaz; #else - xdot = 4.0 * d_vsigma[0] * gax; - ydot = 4.0 * d_vsigma[0] * gay; - zdot = 4.0 * d_vsigma[0] * gaz; + xdot = 4.0 * d_vsigma[0] * gax; + ydot = 4.0 * d_vsigma[0] * gay; + zdot = 4.0 * d_vsigma[0] * gaz; #endif #ifndef OSHELL - } + } #endif #ifdef CEW - devSim_dft.exc[gid] = _tmp + (dfdr_cew * (density+densityb)); + devSim_dft.exc[gid] = _tmp + (dfdr_cew * (density+densityb)); #else - devSim_dft.exc[gid] = _tmp; + devSim_dft.exc[gid] = _tmp; #endif - QUICKDouble sumGradx=0.0, sumGrady=0.0, sumGradz=0.0; + QUICKDouble sumGradx=0.0, sumGrady=0.0, sumGradz=0.0; - for (int i = bfloc_st; i< bfloc_end; i++) { - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + for (int i = bfloc_st; i< bfloc_end; i++) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { - QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; + QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; - pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - int Istart = (devSim_dft.ncenter[ibas]-1) * 3; + int Istart = (devSim_dft.ncenter[ibas]-1) * 3; - for (int j = bfloc_st; j < bfloc_end; j++) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); + QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); - QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2 + + xdot * (dxdx * phi2 + dphidx * dphidx2) + + ydot * (dxdy * phi2 + dphidx * dphidy2) + + zdot * (dxdz * phi2 + dphidx * dphidz2)); - QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2 - + xdot * (dxdx * phi2 + dphidx * dphidx2) - + ydot * (dxdy * phi2 + dphidx * dphidy2) - + zdot * (dxdz * phi2 + dphidx * dphidz2)); + QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2 + + xdot * (dxdy * phi2 + dphidy * dphidx2) + + ydot * (dydy * phi2 + dphidy * dphidy2) + + zdot * (dydz * phi2 + dphidy * dphidz2)); - QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2 - + xdot * (dxdy * phi2 + dphidy * dphidx2) - + ydot * (dydy * phi2 + dphidy * dphidy2) - + zdot * (dydz * phi2 + dphidy * dphidz2)); - - QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2 - + xdot * (dxdz * phi2 + dphidz * dphidx2) - + ydot * (dydz * phi2 + dphidz * dphidy2) - + zdot * (dzdz * phi2 + dphidz * dphidz2)); + QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2 + + xdot * (dxdz * phi2 + dphidz * dphidx2) + + ydot * (dydz * phi2 + 
dphidz * dphidy2) + + zdot * (dzdz * phi2 + dphidz * dphidz2)); #ifdef OSHELL - QUICKDouble densebij = (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); - - Gradx += - 2.0 * densebij * weight * (dfdrb * dphidx * phi2 - + xdotb * (dxdx * phi2 + dphidx * dphidx2) - + ydotb * (dxdy * phi2 + dphidx * dphidy2) - + zdotb * (dxdz * phi2 + dphidx * dphidz2)); - - Grady += - 2.0 * densebij * weight * (dfdrb * dphidy * phi2 - + xdotb * (dxdy * phi2 + dphidy * dphidx2) - + ydotb * (dydy * phi2 + dphidy * dphidy2) - + zdotb * (dydz * phi2 + dphidy * dphidz2)); - - Gradz += - 2.0 * densebij * weight * (dfdrb * dphidz * phi2 - + xdotb * (dxdz * phi2 + dphidz * dphidx2) - + ydotb * (dydz * phi2 + dphidz * dphidy2) - + zdotb * (dzdz * phi2 + dphidz * dphidz2)); + QUICKDouble densebij = (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + + Gradx += - 2.0 * densebij * weight * (dfdrb * dphidx * phi2 + + xdotb * (dxdx * phi2 + dphidx * dphidx2) + + ydotb * (dxdy * phi2 + dphidx * dphidy2) + + zdotb * (dxdz * phi2 + dphidx * dphidz2)); + + Grady += - 2.0 * densebij * weight * (dfdrb * dphidy * phi2 + + xdotb * (dxdy * phi2 + dphidy * dphidx2) + + ydotb * (dydy * phi2 + dphidy * dphidy2) + + zdotb * (dydz * phi2 + dphidy * dphidz2)); + + Gradz += - 2.0 * densebij * weight * (dfdrb * dphidz * phi2 + + xdotb * (dxdz * phi2 + dphidz * dphidx2) + + ydotb * (dydz * phi2 + dphidz * dphidy2) + + zdotb * (dzdz * phi2 + dphidz * dphidz2)); #endif #ifdef CEW - if(devSim_dft.use_cew){ + if(devSim_dft.use_cew){ #ifdef OSHELL - denseij += densebij; + denseij += densebij; #endif - Gradx -= 2.0 * denseij * weight * dfdr_cew * dphidx * phi2; - Grady -= 2.0 * denseij * weight * dfdr_cew * dphidy * phi2; - Gradz -= 2.0 * denseij * weight * dfdr_cew * dphidz * phi2; + Gradx -= 2.0 * denseij * weight * dfdr_cew * dphidx * phi2; + Grady -= 2.0 * denseij * weight * dfdr_cew * dphidy * phi2; + Gradz -= 2.0 * denseij * weight * dfdr_cew * dphidz * phi2; - } + } #endif -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], Gradx); - GRADADD(smemGrad[Istart+1], Grady); - GRADADD(smemGrad[Istart+2], Gradz); -#else - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); -#endif - sumGradx += Gradx; - sumGrady += Grady; - sumGradz += Gradz; - - } - } - } + atomicAdd(&smemGrad[Istart], Gradx); + atomicAdd(&smemGrad[Istart+1], Grady); + atomicAdd(&smemGrad[Istart+2], Gradz); + sumGradx += Gradx; + sumGrady += Grady; + sumGradz += Gradz; + } + } + } - int Istart = (devSim_dft.gatm[gid]-1) * 3; + int Istart = (devSim_dft.gatm[gid]-1) * 3; -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], -sumGradx); - GRADADD(smemGrad[Istart+1], -sumGrady); - GRADADD(smemGrad[Istart+2], -sumGradz); -#else - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); -#endif - } - //Set weights for sswder calculation - if(density < devSim_dft.DMCutoff){ + atomicAdd(&smemGrad[Istart], -sumGradx); + atomicAdd(&smemGrad[Istart+1], -sumGrady); + atomicAdd(&smemGrad[Istart+2], -sumGradz); + } + //Set weights for sswder calculation + if(density < devSim_dft.DMCutoff){ devSim_dft.dweight_ssd[gid] = 0; - } + } - if(devSim_dft.sswt[gid] == 1){ + if(devSim_dft.sswt[gid] == 1){ devSim_dft.dweight_ssd[gid] = 0; - } - - } + } - __syncthreads(); + } - // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) -#ifdef 
USE_LEGACY_ATOMICS - atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]); -#else - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); -#endif + __syncthreads(); - __syncthreads(); + // update gmem grad vector + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + __syncthreads(); } #undef NSPIN diff --git a/src/gpu/cuda/gpu_oei.h b/src/gpu/cuda/gpu_oei.h index 542312d37..96e1c1124 100644 --- a/src/gpu/cuda/gpu_oei.h +++ b/src/gpu/cuda/gpu_oei.h @@ -62,15 +62,7 @@ __device__ void addint_oei(unsigned int I, unsigned int J, unsigned int II, unsi // LOC2(devSim.KLMN, 2, JJJ - 1, 3,devSim.nbasis)); // } -#if defined(USE_LEGACY_ATOMICS) - QUICKULL Yull = (QUICKULL) (fabs(Y * OSCALE) + (QUICKDouble) 0.5); - if (Y < (QUICKDouble)0.0) Yull = 0ull - Yull; - - // Now add the contribution into Fock matrix. - QUICKADD(LOC2(devSim.oULL, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Yull); -#else atomicAdd(&LOC2(devSim.o, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Y); -#endif //printf("addint_oei: %d %d %f %f %f \n", III, JJJ, devSim.cons[III-1], devSim.cons[JJJ-1], LOCSTORE(store2, i-1, j-1, STOREDIM, STOREDIM)); } diff --git a/src/gpu/cuda/gpu_type.h b/src/gpu/cuda/gpu_type.h index 18f2538db..120b6c1a7 100644 --- a/src/gpu/cuda/gpu_type.h +++ b/src/gpu/cuda/gpu_type.h @@ -28,81 +28,66 @@ */ template struct gpu_buffer_type; struct gpu_calculated_type { - int natom; // number of atom - int nbasis; // number of basis sets - gpu_buffer_type* o; // O matrix - gpu_buffer_type* ob; // beta O matrix - gpu_buffer_type* dense; // Density Matrix - gpu_buffer_type* denseb; // Beta Density Matrix -#ifdef USE_LEGACY_ATOMICS - gpu_buffer_type* oULL; // Unsigned long long int type O matrix - gpu_buffer_type* obULL; // Unsigned long long int type Ob matrix -#endif - gpu_buffer_type* distance; // distance matrix + int natom; // number of atom + int nbasis; // number of basis sets + gpu_buffer_type* o; // O matrix + gpu_buffer_type* ob; // beta O matrix + gpu_buffer_type* dense; // Density Matrix + gpu_buffer_type* denseb; // Beta Density Matrix + gpu_buffer_type* distance; // distance matrix }; // struct to hold large temporary device arrays -struct gpu_scratch{ - +struct gpu_scratch { gpu_buffer_type* store; // holds temporary primitive integrals in OEI and ERI algorithms gpu_buffer_type* store2; // holds temporary primitive integrals in OEI and ERI algorithms gpu_buffer_type* storeAA; // holds weighted temporary primitive integrals in OEI and ERI gradient algorithms gpu_buffer_type* storeBB; // holds weighted temporary primitive integrals in OEI and ERI gradient algorithms gpu_buffer_type* storeCC; // holds weighted temporary primitive integrals in OEI and ERI gradient algorithms gpu_buffer_type* YVerticalTemp; // holds boys function values - }; -struct gpu_timer_type{ - - double t_2elb; // time for eri load balancing in mgpu version - double t_xclb; // time for xc load balancing in mgpu version - double t_xcrb; // time for xc load re-balancing in mgpu version - double t_xcpg; // grid pruning time - +struct gpu_timer_type { + double t_2elb; // time for eri load balancing in mgpu version + double t_xclb; // time for xc load balancing in mgpu version + double t_xcrb; // time for xc load re-balancing in mgpu version + double t_xcpg; // grid pruning time }; struct gpu_cutoff_type { - int natom; - int nbasis; - int nshell; + int natom; + int nbasis; + int nshell; // the following are for pre-sorting cutoff - int sqrQshell; - gpu_buffer_type* sorted_YCutoffIJ; + int 
sqrQshell;
+ gpu_buffer_type* sorted_YCutoffIJ;
// Cutoff matrix
- gpu_buffer_type* cutMatrix;
- gpu_buffer_type* YCutoff;
- gpu_buffer_type* cutPrim;
+ gpu_buffer_type* cutMatrix;
+ gpu_buffer_type* YCutoff;
+ gpu_buffer_type* cutPrim;
// Cutoff criteria
- QUICKDouble integralCutoff;
- QUICKDouble coreIntegralCutoff;
- QUICKDouble primLimit;
- QUICKDouble DMCutoff;
- QUICKDouble XCCutoff;
- QUICKDouble gradCutoff;
+ QUICKDouble integralCutoff;
+ QUICKDouble coreIntegralCutoff;
+ QUICKDouble primLimit;
+ QUICKDouble DMCutoff;
+ QUICKDouble XCCutoff;
+ QUICKDouble gradCutoff;
// One electron pre-sorting cutoff
- gpu_buffer_type* sorted_OEICutoffIJ;
-
+ gpu_buffer_type* sorted_OEICutoffIJ;
};
struct DFT_calculated_type {
-#ifdef USE_LEGACY_ATOMICS
- QUICKULL Eelxc; // exchange correction energy
- QUICKULL aelec; // alpha electron
- QUICKULL belec; // beta electron
-#else
- QUICKDouble Eelxc; // exchange correction energy
- QUICKDouble aelec; // alpha electron
- QUICKDouble belec; // beta electron
-#endif
+ QUICKDouble Eelxc; // exchange-correlation energy
+ QUICKDouble aelec; // alpha electron
+ QUICKDouble belec; // beta electron
};
/*Madu Manathunga 11/21/2019*/
-struct XC_quadrature_type{
+struct XC_quadrature_type {
int npoints; //Total number of packed grid points
int nbins; //Total number of bins
int ntotbf; //Total number of basis functions
@@ -114,15 +99,15 @@ struct XC_quadrature_type{
gpu_buffer_type* gridz; //Z coordinate of a grid point
gpu_buffer_type* sswt; //A version of weight required for gradients
gpu_buffer_type* weight; //Scuzeria weight of a grid point
- gpu_buffer_type* gatm; //To which atom does a given grid point belongs to?
- gpu_buffer_type* bin_counter; //Keeps track of bin borders
- gpu_buffer_type* dweight_ssd; //Dummy weight of grid points for sswder
- gpu_buffer_type* basf; //Basis function indices of all grid points
- gpu_buffer_type* primf; //Primitive function inidices of all grid points
- gpu_buffer_type* primfpbin; //Number of primitive functions per bin
- gpu_buffer_type* basf_locator; //Helps accessing b.f. indices of a grid point
- gpu_buffer_type* primf_locator; //Helps accessing p.f. indices of a b.f.
+ gpu_buffer_type* gatm; //To which atom does a given grid point belong?
+ gpu_buffer_type* bin_counter; //Keeps track of bin borders
+ gpu_buffer_type* dweight_ssd; //Dummy weight of grid points for sswder
+ gpu_buffer_type* basf; //Basis function indices of all grid points
+ gpu_buffer_type* primf; //Primitive function indices of all grid points
+ gpu_buffer_type* primfpbin; //Number of primitive functions per bin
+ gpu_buffer_type* basf_locator; //Helps accessing b.f. indices of a grid point
+ gpu_buffer_type* primf_locator; //Helps accessing p.f. indices of a b.f.
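+ //
+ // Lookup chain used by the XC kernels in gpu_getxc.h (sketch only; the
+ // exclusive upper bounds are an assumption inferred from the kernel loops,
+ // not a documented contract):
+ //   int bin = bin_locator[gid];            // packed grid point -> bin
+ //   for (int i = basf_locator[bin]; i < basf_locator[bin + 1]; i++) {
+ //       int ibas = basf[i];                // basis function index at this point
+ //       // primitives for slot i assumed bounded by
+ //       // primf[primf_locator[i]] .. primf[primf_locator[i + 1] - 1]
+ //   }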
+ gpu_buffer_type* bin_locator; //Helps accessing bin of a grid point
//Temporary variables
gpu_buffer_type* densa;
@@ -140,7 +125,7 @@ struct XC_quadrature_type{
gpu_buffer_type* dphidx; // x gradient of a basis function at a grid point
gpu_buffer_type* dphidy; // y gradient of a basis function at a grid point
gpu_buffer_type* dphidz; // z gradient of a basis function at a grid point
- gpu_buffer_type* phi_loc; // stores locations of phi array for each grid point
+ gpu_buffer_type* phi_loc; // stores locations of phi array for each grid point
//Variables for ssw derivative calculation
int npoints_ssd; //Total number of input points for ssd
@@ -150,7 +135,7 @@ struct XC_quadrature_type{
gpu_buffer_type* gridz_ssd; //Z coordinate of a grid point
gpu_buffer_type* exc_ssd;
gpu_buffer_type* quadwt; //quadrature weight
- gpu_buffer_type* gatm_ssd; //To which atom does a given grid point belongs to?
+ gpu_buffer_type* gatm_ssd; //To which atom does a given grid point belong?
gpu_buffer_type* uw_ssd; //Holds unnormalized weights during ssd calculation
//Variables for grid weight calculation
@@ -160,26 +145,23 @@ struct XC_quadrature_type{
//Variables for obtaining octree info
gpu_buffer_type* gpweight; //keeps track of significant grid points for octree pruning
- gpu_buffer_type* cfweight; //keeps track of significant b.f. for octree pruning
- gpu_buffer_type* pfweight; //keeps track of significant p.f. for octree pruning
+ gpu_buffer_type* cfweight; //keeps track of significant b.f. for octree pruning
+ gpu_buffer_type* pfweight; //keeps track of significant p.f. for octree pruning
// mpi variables
- gpu_buffer_type* mpi_bxccompute;
+ gpu_buffer_type* mpi_bxccompute;
// shared memory size
int smem_size; //size of shared memory buffer in xc kernels
};
-struct lri_data_type{
-
+struct lri_data_type {
int zeta;
gpu_buffer_type* cc;
gpu_buffer_type* vrecip;
-
};
struct gpu_simulation_type {
-
// basic molecule information and method information
QUICK_METHOD method;
DFT_calculated_type* DFT_calculated;
diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h
index ff4adb5a0..aa0e1e794 100644
--- a/src/gpu/gpu_common.h
+++ b/src/gpu/gpu_common.h
@@ -201,11 +201,7 @@ static FILE *debugFile = NULL;
//typedef float QUICKDouble;
typedef float QUICKSingle;
#define QUICKULL unsigned long long int
-#if defined(USE_LEGACY_ATOMICS)
- #define QUICKAtomicType unsigned long long int
-#else
- #define QUICKAtomicType double
-#endif
+#define QUICKAtomicType double
/* SM Version enum */
@@ -233,25 +229,30 @@ struct ERI_entry {
};
+//TODO: rewrite MP2 code and remove this constant
/* energy scaling constants */
#define OSCALE ((QUICKDouble) 1.0e12)
-#define ONEOVEROSCALE ((QUICKDouble) 1.0e-12)
-#define GRADSCALE ((QUICKDouble) 1.0e16)
-#define ONEOVERGRADSCALE ((QUICKDouble) 1.0e-16)
-/* atomic addition */
-#if defined(TEST)
- #define QUICKADD(address, val) ((address) += (val))
-#else
- #define QUICKADD(address, val) atomicAdd(&(address), (val))
-#endif
-#define GRADADD(address, val) \
-{ \
- QUICKULL val2 = (QUICKULL) (fabs((val) * GRADSCALE) + (QUICKDouble) 0.5); \
- if ( val < (QUICKDouble) 0.0 ) val2 = 0ull - val2; \
- QUICKADD(address, val2); \
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+GPU_DEVICE static inline double atomicAdd( double * address, double val )
+{
+ unsigned long long int *address_as_ull, old, assumed;
+
+ address_as_ull = (unsigned long long int *) address;
+ old = *address_as_ull;
+
+ do
+ {
+ assumed = old;
+ old = atomicCAS( address_as_ull, assumed,
+ __double_as_longlong(val + 
__longlong_as_double(assumed)) ); + } + while ( assumed != old ); + + return __longlong_as_double( old ); } +#endif #endif diff --git a/src/gpu/gpu_get2e_subs.h b/src/gpu/gpu_get2e_subs.h index f3317458f..fd69bce52 100644 --- a/src/gpu/gpu_get2e_subs.h +++ b/src/gpu/gpu_get2e_subs.h @@ -842,20 +842,11 @@ __device__ __forceinline__ void iclass_spdf10 if (abs(Y) > devSim.integralCutoff) { -#if defined(USE_LEGACY_ATOMICS) - #if defined(OSHELL) - addint_oshell(devSim.oULL,devSim.obULL, Y, III, JJJ, KKK, LLL, - devSim.hyb_coeff, devSim.dense, devSim.denseb, devSim.nbasis); - #else - addint(devSim.oULL, Y, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); - #endif -#else - #if defined(OSHELL) +#if defined(OSHELL) addint_oshell(devSim.o,devSim.ob, Y, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.denseb, devSim.nbasis); - #else +#else addint(devSim.o, Y, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); - #endif #endif } } @@ -1248,7 +1239,7 @@ __device__ __forceinline__ void iclass_AOInt_spdf10 a.IJ = (III - 1) * devSim.nbasis + JJJ - 1; a.KL = (KKK - 1) * devSim.nbasis + LLL - 1; - aoint_buffer[QUICKADD(devSim.intCount[streamID], 1)] = a; + aoint_buffer[atomicAdd(&devSim.intCount[streamID], 1)] = a; } } } @@ -1262,26 +1253,14 @@ __device__ __forceinline__ void iclass_AOInt_spdf10 #ifndef new_quick_2_gpu_get2e_subs_h #define new_quick_2_gpu_get2e_subs_h - #if defined(USE_LEGACY_ATOMICS) - #if defined(OSHELL) -__device__ __forceinline__ void addint_oshell(QUICKULL* oULL, QUICKULL* obULL, const QUICKDouble Y, - const int III, const int JJJ, const int KKK, const int LLL, - const QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb, const int nbasis) - #else -__device__ __forceinline__ void addint(QUICKULL* oULL, const QUICKDouble Y, - const int III, const int JJJ, const int KKK, const int LLL, - const QUICKDouble hybrid_coeff, QUICKDouble* dense, const int nbasis) - #endif - #else - #if defined(OSHELL) + #if defined(OSHELL) __device__ __forceinline__ void addint_oshell(QUICKDouble* o, QUICKDouble* ob,QUICKDouble Y, int III, int JJJ, int KKK, int LLL, QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis) - #else + #else __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, int III, int JJJ, int KKK, int LLL, QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis) - #endif #endif { #if defined(OSHELL) @@ -1305,15 +1284,8 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, } QUICKDouble val1d = _tmp * DENSELK * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5); - if (val1d < (QUICKDouble) 0.0) val1 = 0ull - val1; - QUICKADD(LOC2(oULL, JJJ - 1, III - 1, nbasis, nbasis), val1); - QUICKADD(LOC2(obULL, JJJ - 1, III - 1, nbasis, nbasis), val1); - #else atomicAdd(&LOC2(o, JJJ - 1, III - 1, nbasis, nbasis), val1d); atomicAdd(&LOC2(ob, JJJ - 1, III - 1, nbasis, nbasis), val1d); - #endif // ATOMIC ADD VALUE 2 if (LLL != JJJ || III != KKK) { @@ -1323,87 +1295,36 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, } QUICKDouble val2d = _tmp * DENSEJI * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5); - if (val2d < (QUICKDouble) 0.0) val2 = 0ull - val2; - QUICKADD(LOC2(oULL, LLL - 1, KKK - 1, nbasis, nbasis), val2); - QUICKADD(LOC2(obULL, LLL - 1, KKK - 1, nbasis, nbasis), val2); - #else atomicAdd(&LOC2(o, LLL - 1, KKK - 1, nbasis, nbasis), val2d); 
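+ // Accumulation note (illustrative sketch only): the retired QUICKADD path
+ // quantized each contribution to fixed point before the atomic update,
+ // roughly
+ //   QUICKULL v = (QUICKULL) (fabs(val2d * OSCALE) + 0.5);  // 1 unit == 1e-12
+ //   if (val2d < 0.0) v = 0ull - v;                         // wrap to encode sign
+ // so contributions smaller than 0.5e-12 were silently dropped. The native
+ // double-precision atomicAdd used here avoids that truncation; on pre-Pascal
+ // devices (__CUDA_ARCH__ < 600) it resolves to the atomicCAS-based emulation
+ // introduced in gpu_common.h above.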
atomicAdd(&LOC2(ob, LLL - 1, KKK - 1, nbasis, nbasis), val2d); - #endif } // ATOMIC ADD VALUE 3 QUICKDouble val3da = hybrid_coeff * DENSELJA * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val3a = (QUICKULL) (fabs(val3da * OSCALE) + (QUICKDouble) 0.5); - if (III == KKK && III < JJJ && JJJ < LLL) { - val3a = (QUICKULL) (fabs(2 * val3da * OSCALE) + (QUICKDouble) 0.5); - } - if (DENSELJA * Y < (QUICKDouble) 0.0) val3a = 0ull - val3a; - QUICKADD(LOC2(oULL, KKK - 1, III - 1, nbasis, nbasis), 0ull - val3a); - #else if (III == KKK && III < JJJ && JJJ < LLL) { val3da *= 2.0; } atomicAdd(&LOC2(o, KKK - 1, III - 1, nbasis, nbasis), -val3da); - #endif QUICKDouble val3db = hybrid_coeff * DENSELJB * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val3b = (QUICKULL) (fabs(val3db * OSCALE) + (QUICKDouble) 0.5); - if (III == KKK && III < JJJ && JJJ < LLL) { - val3b = (QUICKULL) (fabs(2.0 * val3db * OSCALE) + (QUICKDouble) 0.5); - } - if (DENSELJB * Y < (QUICKDouble) 0.0) val3b = 0ull - val3b; - QUICKADD(LOC2(obULL, KKK - 1, III - 1, nbasis, nbasis), 0ull - val3b); - #else if (III == KKK && III < JJJ && JJJ < LLL) { val3db *= 2.0; } atomicAdd(&LOC2(ob, KKK - 1, III - 1, nbasis, nbasis), -val3db); - #endif // ATOMIC ADD VALUE 4 if (KKK != LLL) { QUICKDouble val4da = hybrid_coeff * DENSEKJA * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val4a = (QUICKULL) (fabs(val4da * OSCALE) + (QUICKDouble) 0.5); - if (val4da < (QUICKDouble) 0.0) val4a = 0ull - val4a; - QUICKADD(LOC2(oULL, LLL - 1, III - 1, nbasis, nbasis), 0ull - val4a); - #else atomicAdd(&LOC2(o, LLL - 1, III - 1, nbasis, nbasis), -val4da); - #endif } if (KKK != LLL) { QUICKDouble val4db = hybrid_coeff * DENSEKJB * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val4b = (QUICKULL) (fabs(val4db * OSCALE) + (QUICKDouble) 0.5); - if (val4db < (QUICKDouble) 0.0) val4b = 0ull - val4b; - QUICKADD(LOC2(obULL, LLL - 1, III - 1, nbasis, nbasis), 0ull - val4b); - #else atomicAdd(&LOC2(ob, LLL - 1, III - 1, nbasis, nbasis), -val4db); - #endif } // ATOMIC ADD VALUE 5 QUICKDouble val5da = hybrid_coeff * DENSELIA * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val5a = (QUICKULL) (fabs(val5da * OSCALE) + (QUICKDouble) 0.5); - if (val5da < (QUICKDouble) 0.0) val5a = 0ull - val5a; - - if ((III != JJJ && III < KKK) - || (III == JJJ && III == KKK && III < LLL) - || (III == KKK && III < JJJ && JJJ < LLL)) { - QUICKADD(LOC2(oULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, nbasis, nbasis), 0ull - val5a); - } - // ATOMIC ADD VALUE 5 - 2 - if (III != JJJ && JJJ == KKK) { - QUICKADD(LOC2(oULL, JJJ - 1, KKK - 1, nbasis, nbasis), 0ull - val5a); - } - #else if ((III != JJJ && III < KKK) || (III == JJJ && III == KKK && III < LLL) || (III == KKK && III < JJJ && JJJ < LLL)) { @@ -1413,23 +1334,8 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, if (III != JJJ && JJJ == KKK) { atomicAdd(&LOC2(o, JJJ - 1, KKK - 1, nbasis, nbasis), -val5da); } - #endif QUICKDouble val5db = hybrid_coeff * DENSELIB * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val5b = (QUICKULL) (fabs(val5db * OSCALE) + (QUICKDouble) 0.5); - if (val5db < (QUICKDouble) 0.0) val5b = 0ull - val5b; - - if ((III != JJJ && III < KKK) - || (III == JJJ && III == KKK && III < LLL) - || (III == KKK && III < JJJ && JJJ < LLL)) { - QUICKADD(LOC2(obULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, nbasis, nbasis), 0ull - val5b); - } - // ATOMIC ADD VALUE 5 - 2 - if (III != JJJ && JJJ == KKK) { - QUICKADD(LOC2(obULL, JJJ - 1, KKK - 1, nbasis, nbasis), 0ull - val5b); - } - #else if ((III != JJJ && III < KKK) || 
(III == JJJ && III == KKK && III < LLL) || (III == KKK && III < JJJ && JJJ < LLL)) { @@ -1439,54 +1345,29 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, if (III != JJJ && JJJ == KKK) { atomicAdd(&LOC2(ob, JJJ - 1, KKK - 1, nbasis, nbasis), -val5db); } - #endif // ATOMIC ADD VALUE 6 if (III != JJJ) { if (KKK != LLL) { QUICKDouble val6da = hybrid_coeff * DENSEKIA * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val6a = (QUICKULL) (fabs(val6da * OSCALE) + (QUICKDouble) 0.5); - if (val6da < (QUICKDouble) 0.0) val6a = 0ull - val6a; - - QUICKADD(LOC2(oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), 0ull - val6a); - - // ATOMIC ADD VALUE 6 - 2 - if (JJJ == LLL && III != KKK) { - QUICKADD(LOC2(oULL, LLL - 1, JJJ - 1, nbasis, nbasis), 0ull - val6a); - } - #else atomicAdd(&LOC2(o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), -val6da); // ATOMIC ADD VALUE 6 - 2 if (JJJ == LLL && III != KKK) { atomicAdd(&LOC2(o, LLL - 1, JJJ - 1, nbasis, nbasis), -val6da); } - #endif } } if (III != JJJ) { if (KKK != LLL) { QUICKDouble val6db = hybrid_coeff * DENSEKIB * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val6b = (QUICKULL) (fabs(val6db * OSCALE) + (QUICKDouble) 0.5); - if (val6db < (QUICKDouble) 0.0) val6b = 0ull - val6b; - - QUICKADD(LOC2(obULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), 0ull - val6b); - - // ATOMIC ADD VALUE 6 - 2 - if (JJJ == LLL && III != KKK) { - QUICKADD(LOC2(obULL, LLL - 1, JJJ - 1, nbasis, nbasis), 0ull - val6b); - } - #else atomicAdd(&LOC2(ob, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), -val6db); // ATOMIC ADD VALUE 6 - 2 if (JJJ == LLL && III != KKK) { atomicAdd(&LOC2(ob, LLL - 1, JJJ - 1, nbasis, nbasis), -val6db); } - #endif } } @@ -1505,13 +1386,7 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, } QUICKDouble val1d = _tmp * DENSELK * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5); - if (val1d < (QUICKDouble) 0.0) val1 = 0ull - val1; - QUICKADD(LOC2(oULL, JJJ - 1, III - 1, nbasis, nbasis), val1); - #else atomicAdd(&LOC2(o, JJJ - 1, III - 1, nbasis, nbasis), val1d); - #endif // ATOMIC ADD VALUE 2 if (LLL != JJJ || III != KKK) { @@ -1521,60 +1396,24 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, } QUICKDouble val2d = _tmp * DENSEJI * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5); - if (val2d < (QUICKDouble) 0.0) val2 = 0ull - val2; - QUICKADD(LOC2(oULL, LLL - 1, KKK - 1, nbasis, nbasis), val2); - #else atomicAdd(&LOC2(o, LLL - 1, KKK - 1, nbasis, nbasis), val2d); - #endif } // ATOMIC ADD VALUE 3 QUICKDouble val3d = hybrid_coeff * 0.5 * DENSELJ * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val3 = (QUICKULL) (fabs(val3d * OSCALE) + (QUICKDouble) 0.5); - if (III == KKK && III < JJJ && JJJ < LLL) { - val3 = (QUICKULL) (fabs(2.0 * val3d * OSCALE) + (QUICKDouble) 0.5); - } - if (DENSELJ * Y < (QUICKDouble) 0.0) val3 = 0ull - val3; - QUICKADD(LOC2(oULL, KKK - 1, III - 1, nbasis, nbasis), 0ull - val3); - #else if (III == KKK && III < JJJ && JJJ < LLL) { val3d *= 2.0; } atomicAdd(&LOC2(o, KKK - 1, III - 1, nbasis, nbasis), -val3d); - #endif // ATOMIC ADD VALUE 4 if (KKK != LLL) { QUICKDouble val4d = hybrid_coeff * 0.5 * DENSEKJ * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val4 = (QUICKULL) (fabs(val4d * OSCALE) + (QUICKDouble) 0.5); - if (val4d < (QUICKDouble) 0.0) val4 = 0ull - 
val4; - QUICKADD(LOC2(oULL, LLL - 1, III - 1, nbasis, nbasis), 0ull - val4); - #else atomicAdd(&LOC2(o, LLL - 1, III - 1, nbasis, nbasis), -val4d); - #endif } // ATOMIC ADD VALUE 5 QUICKDouble val5d = hybrid_coeff * 0.5 * DENSELI * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val5 = (QUICKULL) (fabs(val5d*OSCALE) + (QUICKDouble) 0.5); - if (val5d < (QUICKDouble) 0.0) val5 = 0ull - val5; - - if ((III != JJJ && III < KKK) - || (III == JJJ && III == KKK && III < LLL) - || (III == KKK && III < JJJ && JJJ < LLL)) { - QUICKADD(LOC2(oULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, nbasis, nbasis), 0ull - val5); - } - - // ATOMIC ADD VALUE 5 - 2 - if (III != JJJ && JJJ == KKK) { - QUICKADD(LOC2(oULL, JJJ - 1, KKK - 1, nbasis, nbasis), 0ull - val5); - } - #else if ((III != JJJ && III < KKK) || (III == JJJ && III == KKK && III < LLL) || (III == KKK && III < JJJ && JJJ < LLL)) { @@ -1586,30 +1425,17 @@ __device__ __forceinline__ void addint(QUICKDouble* o, QUICKDouble Y, atomicAdd(&LOC2(o, JJJ - 1, KKK - 1, nbasis, nbasis), -val5d); } - #endif // ATOMIC ADD VALUE 6 if (III != JJJ) { if (KKK != LLL) { QUICKDouble val6d = hybrid_coeff * 0.5 * DENSEKI * Y; - #if defined(USE_LEGACY_ATOMICS) - QUICKULL val6 = (QUICKULL) (fabs(val6d * OSCALE) + (QUICKDouble) 0.5); - if (val6d < (QUICKDouble) 0.0) val6 = 0ull - val6; - - QUICKADD(LOC2(oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), 0ull - val6); - - // ATOMIC ADD VALUE 6 - 2 - if (JJJ == LLL && III != KKK) { - QUICKADD(LOC2(oULL, LLL - 1, JJJ - 1, nbasis, nbasis), 0ull - val6); - } - #else atomicAdd(&LOC2(o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), -val6d); // ATOMIC ADD VALUE 6 - 2 if (JJJ == LLL && III != KKK) { atomicAdd(&LOC2(o, LLL - 1, JJJ - 1, nbasis, nbasis), -val6d); } - #endif } } #endif diff --git a/src/gpu/gpu_get2e_subs_grad.h b/src/gpu/gpu_get2e_subs_grad.h index 806d44657..c0bc85985 100644 --- a/src/gpu/gpu_get2e_subs_grad.h +++ b/src/gpu/gpu_get2e_subs_grad.h @@ -8,6 +8,7 @@ #include "gpu_common.h" + #undef STOREDIM #ifdef int_sp @@ -40,6 +41,7 @@ #define STOREDIM STOREDIM_L #endif + #ifdef OSHELL #ifdef int_sp __global__ void @@ -732,48 +734,21 @@ __device__ __forceinline__ void iclass_grad_spd } } - - -#ifdef USE_LEGACY_ATOMICS - GRADADD(devSim.gradULL[AStart], AGradx); - GRADADD(devSim.gradULL[AStart + 1], AGrady); - GRADADD(devSim.gradULL[AStart + 2], AGradz); - - - GRADADD(devSim.gradULL[BStart], BGradx); - GRADADD(devSim.gradULL[BStart + 1], BGrady); - GRADADD(devSim.gradULL[BStart + 2], BGradz); - - - GRADADD(devSim.gradULL[CStart], CGradx); - GRADADD(devSim.gradULL[CStart + 1], CGrady); - GRADADD(devSim.gradULL[CStart + 2], CGradz); - - - GRADADD(devSim.gradULL[DStart], (-AGradx-BGradx-CGradx)); - GRADADD(devSim.gradULL[DStart + 1], (-AGrady-BGrady-CGrady)); - GRADADD(devSim.gradULL[DStart + 2], (-AGradz-BGradz-CGradz)); -#else atomicAdd(&devSim.grad[AStart], AGradx); atomicAdd(&devSim.grad[AStart + 1], AGrady); atomicAdd(&devSim.grad[AStart + 2], AGradz); - atomicAdd(&devSim.grad[BStart], BGradx); atomicAdd(&devSim.grad[BStart + 1], BGrady); atomicAdd(&devSim.grad[BStart + 2], BGradz); - atomicAdd(&devSim.grad[CStart], CGradx); atomicAdd(&devSim.grad[CStart + 1], CGrady); atomicAdd(&devSim.grad[CStart + 2], CGradz); - atomicAdd(&devSim.grad[DStart], (-AGradx-BGradx-CGradx)); atomicAdd(&devSim.grad[DStart + 1], (-AGrady-BGrady-CGrady)); atomicAdd(&devSim.grad[DStart + 2], (-AGradz-BGradz-CGradz)); -#endif - return; } #else @@ -1643,47 +1618,21 @@ QUICKDouble* YVerticalTemp, 
QUICKDouble* store, QUICKDouble* store2, QUICKDouble //printf("FILE: %s, LINE: %d, FUNCTION: %s, devSim.hyb_coeff \n", __FILE__, __LINE__, __func__); #endif -#ifdef USE_LEGACY_ATOMICS - GRADADD(devSim.gradULL[AStart], AGradx); - GRADADD(devSim.gradULL[AStart + 1], AGrady); - GRADADD(devSim.gradULL[AStart + 2], AGradz); - - - GRADADD(devSim.gradULL[BStart], BGradx); - GRADADD(devSim.gradULL[BStart + 1], BGrady); - GRADADD(devSim.gradULL[BStart + 2], BGradz); - - - GRADADD(devSim.gradULL[CStart], CGradx); - GRADADD(devSim.gradULL[CStart + 1], CGrady); - GRADADD(devSim.gradULL[CStart + 2], CGradz); - - - GRADADD(devSim.gradULL[DStart], (-AGradx-BGradx-CGradx)); - GRADADD(devSim.gradULL[DStart + 1], (-AGrady-BGrady-CGrady)); - GRADADD(devSim.gradULL[DStart + 2], (-AGradz-BGradz-CGradz)); -#else atomicAdd(&devSim.grad[AStart], AGradx); atomicAdd(&devSim.grad[AStart + 1], AGrady); atomicAdd(&devSim.grad[AStart + 2], AGradz); - atomicAdd(&devSim.grad[BStart], BGradx); atomicAdd(&devSim.grad[BStart + 1], BGrady); atomicAdd(&devSim.grad[BStart + 2], BGradz); - atomicAdd(&devSim.grad[CStart], CGradx); atomicAdd(&devSim.grad[CStart + 1], CGrady); atomicAdd(&devSim.grad[CStart + 2], CGradz); - atomicAdd(&devSim.grad[DStart], (-AGradx-BGradx-CGradx)); atomicAdd(&devSim.grad[DStart + 1], (-AGrady-BGrady-CGrady)); atomicAdd(&devSim.grad[DStart + 2], (-AGradz-BGradz-CGradz)); -#endif - - return; } #endif @@ -2019,31 +1968,20 @@ __device__ __forceinline__ void hrrwholegrad_sp(QUICKDouble* Yaax, QUICKDouble* } } - *Yaax = *Yaax * constant; *Yaay = *Yaay * constant; *Yaaz = *Yaaz * constant; - *Ybbx = *Ybbx * constant; *Ybby = *Ybby * constant; *Ybbz = *Ybbz * constant; - *Yccx = *Yccx * constant; *Yccy = *Yccy * constant; *Yccz = *Yccz * constant; - - - - return; - } - - - #undef STOREDIM #define STOREDIM STOREDIM_S @@ -2372,27 +2310,20 @@ __device__ __forceinline__ void hrrwholegrad(QUICKDouble* Yaax, QUICKDouble* Yaa } } - *Yaax = *Yaax * constant; *Yaay = *Yaay * constant; *Yaaz = *Yaaz * constant; - *Ybbx = *Ybbx * constant; *Ybby = *Ybby * constant; *Ybbz = *Ybbz * constant; - *Yccx = *Yccx * constant; *Yccy = *Yccy * constant; *Yccz = *Yccz * constant; - - - - return; - } + #undef STOREDIM #define STOREDIM STOREDIM_XL @@ -2725,25 +2656,17 @@ __device__ __forceinline__ void hrrwholegrad2(QUICKDouble* Yaax, QUICKDouble* Ya } } - *Yaax = *Yaax * constant; *Yaay = *Yaay * constant; *Yaaz = *Yaaz * constant; - *Ybbx = *Ybbx * constant; *Ybby = *Ybby * constant; *Ybbz = *Ybbz * constant; - *Yccx = *Yccx * constant; *Yccy = *Yccy * constant; *Yccz = *Yccz * constant; - - - - return; - } #undef STOREDIM @@ -2840,20 +2763,13 @@ __device__ __forceinline__ void hrrwholegrad2_1(QUICKDouble* Yaax, QUICKDouble* *Yaay = *Yaay * constant; *Yaaz = *Yaaz * constant; - *Ybbx = *Ybbx * constant; *Ybby = *Ybby * constant; *Ybbz = *Ybbz * constant; - *Yccx = *Yccx * constant; *Yccy = *Yccy * constant; *Yccz = *Yccz * constant; - - - - return; - } @@ -2964,37 +2880,26 @@ __device__ __forceinline__ void hrrwholegrad2_2(QUICKDouble* Yaax, QUICKDouble* } } - *Yaax = *Yaax * constant; *Yaay = *Yaay * constant; *Yaaz = *Yaaz * constant; - *Ybbx = *Ybbx * constant; *Ybby = *Ybby * constant; *Ybbz = *Ybbz * constant; - *Yccx = *Yccx * constant; *Yccy = *Yccy * constant; *Yccz = *Yccz * constant; - - - - return; - } - #endif + #ifdef int_sp -#ifndef sp_grad_fmt -#define sp_grad_fmt -#undef FMT_NAME -#define FMT_NAME FmT_grad_sp -#include "gpu_fmt.h" + #ifndef sp_grad_fmt + #define sp_grad_fmt + #undef FMT_NAME + #define FMT_NAME 
FmT_grad_sp + #include "gpu_fmt.h" + #endif #endif -#endif - - - diff --git a/src/gpu/gpu_lri_subs.h b/src/gpu/gpu_lri_subs.h index 9409cad75..c413ce203 100644 --- a/src/gpu/gpu_lri_subs.h +++ b/src/gpu/gpu_lri_subs.h @@ -498,16 +498,10 @@ __device__ __forceinline__ void iclass_lri_spdf2 __device__ __forceinline__ void addint_lri(QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis) { - -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs(Y*OSCALE) + (QUICKDouble)0.5); - if ( Y < (QUICKDouble)0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim.oULL, JJJ-1, III-1, nbasis, nbasis), val1); -#else atomicAdd(&LOC2(devSim.o, JJJ-1, III-1, nbasis, nbasis), Y); -#endif } + #ifndef old_fmt #define old_fmt diff --git a/src/gpu/gpu_lri_subs_grad.h b/src/gpu/gpu_lri_subs_grad.h index 84aa509cc..508137152 100644 --- a/src/gpu/gpu_lri_subs_grad.h +++ b/src/gpu/gpu_lri_subs_grad.h @@ -421,33 +421,10 @@ __device__ __forceinline__ void iclass_lri_grad } } - -#ifdef USE_LEGACY_ATOMICS - GRADADD(devSim.gradULL[AStart], AGradx); - GRADADD(devSim.gradULL[AStart + 1], AGrady); - GRADADD(devSim.gradULL[AStart + 2], AGradz); - - - GRADADD(devSim.gradULL[BStart], BGradx); - GRADADD(devSim.gradULL[BStart + 1], BGrady); - GRADADD(devSim.gradULL[BStart + 2], BGradz); - - if(iatom < devSim.natom){ - GRADADD(devSim.gradULL[CStart], (-AGradx-BGradx)); - GRADADD(devSim.gradULL[CStart + 1], (-AGrady-BGrady)); - GRADADD(devSim.gradULL[CStart + 2], (-AGradz-BGradz)); - }else{ - CStart = (iatom - devSim.natom) * 3; - GRADADD(devSim.ptchg_gradULL[CStart], (-AGradx-BGradx)); - GRADADD(devSim.ptchg_gradULL[CStart + 1], (-AGrady-BGrady)); - GRADADD(devSim.ptchg_gradULL[CStart + 2], (-AGradz-BGradz)); - } -#else atomicAdd(&devSim.grad[AStart], AGradx); atomicAdd(&devSim.grad[AStart + 1], AGrady); atomicAdd(&devSim.grad[AStart + 2], AGradz); - atomicAdd(&devSim.grad[BStart], BGradx); atomicAdd(&devSim.grad[BStart + 1], BGrady); atomicAdd(&devSim.grad[BStart + 2], BGradz); @@ -462,9 +439,6 @@ __device__ __forceinline__ void iclass_lri_grad atomicAdd(&devSim.ptchg_grad[CStart + 1], (-AGrady-BGrady)); atomicAdd(&devSim.ptchg_grad[CStart + 2], (-AGradz-BGradz)); } -#endif - - return; } #else @@ -726,32 +700,10 @@ QUICKDouble* YVerticalTemp, QUICKDouble* store, QUICKDouble* store2, QUICKDouble //printf("FILE: %s, LINE: %d, FUNCTION: %s, devSim.hyb_coeff \n", __FILE__, __LINE__, __func__); #endif -#ifdef USE_LEGACY_ATOMICS - GRADADD(devSim.gradULL[AStart], AGradx); - GRADADD(devSim.gradULL[AStart + 1], AGrady); - GRADADD(devSim.gradULL[AStart + 2], AGradz); - - - GRADADD(devSim.gradULL[BStart], BGradx); - GRADADD(devSim.gradULL[BStart + 1], BGrady); - GRADADD(devSim.gradULL[BStart + 2], BGradz); - - if(iatom < devSim.natom){ - GRADADD(devSim.gradULL[CStart], (-AGradx-BGradx)); - GRADADD(devSim.gradULL[CStart + 1], (-AGrady-BGrady)); - GRADADD(devSim.gradULL[CStart + 2], (-AGradz-BGradz)); - }else{ - CStart = (iatom - devSim.natom) * 3; - GRADADD(devSim.ptchg_gradULL[CStart], (-AGradx-BGradx)); - GRADADD(devSim.ptchg_gradULL[CStart + 1], (-AGrady-BGrady)); - GRADADD(devSim.ptchg_gradULL[CStart + 2], (-AGradz-BGradz)); - } -#else atomicAdd(&devSim.grad[AStart], AGradx); atomicAdd(&devSim.grad[AStart + 1], AGrady); atomicAdd(&devSim.grad[AStart + 2], AGradz); - atomicAdd(&devSim.grad[BStart], BGradx); atomicAdd(&devSim.grad[BStart + 1], BGrady); atomicAdd(&devSim.grad[BStart + 2], BGradz); @@ -766,9 +718,6 @@ QUICKDouble* YVerticalTemp, QUICKDouble* store, QUICKDouble* store2, 
QUICKDouble atomicAdd(&devSim.ptchg_grad[CStart + 1], (-AGrady-BGrady)); atomicAdd(&devSim.ptchg_grad[CStart + 2], (-AGradz-BGradz)); } -#endif - - return; } #endif diff --git a/src/gpu/gpu_oei_grad.h b/src/gpu/gpu_oei_grad.h index a5247fec5..ab18adec7 100644 --- a/src/gpu/gpu_oei_grad.h +++ b/src/gpu/gpu_oei_grad.h @@ -154,28 +154,6 @@ __device__ void add_oei_grad(unsigned int I, unsigned int J, unsigned int II, un int BStart = (devSim.katom[JJ]-1) * 3; int CStart = iatom * 3; -#ifdef USE_LEGACY_ATOMICS - GRADADD(devSim.gradULL[AStart], AGradx); - GRADADD(devSim.gradULL[AStart + 1], AGrady); - GRADADD(devSim.gradULL[AStart + 2], AGradz); - - GRADADD(devSim.gradULL[BStart], BGradx); - GRADADD(devSim.gradULL[BStart + 1], BGrady); - GRADADD(devSim.gradULL[BStart + 2], BGradz); - -if(iatom < devSim.natom){ - GRADADD(devSim.gradULL[CStart], (-AGradx-BGradx)); - GRADADD(devSim.gradULL[CStart + 1], (-AGrady-BGrady)); - GRADADD(devSim.gradULL[CStart + 2], (-AGradz-BGradz)); -}else{ - CStart = (iatom-devSim.natom) * 3; - GRADADD(devSim.ptchg_gradULL[CStart], (-AGradx-BGradx)); - GRADADD(devSim.ptchg_gradULL[CStart + 1], (-AGrady-BGrady)); - GRADADD(devSim.ptchg_gradULL[CStart + 2], (-AGradz-BGradz)); -} - -#else - atomicAdd(&devSim.grad[AStart], AGradx); atomicAdd(&devSim.grad[AStart + 1], AGrady); atomicAdd(&devSim.grad[AStart + 2], AGradz); @@ -184,19 +162,16 @@ if(iatom < devSim.natom){ atomicAdd(&devSim.grad[BStart + 1], BGrady); atomicAdd(&devSim.grad[BStart + 2], BGradz); -if(iatom < devSim.natom){ - atomicAdd(&devSim.grad[CStart], (-AGradx-BGradx)); - atomicAdd(&devSim.grad[CStart + 1], (-AGrady-BGrady)); - atomicAdd(&devSim.grad[CStart + 2], (-AGradz-BGradz)); -}else{ - CStart = (iatom-devSim.natom) * 3; - atomicAdd(&devSim.ptchg_grad[CStart], (-AGradx-BGradx)); - atomicAdd(&devSim.ptchg_grad[CStart + 1], (-AGrady-BGrady)); - atomicAdd(&devSim.ptchg_grad[CStart + 2], (-AGradz-BGradz)); -} - -#endif - + if(iatom < devSim.natom){ + atomicAdd(&devSim.grad[CStart], (-AGradx-BGradx)); + atomicAdd(&devSim.grad[CStart + 1], (-AGrady-BGrady)); + atomicAdd(&devSim.grad[CStart + 2], (-AGradz-BGradz)); + }else{ + CStart = (iatom-devSim.natom) * 3; + atomicAdd(&devSim.ptchg_grad[CStart], (-AGradx-BGradx)); + atomicAdd(&devSim.ptchg_grad[CStart + 1], (-AGrady-BGrady)); + atomicAdd(&devSim.ptchg_grad[CStart + 2], (-AGradz-BGradz)); + } } diff --git a/src/gpu/hip/gpu.cu b/src/gpu/hip/gpu.cu index 7b0d2b9a9..638a24737 100644 --- a/src/gpu/hip/gpu.cu +++ b/src/gpu/hip/gpu.cu @@ -1378,36 +1378,8 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou gpu->gpu_calculated->o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); gpu->gpu_calculated->dense = new gpu_buffer_type(dense, gpu->nbasis, gpu->nbasis); -#ifdef USE_LEGACY_ATOMICS - gpu->gpu_calculated->o->DeleteGPU(); - gpu->gpu_calculated->oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - gpu->gpu_calculated->oULL->Upload(); - gpu->gpu_sim.oULL = gpu->gpu_calculated->oULL->_devData; -#else gpu->gpu_calculated->o->Upload(); gpu->gpu_sim.o = gpu->gpu_calculated->o->_devData; -#endif - - /* - oULL is the unsigned long long int type of O matrix. The reason to do so is because - Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type, - an comprimise way is to multiple a very large number (OSCALE), first and divided it - after atomic operator. 
- */
- /*
- for (int i = 0; i<gpu->nbasis; i++) {
- for (int j = 0; j<gpu->nbasis; j++) {
- QUICKULL valUII = (QUICKULL) (fabs ( LOC2( gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis)*OSCALE + (QUICKDouble)0.5));
-
- if (LOC2( gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis)<(QUICKDouble)0.0)
- {
- valUII = 0ull - valUII;
- }
-
- LOC2( gpu->gpu_calculated->oULL->_hostData, i, j, gpu->nbasis, gpu->nbasis) = valUII;
- }
- }
- */

 gpu->gpu_calculated->dense->Upload();
 gpu->gpu_sim.dense = gpu->gpu_calculated->dense->_devData;
@@ -1443,34 +1415,8 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb
 gpu->gpu_calculated->ob = new gpu_buffer_type<QUICKDouble>(gpu->nbasis, gpu->nbasis);

-#ifdef USE_LEGACY_ATOMICS
- gpu->gpu_calculated->ob->DeleteGPU();
- gpu->gpu_calculated->obULL = new gpu_buffer_type<QUICKULL>(gpu->nbasis, gpu->nbasis);
- gpu->gpu_calculated->obULL->Upload();
- gpu->gpu_sim.obULL = gpu->gpu_calculated->obULL->_devData;
-#else
 gpu->gpu_calculated->ob->Upload();
 gpu->gpu_sim.ob = gpu->gpu_calculated->ob->_devData;
-#endif
-
- /*
- obULL is the unsigned long long int type of Ob matrix. The reason to do so is because
- Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type,
- an comprimise way is to multiple a very large number (OSCALE), first and divided it
- after atomic operator.
- */
- /*for (int i = 0; i<gpu->nbasis; i++) {
- for (int j = 0; j<gpu->nbasis; j++) {
- QUICKULL valUII = (QUICKULL) (fabs ( LOC2( gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis)*OSCALE + (QUICKDouble)0.5));
-
- if (LOC2( gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis)<(QUICKDouble)0.0)
- {
- valUII = 0ull - valUII;
- }
-
- LOC2( gpu->gpu_calculated->obULL->_hostData, i, j, gpu->nbasis, gpu->nbasis) = valUII;
- }
- }*/

 gpu_upload_beta_density_matrix_(denseb);

@@ -1883,12 +1829,6 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff)

 gpu->grad = new gpu_buffer_type<QUICKDouble>(3 * gpu->natom);

-#ifdef USE_LEGACY_ATOMICS
- gpu->gradULL = new gpu_buffer_type<QUICKULL>(3 * gpu->natom);
- gpu->gpu_sim.gradULL = gpu->gradULL->_devData;
- gpu->gradULL->Upload();
-#endif
-
 //gpu->grad->DeleteGPU();
 gpu->gpu_sim.grad = gpu->grad->_devData;
 gpu->grad->Upload();
@@ -2910,26 +2850,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName)

 PRINTDEBUG("COMPLETE KERNEL")

-#ifdef USE_LEGACY_ATOMICS
- gpu->gpu_calculated->oULL->Download();
-
- for (int i = 0; i< gpu->nbasis; i++) {
- for (int j = i; j< gpu->nbasis; j++) {
- QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis);
- QUICKDouble valDB;
-
- if (valULL >= 0x8000000000000000ull) {
- valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull);
- }
- else
- {
- valDB = (QUICKDouble) valULL;
- }
- LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE;
- LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE;
- }
- }
-#else
 gpu->gpu_calculated->o->Download();

 for (int i = 0; i< gpu->nbasis; i++) {
@@ -2938,7 +2858,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName)
 = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis);
 }
 }
-#endif

 gpu->gpu_calculated->o->Download(o);
#ifdef DEBUG
@@ -2960,12 +2879,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName)
 delete gpu->gpu_cutoff->YCutoff;
 delete gpu->gpu_cutoff->cutPrim;

-#ifdef USE_LEGACY_ATOMICS
- delete
gpu->gpu_calculated->oULL; -#endif - PRINTDEBUG("COMPLETE RUNNING ADDINT") - } #endif diff --git a/src/gpu/hip/gpu.h b/src/gpu/hip/gpu.h index ac3cb44b8..145e3e91a 100644 --- a/src/gpu/hip/gpu.h +++ b/src/gpu/hip/gpu.h @@ -294,13 +294,8 @@ void bind_eri_texture(_gpu_type gpu); void unbind_eri_texture(); //__device__ void gpu_shell(unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL); -#ifdef USE_LEGACY_ATOMICS -__device__ void addint(QUICKULL* oULL, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); -__device__ __forceinline__ void addint_oshell(QUICKULL* oULL, QUICKULL* obULL,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis); -#else __device__ void addint(QUICKDouble* o, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); __device__ __forceinline__ void addint_oshell(QUICKDouble* o, QUICKDouble* ob,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis); -#endif __device__ __forceinline__ void addint_lri(QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); __device__ void FmT_sp(const int MaxM, const QUICKDouble X, QUICKDouble* vals); __device__ void FmT_spd(const int MaxM, const QUICKDouble X, QUICKDouble* vals); @@ -622,13 +617,8 @@ __device__ int lefthrr_lri23(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz, int KLMNAx, int KLMNAy, int KLMNAz, int KLMNBx, int KLMNBy, int KLMNBz, int IJTYPE,QUICKDouble* coefAngularL, unsigned char* angularL); -#ifdef USE_LEGACY_ATOMICS -__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKULL* smemGrad, int iparent, int gid); -__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKULL* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom); -#else __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid); __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom); -#endif __device__ QUICKDouble get_unnormalized_weight(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int iatm); __device__ QUICKDouble SSW( QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int atm); diff --git a/src/gpu/hip/gpu_MP2.cu b/src/gpu/hip/gpu_MP2.cu index 8665d575a..fc5d28c29 100644 --- a/src/gpu/hip/gpu_MP2.cu +++ b/src/gpu/hip/gpu_MP2.cu @@ -379,7 +379,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5); if (val1d < (QUICKDouble) 0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); + atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); // } // ATOMIC ADD VALUE 2 @@ -394,7 +394,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5); if (val2d < 
(QUICKDouble) 0.0) val2 = 0ull - val2; - QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); + atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); // } } @@ -407,7 +407,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned } if (DENSELJ * Y < (QUICKDouble) 0.0) val3 = 0ull - val3; - QUICKADD(LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3); + atomicAdd(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3); //} // ATOMIC ADD VALUE 4 @@ -416,7 +416,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned // if (abs(val4d) > devSim_MP2.integralCutoff) { QUICKULL val4 = (QUICKULL) (fabs(val4d * OSCALE) + (QUICKDouble) 0.5); if (val4d < (QUICKDouble) 0.0) val4 = 0ull - val4; - QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4); + atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4); //} } @@ -429,13 +429,13 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned if ((III != JJJ && III < KKK) || (III == JJJ && III == KKK && III < LLL) || (III == KKK && III < JJJ && JJJ < LLL)) { - QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); } // ATOMIC ADD VALUE 5 - 2 if (III != JJJ && JJJ == KKK) { - QUICKADD(LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); } //} @@ -450,12 +450,12 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned if (val6d < (QUICKDouble) 0.0) val6 = 0ull - val6; - QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); // ATOMIC ADD VALUE 6 - 2 if (JJJ == LLL && III != KKK) { - QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, + atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); } } diff --git a/src/gpu/hip/gpu_cew_quad.h b/src/gpu/hip/gpu_cew_quad.h index 4464730fc..3a4e0caf0 100644 --- a/src/gpu/hip/gpu_cew_quad.h +++ b/src/gpu/hip/gpu_cew_quad.h @@ -1,53 +1,48 @@ #include "hip/hip_runtime.h" /* - !---------------------------------------------------------------------! - ! Written by Madu Manathunga on 09/29/2021 ! - ! ! - ! Copyright (C) 2020-2021 Merz lab ! - ! Copyright (C) 2020-2021 Götz lab ! - ! ! - ! This Source Code Form is subject to the terms of the Mozilla Public ! - ! License, v. 2.0. If a copy of the MPL was not distributed with this ! - ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! - !_____________________________________________________________________! - - !---------------------------------------------------------------------! - ! This source file contains preprocessable functions required for ! - ! QUICK GPU version. ! - !---------------------------------------------------------------------! -*/ + !---------------------------------------------------------------------! + ! Written by Madu Manathunga on 09/29/2021 ! + ! ! + ! Copyright (C) 2020-2021 Merz lab ! + ! Copyright (C) 2020-2021 Götz lab ! + ! ! + ! 
This Source Code Form is subject to the terms of the Mozilla Public !
+ ! License, v. 2.0. If a copy of the MPL was not distributed with this !
+ ! file, You can obtain one at http://mozilla.org/MPL/2.0/. !
+ !_____________________________________________________________________!
+
+ !---------------------------------------------------------------------!
+ ! This source file contains preprocessable functions required for !
+ ! QUICK GPU version. !
+ !---------------------------------------------------------------------!
+ */

 #ifdef CEW
 #include "iface.hpp"

-#ifndef OSHELL
-void getcew_quad(_gpu_type gpu){
+#ifndef OSHELL
+void getcew_quad(_gpu_type gpu) {
 QUICK_SAFE_CALL((getcew_quad_kernel<<< gpu -> blocks, gpu -> xc_threadsPerBlock>>>()));

 hipDeviceSynchronize();
 }

-void getcew_quad_grad(_gpu_type gpu){
-
- if(gpu -> gpu_sim.is_oshell == true){
-
+void getcew_quad_grad(_gpu_type gpu) {
+ if(gpu -> gpu_sim.is_oshell == true) {
 QUICK_SAFE_CALL((get_oshell_density_kernel<<<gpu->blocks, gpu->xc_threadsPerBlock>>>()));

 hipDeviceSynchronize();

 QUICK_SAFE_CALL((oshell_getcew_quad_grad_kernel<<< gpu -> blocks, gpu -> xc_threadsPerBlock, gpu -> gpu_xcq -> smem_size>>>()));
-
- }else{
-
+ } else {
 QUICK_SAFE_CALL((get_cshell_density_kernel<<<gpu->blocks, gpu->xc_threadsPerBlock>>>()));

 hipDeviceSynchronize();

 QUICK_SAFE_CALL((cshell_getcew_quad_grad_kernel<<< gpu -> blocks, gpu -> xc_threadsPerBlock, gpu -> gpu_xcq -> smem_size>>>()));
 //QUICK_SAFE_CALL((cshell_getcew_quad_grad_kernel<<< 1,1, gpu -> gpu_xcq -> smem_size>>>()));
-
 }

 hipDeviceSynchronize();
@@ -63,27 +58,25 @@ void getcew_quad_grad(_gpu_type gpu){
 hipDeviceSynchronize();

 gpu_delete_sswgrad_vars();
-
 }

-void get_cew_accdens(_gpu_type gpu){
-
- QUICKDouble *gridpt = new QUICKDouble[3];
- QUICKDouble *cewGrad= new QUICKDouble[3];
+void get_cew_accdens(_gpu_type gpu) {
+ QUICKDouble *gridpt = new QUICKDouble[3];
+ QUICKDouble *cewGrad= new QUICKDouble[3];

 gpu -> gpu_xcq -> densa -> Download();
- if(gpu -> gpu_sim.is_oshell == true) gpu -> gpu_xcq -> densb -> Download();
-
+ if(gpu -> gpu_sim.is_oshell == true)
+ gpu -> gpu_xcq -> densb -> Download();

- for(int i=0; i< gpu -> gpu_xcq -> npoints;i++){
-
+ for(int i=0; i< gpu -> gpu_xcq -> npoints;i++) {
 QUICKDouble weight = gpu -> gpu_xcq -> weight -> _hostData[i];
 QUICKDouble densea = gpu -> gpu_xcq -> densa -> _hostData[i];
 QUICKDouble denseb = densea;

- if(gpu -> gpu_sim.is_oshell == true) denseb = gpu -> gpu_xcq -> densb -> _hostData[i];
+ if(gpu -> gpu_sim.is_oshell == true)
+ denseb = gpu -> gpu_xcq -> densb -> _hostData[i];

 gridpt[0] = gpu -> gpu_xcq -> gridx -> _hostData[i];
 gridpt[1] = gpu -> gpu_xcq -> gridy -> _hostData[i];
@@ -91,228 +84,178 @@ void get_cew_accdens(_gpu_type gpu){
 const QUICKDouble charge_density = -weight * (densea+denseb);

- for(int j=0; j<3; j++) cewGrad[j]=0.0;
+ for(int j=0; j<3; j++)
+ cewGrad[j]=0.0;

 QUICKDouble const *cnst_gridpt = gridpt;

- // this function comes from cew library in amber
+ // this function comes from cew library in amber
 cew_accdensatpt_(cnst_gridpt, &charge_density, cewGrad);

-//printf("cew_accdensatpt %f %f %f %f %f %f %f \n", gridpt[0], gridpt[1], gridpt[2], charge_density\
-,cewGrad[0], cewGrad[1], cewGrad[2]);
+ //printf("cew_accdensatpt %f %f %f %f %f %f %f \n", gridpt[0], gridpt[1], gridpt[2], charge_density\
+ ,cewGrad[0], cewGrad[1], cewGrad[2]);

 int Istart = (gpu -> gpu_xcq -> gatm -> _hostData[i]-1) * 3;

 for(int j=0; j<3; j++)
-#ifdef USE_LEGACY_ATOMICS
- gpu->grad->_hostData[Istart+j] += cewGrad[j];
-#else
 gpu ->
cew_grad->_hostData[Istart+j] += cewGrad[j]; -#endif - } delete gridpt; delete cewGrad; - } - - __global__ void getcew_quad_kernel() { - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { - - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; - for (int i = bfloc_st; i< bfloc_end; ++i) { + QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; + for (int i = bfloc_st; i< bfloc_end; ++i) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { - for (int j = bfloc_st; j < bfloc_end; j++) { + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - QUICKDouble _tmp = phi * phi2 * dfdr * weight; + QUICKDouble _tmp = phi * phi2 * dfdr * weight; -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), val1); -#else - atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); -#endif + atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); + } + } } - } } - } - + } #endif + #ifdef OSHELL __global__ void oshell_getcew_quad_grad_kernel() #else __global__ void cshell_getcew_quad_grad_kernel() #endif { + //declare smem grad vector + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; -#ifdef USE_LEGACY_ATOMICS - //declare smem grad vector - extern __shared__ QUICKULL smem_buffer[]; - QUICKULL* smemGrad=(QUICKULL*)smem_buffer; + // initialize smem grad + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + smemGrad[i]=0.0; - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - 
smemGrad[i]=0ull; -#else - //declare smem grad vector - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + __syncthreads(); - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; -#endif + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - __syncthreads(); + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { - - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - - - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; #ifdef OSHELL - QUICKDouble densitysum = devSim_dft.densa[gid]+devSim_dft.densb[gid]; + QUICKDouble densitysum = devSim_dft.densa[gid]+devSim_dft.densb[gid]; #else - QUICKDouble densitysum = 2*devSim_dft.densa[gid]; + QUICKDouble densitysum = 2*devSim_dft.densa[gid]; #endif - QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; - - if(densitysum >devSim_dft.DMCutoff){ - - QUICKDouble _tmp = ((QUICKDouble) (dfdr * densitysum)); + QUICKDouble dfdr = devSim_dft.cew_vrecip[gid]; - devSim_dft.exc[gid] = _tmp; + if(densitysum >devSim_dft.DMCutoff) { + QUICKDouble _tmp = ((QUICKDouble) (dfdr * densitysum)); - QUICKDouble sumGradx = 0.0; - QUICKDouble sumGrady = 0.0; - QUICKDouble sumGradz = 0.0; + devSim_dft.exc[gid] = _tmp; - for (int i = bfloc_st; i< bfloc_end; i++) { - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + QUICKDouble sumGradx = 0.0; + QUICKDouble sumGrady = 0.0; + QUICKDouble sumGradz = 0.0; - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { + for (int i = bfloc_st; i< bfloc_end; i++) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - //QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.DMCutoff ) { + //QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; - //pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + //pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - int Istart = (devSim_dft.ncenter[ibas]-1) * 3; + int Istart = (devSim_dft.ncenter[ibas]-1) * 3; - for (int j = bfloc_st; j < bfloc_end; j++) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, 
devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - - QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #ifdef OSHELL - denseij += (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); -#endif - - QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2); - QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2); - QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2); -//printf("test quad grad %f %f %f %f %f %f %f %f %f %f\n", gridx, gridy, gridz, denseij, weight, dfdr, dphidx, dphidy, dphidz, phi2); - -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], Gradx); - GRADADD(smemGrad[Istart+1], Grady); - GRADADD(smemGrad[Istart+2], Gradz); -#else - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); + denseij += (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #endif - sumGradx += Gradx; - sumGrady += Grady; - sumGradz += Gradz; - } + QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2); + QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2); + QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2); + //printf("test quad grad %f %f %f %f %f %f %f %f %f %f\n", gridx, gridy, gridz, denseij, weight, dfdr, dphidx, dphidy, dphidz, phi2); + + atomicAdd(&smemGrad[Istart], Gradx); + atomicAdd(&smemGrad[Istart+1], Grady); + atomicAdd(&smemGrad[Istart+2], Gradz); + sumGradx += Gradx; + sumGrady += Grady; + sumGradz += Gradz; + } + } + } + + int Istart = (devSim_dft.gatm[gid]-1)*3; + + atomicAdd(&smemGrad[Istart], -sumGradx); + atomicAdd(&smemGrad[Istart+1], -sumGrady); + atomicAdd(&smemGrad[Istart+2], -sumGradz); } - } - - int Istart = (devSim_dft.gatm[gid]-1)*3; -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], -sumGradx); - GRADADD(smemGrad[Istart+1], -sumGrady); - GRADADD(smemGrad[Istart+2], -sumGradz); -#else - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); -#endif - - } - - //Set weights for sswder calculation - if(densitysum < devSim_dft.DMCutoff){ + //Set weights for sswder calculation + if(densitysum < devSim_dft.DMCutoff) { devSim_dft.dweight_ssd[gid] = 0; - } + } - if(devSim_dft.sswt[gid] == 1){ + if(devSim_dft.sswt[gid] == 1) { devSim_dft.dweight_ssd[gid] = 0; + } } - - } - - __syncthreads(); - // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) -#ifdef USE_LEGACY_ATOMICS - atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]); -#else - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); -#endif + __syncthreads(); - __syncthreads(); + // update gmem grad vector + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + __syncthreads(); } #endif diff --git a/src/gpu/hip/gpu_get2e.cu b/src/gpu/hip/gpu_get2e.cu index 3a7d6e08f..b66fd8a05 100644 --- a/src/gpu/hip/gpu_get2e.cu +++ b/src/gpu/hip/gpu_get2e.cu @@ -803,11 +803,7 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) getAddInt_kerne // } else if( devSim.method == LIBXC) { // hybrid_coeff = devSim.hyb_coeff; // } -#ifdef 
USE_LEGACY_ATOMICS - addint(devSim.oULL, a[k].value, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); -#else addint(devSim.o, a[k].value, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); -#endif } } diff --git a/src/gpu/hip/gpu_get2e_getxc_drivers.h b/src/gpu/hip/gpu_get2e_getxc_drivers.h index 0d7668b44..7b2ea34b2 100644 --- a/src/gpu/hip/gpu_get2e_getxc_drivers.h +++ b/src/gpu/hip/gpu_get2e_getxc_drivers.h @@ -1,22 +1,22 @@ /* - !---------------------------------------------------------------------! - ! Created by Madu Manathunga on 04/07/2021 ! - ! ! - ! Previous contributors: Yipu Miao ! - ! ! - ! Copyright (C) 2020-2021 Merz lab ! - ! Copyright (C) 2020-2021 Götz lab ! - ! ! - ! This Source Code Form is subject to the terms of the Mozilla Public ! - ! License, v. 2.0. If a copy of the MPL was not distributed with this ! - ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! - !_____________________________________________________________________! - - !---------------------------------------------------------------------! - ! This source file contains preprocessable get2e and getxc C functions! - ! that can be called from f90 subroutines. ! - !---------------------------------------------------------------------! -*/ + !---------------------------------------------------------------------! + ! Created by Madu Manathunga on 04/07/2021 ! + ! ! + ! Previous contributors: Yipu Miao ! + ! ! + ! Copyright (C) 2020-2021 Merz lab ! + ! Copyright (C) 2020-2021 Götz lab ! + ! ! + ! This Source Code Form is subject to the terms of the Mozilla Public ! + ! License, v. 2.0. If a copy of the MPL was not distributed with this ! + ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! + !_____________________________________________________________________! + + !---------------------------------------------------------------------! + ! This source file contains preprocessable get2e and getxc C functions! + ! that can be called from f90 subroutines. ! + !---------------------------------------------------------------------! 
+ */ //----------------------------------------------- // core part, compute 2-e integrals @@ -28,9 +28,7 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) #endif { PRINTDEBUG("BEGIN TO RUN GET ERI") - upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") #ifdef OSHELL @@ -41,56 +39,12 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) PRINTDEBUG("COMPLETE KERNEL") -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - hipMemsetAsync(gpu -> gpu_calculated -> oULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } - -#ifdef OSHELL - gpu -> gpu_calculated -> obULL -> Download(); - hipMemsetAsync(gpu -> gpu_calculated -> obULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->ob->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } -#endif - -#else gpu -> gpu_calculated -> o -> Download(); hipMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); for (int i = 0; i< gpu->nbasis; i++) { for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); + LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } #ifdef OSHELL @@ -102,8 +56,6 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } -#endif - #endif gpu -> gpu_calculated -> o -> DownloadSum(o); @@ -115,25 +67,18 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) PRINTDEBUG("DELETE TEMP VARIABLES") if(gpu -> gpu_sim.method == HF){ - delete gpu->gpu_calculated->o; - delete gpu->gpu_calculated->dense; - -#ifdef USE_LEGACY_ATOMICS - delete gpu->gpu_calculated->oULL; -#ifdef OSHELL - delete gpu->gpu_calculated->obULL; -#endif -#endif + delete gpu->gpu_calculated->o; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->ob; - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->ob; + delete gpu->gpu_calculated->denseb; #endif }else if(*deltaO != 0){ - delete gpu->gpu_calculated->dense; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->denseb; 
#endif } @@ -142,6 +87,7 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) PRINTDEBUG("COMPLETE RUNNING GET2E") } + #ifdef OSHELL extern "C" void gpu_get_oshell_eri_grad_(QUICKDouble* grad) #else @@ -149,9 +95,7 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) #endif { PRINTDEBUG("BEGIN TO RUN GRAD") - upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") if(gpu -> gpu_sim.is_oshell == true){ @@ -163,7 +107,7 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) #ifdef GPU_SPDF if (gpu->maxL >= 3) { upload_sim_to_constant_ffff(gpu); - + if(gpu -> gpu_sim.is_oshell == true){ get_oshell_eri_grad_ffff(gpu); }else{ @@ -175,52 +119,24 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) PRINTDEBUG("COMPLETE KERNEL") if(gpu -> gpu_sim.method == HF){ - -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu -> grad -> Download(); - -#endif + gpu -> grad -> Download(); } if(gpu -> gpu_sim.method == HF){ + gpu -> grad -> DownloadSum(grad); - gpu -> grad -> DownloadSum(grad); - - delete gpu -> grad; -#ifdef USE_LEGACY_ATOMICS - delete gpu -> gradULL; -#endif - delete gpu->gpu_calculated->dense; + delete gpu -> grad; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->denseb; #endif - } - PRINTDEBUG("COMPLETE RUNNING GRAD") } - #ifdef OSHELL extern "C" void gpu_get_oshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICKDouble* belec, QUICKDouble *o, QUICKDouble *ob) #else @@ -229,123 +145,19 @@ extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK { PRINTDEBUG("BEGIN TO RUN GETXC") - gpu -> DFT_calculated = new gpu_buffer_type(1, 1); - -#ifdef USE_LEGACY_ATOMICS - QUICKULL valUII = (QUICKULL) (fabs ( *Eelxc * OSCALE + (QUICKDouble)0.5)); - - if (*Eelxc<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - - gpu -> DFT_calculated -> _hostData[0].Eelxc = valUII; - - valUII = (QUICKULL) (fabs ( *aelec * OSCALE + (QUICKDouble)0.5)); - - if (*aelec<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - gpu -> DFT_calculated -> _hostData[0].aelec = valUII; - - valUII = (QUICKULL) (fabs ( *belec * OSCALE + (QUICKDouble)0.5)); - - if (*belec<(QUICKDouble)0.0) - { - valUII = 0ull - valUII; - } - - gpu -> DFT_calculated -> _hostData[0].belec = valUII; -#else - gpu -> DFT_calculated -> _hostData[0].Eelxc = 0.0; - gpu -> DFT_calculated -> _hostData[0].aelec = 0.0; - gpu -> DFT_calculated -> _hostData[0].belec = 0.0; -#endif - gpu -> DFT_calculated -> Upload(); - gpu -> gpu_sim.DFT_calculated= gpu -> DFT_calculated->_devData; + gpu->DFT_calculated = new gpu_buffer_type(1, 1); + gpu->DFT_calculated->_hostData[0].Eelxc = 0.0; + gpu->DFT_calculated->_hostData[0].aelec = 0.0; + gpu->DFT_calculated->_hostData[0].belec = 0.0; + gpu->DFT_calculated->Upload(); + gpu->gpu_sim.DFT_calculated = gpu->DFT_calculated->_devData; upload_sim_to_constant_dft(gpu); PRINTDEBUG("BEGIN TO RUN KERNEL") getxc(gpu); - gpu -> DFT_calculated -> Download(); - -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = 
LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } - -#ifdef OSHELL - gpu -> gpu_calculated -> obULL -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->ob->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } - -#endif - - QUICKULL valULL = gpu->DFT_calculated -> _hostData[0].Eelxc; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - *Eelxc = (QUICKDouble)valDB*ONEOVEROSCALE; - - valULL = gpu->DFT_calculated -> _hostData[0].aelec; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - *aelec = (QUICKDouble)valDB*ONEOVEROSCALE; - - valULL = gpu->DFT_calculated -> _hostData[0].belec; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - *belec = (QUICKDouble)valDB*ONEOVEROSCALE; -#else + gpu->DFT_calculated->Download(); gpu -> gpu_calculated -> o -> Download(); for (int i = 0; i< gpu->nbasis; i++) { @@ -361,15 +173,12 @@ extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } - #endif - *Eelxc = gpu->DFT_calculated -> _hostData[0].Eelxc; + *Eelxc = gpu->DFT_calculated -> _hostData[0].Eelxc; *aelec = gpu->DFT_calculated -> _hostData[0].aelec; *belec = gpu->DFT_calculated -> _hostData[0].belec; -#endif - gpu -> gpu_calculated -> o -> DownloadSum(o); #ifdef OSHELL gpu -> gpu_calculated -> ob -> DownloadSum(ob); @@ -380,125 +189,70 @@ extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; -#ifdef USE_LEGACY_ATOMICS - delete gpu->gpu_calculated->oULL; -#ifdef OSHELL - delete gpu->gpu_calculated->obULL; -#endif -#endif - #ifdef OSHELL delete gpu->gpu_calculated->ob; delete gpu->gpu_calculated->denseb; #endif - } + #ifdef OSHELL extern "C" void gpu_get_oshell_xcgrad_(QUICKDouble *grad) #else extern "C" void gpu_get_cshell_xcgrad_(QUICKDouble *grad) #endif { - -#if (defined CEW) && !(defined USE_LEGACY_ATOMICS) +#if defined(CEW) gpu -> cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); #endif - // calculate smem size - gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); - - upload_sim_to_constant_dft(gpu); - - memset(gpu->grad->_hostData, 0, gpu -> gpu_xcq -> smem_size); - - getxc_grad(gpu); - -#ifdef USE_LEGACY_ATOMICS - 
gpu -> gradULL -> Download(); + // calculate smem size + gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; + upload_sim_to_constant_dft(gpu); - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } + memset(gpu->grad->_hostData, 0, gpu -> gpu_xcq -> smem_size); - gpu->grad->_hostData[i] += (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu -> grad -> Download(); -#endif + getxc_grad(gpu); + gpu -> grad -> Download(); - gpu -> grad -> DownloadSum(grad); + gpu -> grad -> DownloadSum(grad); -#if (defined CEW) && !(defined USE_LEGACY_ATOMICS) - gpu -> cew_grad->DownloadSum(grad); - delete gpu -> cew_grad; +#if defined(CEW) + gpu -> cew_grad->DownloadSum(grad); + delete gpu -> cew_grad; #endif - delete gpu -> grad; -#ifdef USE_LEGACY_ATOMICS - delete gpu -> gradULL; -#endif - delete gpu->gpu_calculated->dense; + delete gpu -> grad; + delete gpu->gpu_calculated->dense; #ifdef OSHELL - delete gpu->gpu_calculated->denseb; + delete gpu->gpu_calculated->denseb; #endif } - #ifndef OSHELL extern "C" void gpu_get_oei_(QUICKDouble* o) { + // gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -// gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - -//#ifdef LEGACY_ATOMIC_ADD -// gpu -> gpu_calculated -> o -> DeleteGPU(); -// gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -// gpu -> gpu_calculated -> oULL -> Upload(); -// gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; -/*#else - gpu -> gpu_calculated -> o -> Upload(); - gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; + //#ifdef LEGACY_ATOMIC_ADD + // gpu -> gpu_calculated -> o -> DeleteGPU(); + // gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + // gpu -> gpu_calculated -> oULL -> Upload(); + // gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; + /*#else + gpu -> gpu_calculated -> o -> Upload(); + gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; #endif -*/ +*/ upload_sim_to_constant_oei(gpu); - + upload_para_to_const_oei(); getOEI(gpu); -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - - hipMemsetAsync(gpu -> gpu_calculated -> oULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } -#else gpu -> gpu_calculated -> o -> Download(); hipMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); @@ -508,42 +262,30 @@ extern "C" void gpu_get_oei_(QUICKDouble* o) } } -#endif - -/* - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); - } - } -*/ + /* + for (int i = 0; i< gpu->nbasis; 
i++) { + for (int j = i; j< gpu->nbasis; j++) { + printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); + } + } + */ gpu -> gpu_calculated -> o -> DownloadSum(o); -// SAFE_DELETE(gpu -> gpu_calculated -> o); - -//#ifdef LEGACY_ATOMIC_ADD -// SAFE_DELETE(gpu -> gpu_calculated -> oULL); -//#endif + // SAFE_DELETE(gpu -> gpu_calculated -> o); + //#ifdef LEGACY_ATOMIC_ADD + // SAFE_DELETE(gpu -> gpu_calculated -> oULL); + //#endif } + extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) { - // upload point charge grad vector if(gpu -> nextatom > 0) { gpu -> ptchg_grad = new gpu_buffer_type(3 * gpu -> nextatom); - -#ifdef USE_LEGACY_ATOMICS - gpu -> ptchg_gradULL = new gpu_buffer_type(3 * gpu -> nextatom); - gpu -> ptchg_gradULL -> Upload(); - gpu -> gpu_sim.ptchg_gradULL = gpu -> ptchg_gradULL -> _devData; - gpu -> ptchg_grad -> DeleteGPU(); -#else gpu -> ptchg_grad -> Upload(); gpu -> gpu_sim.ptchg_grad = gpu -> ptchg_grad -> _devData; -#endif - } upload_sim_to_constant_oei(gpu); @@ -551,98 +293,48 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_oei_grad(gpu); // download gradients -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - hipMemsetAsync(gpu -> gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->natom); - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu->grad->Download(); hipMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); -#endif - gpu->grad->DownloadSum(grad); -/* for(int i=0; i<3*gpu->natom; ++i){ - printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); - - } -*/ + /* for(int i=0; i<3*gpu->natom; ++i){ + printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); + + } + */ // download point charge gradients if(gpu -> nextatom > 0) { + gpu->ptchg_grad->Download(); + hipMemsetAsync(gpu -> ptchg_grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->nextatom); -#ifdef USE_LEGACY_ATOMICS - gpu -> ptchg_gradULL -> Download(); - - hipMemsetAsync(gpu -> ptchg_gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->nextatom); - - for (int i = 0; i< 3 * gpu->nextatom; i++) { - QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->ptchg_grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - - gpu->ptchg_grad->Download(); - hipMemsetAsync(gpu -> ptchg_grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->nextatom); - -#endif - -/* for(int i=0; i<3*gpu->nextatom; ++i){ - printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); - } -*/ - gpu->ptchg_grad->DownloadSum(ptchg_grad); - + /* for(int i=0; i<3*gpu->nextatom; ++i){ + printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); + } + */ + gpu->ptchg_grad->DownloadSum(ptchg_grad); } - // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0 && !gpu->gpu_sim.use_cew) { -#ifdef USE_LEGACY_ATOMICS - SAFE_DELETE(gpu -> ptchg_gradULL); -#endif - SAFE_DELETE(gpu -> ptchg_grad); - } + // ptchg_grad is no longer needed. reclaim the memory. 
+ if(gpu -> nextatom > 0 && !gpu->gpu_sim.use_cew) { + SAFE_DELETE(gpu -> ptchg_grad); + } } -#ifdef CEW +#if defined(CEW) extern "C" void gpu_get_lri_(QUICKDouble* o) { - -// gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - -//#ifdef LEGACY_ATOMIC_ADD -// gpu -> gpu_calculated -> o -> DeleteGPU(); -// gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); -// gpu -> gpu_calculated -> oULL -> Upload(); -// gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; -/*#else - gpu -> gpu_calculated -> o -> Upload(); - gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; + // gpu -> gpu_calculated -> o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + + //#ifdef LEGACY_ATOMIC_ADD + // gpu -> gpu_calculated -> o -> DeleteGPU(); + // gpu -> gpu_calculated -> oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + // gpu -> gpu_calculated -> oULL -> Upload(); + // gpu -> gpu_sim.oULL = gpu -> gpu_calculated -> oULL -> _devData; + /*#else + gpu -> gpu_calculated -> o -> Upload(); + gpu -> gpu_sim.o = gpu -> gpu_calculated -> o -> _devData; #endif */ @@ -650,34 +342,12 @@ extern "C" void gpu_get_lri_(QUICKDouble* o) upload_para_to_const_lri(); - get_lri(gpu); + get_lri(gpu); //compute xc quad potential upload_sim_to_constant_dft(gpu); getcew_quad(gpu); -#ifdef USE_LEGACY_ATOMICS - gpu -> gpu_calculated -> oULL -> Download(); - - hipMemsetAsync(gpu -> gpu_calculated -> oULL -> _devData, 0, sizeof(QUICKULL)*gpu->nbasis*gpu->nbasis); - - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE; - } - } -#else gpu -> gpu_calculated -> o -> Download(); hipMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); @@ -687,30 +357,25 @@ extern "C" void gpu_get_lri_(QUICKDouble* o) } } -#endif - - -/* int idxf90=0; - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); - } - } -*/ + /* int idxf90=0; + for (int i = 0; i< gpu->nbasis; i++) { + for (int j = i; j< gpu->nbasis; j++) { + printf("OEI host O: %d %d %f %f \n", i, j, LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis), o[idxf90++]); + } + } + */ gpu -> gpu_calculated -> o -> DownloadSum(o); -// SAFE_DELETE(gpu -> gpu_calculated -> o); - -//#ifdef LEGACY_ATOMIC_ADD -// SAFE_DELETE(gpu -> gpu_calculated -> oULL); -//#endif + // SAFE_DELETE(gpu -> gpu_calculated -> o); + //#ifdef LEGACY_ATOMIC_ADD + // SAFE_DELETE(gpu -> gpu_calculated -> oULL); + //#endif } extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) { - upload_sim_to_constant_lri(gpu); upload_para_to_const_lri(); @@ -718,90 +383,37 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_lri_grad(gpu); // download gradients -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - hipMemsetAsync(gpu -> gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->natom); - for 
(int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } -#else - gpu->grad->Download(); hipMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); -#endif - gpu->grad->DownloadSum(grad); -/* for(int i=0; i<3*gpu->natom; ++i){ - printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); - - } -*/ + /* for(int i=0; i<3*gpu->natom; ++i){ + printf("grad: %d %f %f \n", i, grad[i], gpu->grad->_hostData[i]); + + } + */ // download point charge gradients if(gpu -> nextatom > 0) { + gpu->ptchg_grad->Download(); -#ifdef USE_LEGACY_ATOMICS - gpu -> ptchg_gradULL -> Download(); - - for (int i = 0; i< 3 * gpu->nextatom; i++) { - QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - gpu->ptchg_grad->_hostData[i] = (QUICKDouble)valDB*ONEOVERGRADSCALE; - } - -#else - - gpu->ptchg_grad->Download(); - -#endif - -/* for(int i=0; i<3*gpu->nextatom; ++i){ - printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); - } -*/ - gpu->ptchg_grad->DownloadSum(ptchg_grad); - + /* for(int i=0; i<3*gpu->nextatom; ++i){ + printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); + } + */ + gpu->ptchg_grad->DownloadSum(ptchg_grad); } - // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0) { -#ifdef USE_LEGACY_ATOMICS - SAFE_DELETE(gpu -> ptchg_gradULL); -#endif - SAFE_DELETE(gpu -> ptchg_grad); - } - + // ptchg_grad is no longer needed. reclaim the memory. + if(gpu -> nextatom > 0) { + SAFE_DELETE(gpu -> ptchg_grad); + } } + extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad) { - -#ifndef USE_LEGACY_ATOMICS - gpu -> cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); -#else - memset(gpu -> grad -> _hostData, 0, sizeof(QUICKDouble)*3*gpu->natom); -#endif + gpu->cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); // calculate smem size gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); @@ -812,38 +424,13 @@ extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad) getcew_quad_grad(gpu); // download gradients -#ifdef USE_LEGACY_ATOMICS - gpu -> gradULL -> Download(); - hipMemsetAsync(gpu -> gradULL -> _devData, 0, sizeof(QUICKULL)*3*gpu->natom); - for (int i = 0; i< 3 * gpu->natom; i++) { - QUICKULL valULL = gpu->gradULL->_hostData[i]; - QUICKDouble valDB; - - if (valULL >= 0x8000000000000000ull) { - valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); - } - else - { - valDB = (QUICKDouble) valULL; - } - - // make sure to add rather than assign. we already computed one part of the cew - // gradients on host asynchronously. 
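Every one of the legacy download blocks being deleted in this region, including the one above, undoes the same fixed-point encoding. For reference, the encode/decode pair can be restated compactly; this is a sketch only, with hypothetical helper names, where GRADSCALE and ONEOVERGRADSCALE are the reciprocal scaling constants from gpu_common.h:

    /* encode: round |x| to the nearest 1/GRADSCALE, then bit-trick negate when
       x < 0; the rounding here is exactly where the legacy path truncates */
    QUICKULL grad_encode(QUICKDouble x)
    {
        QUICKULL v = (QUICKULL) (fabs(x * GRADSCALE) + (QUICKDouble) 0.5);
        return (x < (QUICKDouble) 0.0) ? 0ull - v : v;
    }

    /* decode: a set top bit marks a negative value; flip the bits back,
       negate, and undo the scaling (mirrors the deleted loops verbatim) */
    QUICKDouble grad_decode(QUICKULL v)
    {
        QUICKDouble d = (v >= 0x8000000000000000ull)
            ? -(QUICKDouble) (v ^ 0xffffffffffffffffull)
            : (QUICKDouble) v;
        return d * ONEOVERGRADSCALE;
    }

Anything smaller than 1/GRADSCALE is lost at encode time, which is why a plain double-precision atomicAdd into devSim.grad replaces all of this.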
- gpu->grad->_hostData[i] += (QUICKDouble)valDB*ONEOVERGRADSCALE;
- }
-#else
 gpu->grad->Download();
 hipMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom);
-#endif
-
 gpu->grad->DownloadSum(grad);

-#ifndef USE_LEGACY_ATOMICS
 gpu -> cew_grad ->DownloadSum(grad);
 SAFE_DELETE(gpu -> cew_grad);
-#endif
-
 }
 #endif
 #endif
diff --git a/src/gpu/hip/gpu_get2e_grad_ffff.cu b/src/gpu/hip/gpu_get2e_grad_ffff.cu
index 999fafd53..5881a5109 100644
--- a/src/gpu/hip/gpu_get2e_grad_ffff.cu
+++ b/src/gpu/hip/gpu_get2e_grad_ffff.cu
@@ -4,8 +4,8 @@
 *
 * Created by Yipu Miao on 6/17/11.
 * Copyright 2011 University of Florida.All rights reserved.
- *
- * Yipu Miao 9/15/11: the first draft is released. And the GPUGP QM compuation can
+ *
+ * Yipu Miao 9/15/11: the first draft is released. And the GPUGP QM computation can
 * achieve as much as 15x faster at double precision level compared with CPU.
 */
@@ -23,14 +23,14 @@

 /*
- Constant Memory in GPU is fast but quite limited and hard to operate, usually not allocatable and
- readonly. So we put the following variables into constant memory:
- devSim: a gpu simluation type variable. which is to store to location of basic information about molecule and basis
- set. Note it only store the location, so it's mostly a set of pointer to GPU memory. and with some non-pointer
- value like the number of basis set. See gpu_type.h for details.
- devTrans : arrays to save the mapping index, will be elimited by hand writing unrolling code.
- Sumindex: a array to store refect how many temp variable needed in VRR. can be elimited by hand writing code.
- */
+ Constant Memory in GPU is fast but quite limited and hard to operate, usually not allocatable and
+ readonly. So we put the following variables into constant memory:
+devSim: a gpu simulation type variable, which stores the location of basic information about the molecule and basis
+set. Note it only stores locations, so it is mostly a set of pointers to GPU memory, along with some non-pointer
+values such as the number of basis functions. See gpu_type.h for details.
+devTrans: arrays saving the mapping indices; these can be eliminated by hand-written unrolled code.
+Sumindex: an array recording how many temporary variables are needed in VRR; can be eliminated by hand-written code.
+*/
 static __constant__ gpu_simulation_type devSim;
 static __constant__ unsigned char devTrans[TRANSDIM*TRANSDIM*TRANSDIM];
 static __constant__ int Sumindex[10]={0,0,1,4,10,20,35,56,84,120};
@@ -58,7 +58,7 @@ texture tex_Xcoeff;
 #ifdef USE_ERI_GRAD_STOREADD
 #define STORE_OPERATOR +=
 #else
-#define STORE_OPERATOR =
+#define STORE_OPERATOR =
 #endif
 */
@@ -164,22 +164,22 @@ struct Partial_ERI{
 };

 bool ComparePrimNum(Partial_ERI p1, Partial_ERI p2){
- return p1.kprim_score > p2.kprim_score;
+ return p1.kprim_score > p2.kprim_score;
 }

 void ResortERIs(_gpu_type gpu){

 int2 eri_type_order[]={{0,0},{0,1},{1,0},{1,1},{0,2},{2,0},{1,2},{2,1},{0,3},{3,0},{2,2},{1,3},{3,1},
- {2,3},{3,2},{3,3}};
+ {2,3},{3,2},{3,3}};

 unsigned char eri_type_order_map[]={0,1,3,6,10,13,15,16};
 int eri_type_block_map[17];
 int2 *resorted_YCutoffIJ=(int2*) malloc(sizeof(int2)*gpu->gpu_cutoff->sqrQshell);
 bool ffset= false;

- // Step 1: sort according sum of angular momentum of a partial ERI. (ie.
i+j of <ij|)

 int idx1=0, idx2=0;

 for(int ieto=0; ieto<16; ieto++){
+ int2 lbl_t=eri_type_order[ieto];
+ eri_type_block_map[idx2]=idx1;
+ for(int i=0; i<gpu->gpu_cutoff->sqrQshell; i++){
 if(gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
- ->_hostData[i].x] == lbl_t.x && gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y] ==
-lbl_t.y){
- resorted_YCutoffIJ[idx1].x = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x;
- resorted_YCutoffIJ[idx1].y = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y;
- idx1++;
+ ->_hostData[i].x] == lbl_t.x && gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y] ==
+ lbl_t.y){
+ resorted_YCutoffIJ[idx1].x = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x;
+ resorted_YCutoffIJ[idx1].y = gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y;
+ idx1++;
 }
 }
@@ -208,10 +208,10 @@ lbl_t.y){
 for(int i=0; i<gpu->gpu_cutoff->sqrQshell; i++){
 gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x=resorted_YCutoffIJ[i].x;
 gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y=resorted_YCutoffIJ[i].y;
-
+
 if(ffset == false && (gpu->gpu_basis->sorted_Qnumber->_hostData[resorted_YCutoffIJ[i].x]+gpu->gpu_basis->sorted_Qnumber->_hostData[resorted_YCutoffIJ[i].y]) == 6){
- ffStart = i;
- ffset = true;
+ ffStart = i;
+ ffset = true;
 }
 }
@@ -221,20 +221,20 @@ lbl_t.y){
 for(int i=0; i<gpu->gpu_cutoff->sqrQshell; i++){
 int kprim1 = gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
- ->_hostData[i].x]];
+ ->_hostData[i].x]];
 int kprim2 = gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
- ->_hostData[i].y]];
+ ->_hostData[i].y]];
 int kprim_score = 10*std::max(kprim1,kprim2)+std::min(kprim1,kprim2)+(kprim1+kprim2);

 partial_eris[i] = {gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x, gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y,
- gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \
- gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y], \
- gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
- ->_hostData[i].x]], \
- gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
- ->_hostData[i].y]], \
- gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \
- gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y],
- kprim_score};
+ gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \
+ gpu->gpu_basis->sorted_Qnumber->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y], \
+ gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
+ ->_hostData[i].x]], \
+ gpu->gpu_basis->kprim->_hostData[gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ
+ ->_hostData[i].y]], \
+ gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].x], \
+ gpu->gpu_basis->sorted_Q->_hostData[gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y],
+ kprim_score};
 }
@@ -247,7 +247,7 @@ lbl_t.y){
 gpu->gpu_cutoff->sorted_YCutoffIJ ->_hostData[i].y = partial_eris[i].YCutoffIJ_y;
 }

- gpu -> gpu_cutoff -> sorted_YCutoffIJ -> Upload();
+ gpu -> gpu_cutoff -> sorted_YCutoffIJ -> Upload();
 gpu -> gpu_sim.sorted_YCutoffIJ = gpu -> gpu_cutoff -> sorted_YCutoffIJ -> _devData;

 gpu ->
@@ -255,268 +255,255 @@ lbl_t.y){
 void getGrad_ffff(_gpu_type gpu)
 {
+    ResortERIs(gpu);
+
+    int *int_buffer = (int*) malloc(ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int));
+    int **int_ptr_buffer = (int**) malloc(ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*));
+    QUICKDouble *dbl_buffer = (QUICKDouble*) malloc(ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble));
+    QUICKDouble **dbl_ptr_buffer = (QUICKDouble**) malloc(ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*));
+    int2 **int2_ptr_buffer = (int2**) malloc(ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*));
+    unsigned char **char_ptr_buffer = (unsigned char**) malloc(ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*));
+    QUICKAtomicType **grad_ptr_buffer = (QUICKAtomicType**) malloc(ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*));
+    unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM];
+    for(int i=0; i<ERI_GRAD_FFFF_TPB; i++) {
+        int_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.natom;
+        int_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.nbasis;
+        int_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.nshell;
+        int_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.jbasis;
+        int_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.sqrQshell;
+        int_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.prim_total;
+        int_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.ffStart;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.katom;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.kprim;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.kstart;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.Ksumtype;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.prim_start;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.Qfbasis;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.Qsbasis;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.Qstart;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.sorted_Q;
+        int_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.sorted_Qnumber;
+        dbl_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.primLimit;
+        dbl_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.gradCutoff;
+        dbl_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.hyb_coeff;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.cons;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.cutMatrix;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.cutPrim;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.dense;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.denseb;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.expoSum;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.gcexpo;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.store;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.store2;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.storeAA;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*10+i] = gpu->gpu_sim.storeBB;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*11+i] = gpu->gpu_sim.storeCC;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*12+i] = gpu->gpu_sim.weightedCenterX;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*13+i] = gpu->gpu_sim.weightedCenterY;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*14+i] = gpu->gpu_sim.weightedCenterZ;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*15+i] = gpu->gpu_sim.Xcoeff;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*16+i] = gpu->gpu_sim.xyz;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*17+i] = gpu->gpu_sim.YCutoff;
+        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*18+i] = gpu->gpu_sim.YVerticalTemp;
+        int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ;
+        char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute;
+        char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN;
+        grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.grad;
+    }
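Each scalar above is replicated once per thread at index ERI_GRAD_FFFF_TPB*slot + i, i.e. slot-major. A sketch of the assumed device-side benefit (the kernel below is illustrative only; the real consumer is getGrad_kernel_ffff): with blockDim.x equal to ERI_GRAD_FFFF_TPB, thread t reads slot k at buf[blockDim.x*k + t], so a warp touches consecutive words per slot and the staging copy into shared memory is fully coalesced.

    // Illustrative sketch only; names are hypothetical.
    __global__ void stage_smem_example(const int* __restrict__ int_buf, int nslots)
    {
        extern __shared__ int smem_int[];  // nslots * blockDim.x entries

        for (int k = 0; k < nslots; ++k)
            smem_int[blockDim.x * k + threadIdx.x] =
                    int_buf[blockDim.x * k + threadIdx.x];
        __syncthreads();

        // e.g. natom for this thread now sits at smem_int[blockDim.x*0 + threadIdx.x]
    }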
-    ResortERIs(gpu);
-
-    int *int_buffer = (int*) malloc(ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int));
-    int **int_ptr_buffer = (int**) malloc(ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*));
-    QUICKDouble *dbl_buffer = (QUICKDouble*) malloc(ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble));
-    QUICKDouble **dbl_ptr_buffer = (QUICKDouble**) malloc(ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*));
-    int2 **int2_ptr_buffer = (int2**) malloc(ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*));
-    unsigned char **char_ptr_buffer = (unsigned char**) malloc(ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*));
-    QUICKAtomicType **grad_ptr_buffer = (QUICKAtomicType**) malloc(ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*));
-    unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM];
-
-
-    for(int i=0; i<ERI_GRAD_FFFF_TPB; i++) {
-        int_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.natom;
-        int_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.nbasis;
-        int_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.nshell;
-        int_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.jbasis;
-        int_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.sqrQshell;
-        int_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.prim_total;
-        int_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.ffStart;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.katom;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.kprim;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.kstart;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.Ksumtype;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.prim_start;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.Qfbasis;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.Qsbasis;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.Qstart;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.sorted_Q;
-        int_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.sorted_Qnumber;
-        dbl_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.primLimit;
-        dbl_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.gradCutoff;
-        dbl_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.hyb_coeff;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.cons;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.cutMatrix;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*2+i] = gpu->gpu_sim.cutPrim;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*3+i] = gpu->gpu_sim.dense;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*4+i] = gpu->gpu_sim.denseb;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*5+i] = gpu->gpu_sim.expoSum;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*6+i] = gpu->gpu_sim.gcexpo;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*7+i] = gpu->gpu_sim.store;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*8+i] = gpu->gpu_sim.store2;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*9+i] = gpu->gpu_sim.storeAA;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*10+i] = gpu->gpu_sim.storeBB;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*11+i] = gpu->gpu_sim.storeCC;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*12+i] = gpu->gpu_sim.weightedCenterX;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*13+i] = gpu->gpu_sim.weightedCenterY;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*14+i] = gpu->gpu_sim.weightedCenterZ;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*15+i] = gpu->gpu_sim.Xcoeff;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*16+i] = gpu->gpu_sim.xyz;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*17+i] = gpu->gpu_sim.YCutoff;
-        dbl_ptr_buffer[ERI_GRAD_FFFF_TPB*18+i] =
gpu->gpu_sim.YVerticalTemp; - int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ; - char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute; - char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN; -#ifdef USE_LEGACY_ATOMICS - grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.gradULL; -#else - grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.grad; -#endif - } - - - LOC3(trans, 0, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 1; - LOC3(trans, 0, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 4; - LOC3(trans, 0, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 10; - LOC3(trans, 0, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 20; - LOC3(trans, 0, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 35; - LOC3(trans, 0, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 56; - LOC3(trans, 0, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 84; - LOC3(trans, 0, 0, 7, TRANSDIM, TRANSDIM, TRANSDIM) = 120; - LOC3(trans, 0, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 3; - LOC3(trans, 0, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 6; - LOC3(trans, 0, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 17; - LOC3(trans, 0, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 32; - LOC3(trans, 0, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 48; - LOC3(trans, 0, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 67; - LOC3(trans, 0, 1, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 100; - LOC3(trans, 0, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 9; - LOC3(trans, 0, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 16; - LOC3(trans, 0, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 23; - LOC3(trans, 0, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 42; - LOC3(trans, 0, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 73; - LOC3(trans, 0, 2, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 106; - LOC3(trans, 0, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 19; - LOC3(trans, 0, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 31; - LOC3(trans, 0, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 43; - LOC3(trans, 0, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 79; - LOC3(trans, 0, 3, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 112; - LOC3(trans, 0, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 34; - LOC3(trans, 0, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 49; - LOC3(trans, 0, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 74; - LOC3(trans, 0, 4, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 113; - LOC3(trans, 0, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 55; - LOC3(trans, 0, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 68; - LOC3(trans, 0, 5, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 107; - LOC3(trans, 0, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 83; - LOC3(trans, 0, 6, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 101; - LOC3(trans, 0, 7, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 119; - LOC3(trans, 1, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 2; - LOC3(trans, 1, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 7; - LOC3(trans, 1, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 15; - LOC3(trans, 1, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 28; - LOC3(trans, 1, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 50; - LOC3(trans, 1, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 69; - LOC3(trans, 1, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 102; - LOC3(trans, 1, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 5; - LOC3(trans, 1, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 11; - LOC3(trans, 1, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 26; - LOC3(trans, 1, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 41; - LOC3(trans, 1, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 59; - LOC3(trans, 1, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 87; - LOC3(trans, 1, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 13; - LOC3(trans, 1, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 25; - LOC3(trans, 1, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 36; - LOC3(trans, 1, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 60; - LOC3(trans, 
1, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 88; - LOC3(trans, 1, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 30; - LOC3(trans, 1, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 40; - LOC3(trans, 1, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 61; - LOC3(trans, 1, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 94; - LOC3(trans, 1, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 52; - LOC3(trans, 1, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 58; - LOC3(trans, 1, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 89; - LOC3(trans, 1, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 71; - LOC3(trans, 1, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 86; - LOC3(trans, 1, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 104; - LOC3(trans, 2, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 8; - LOC3(trans, 2, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 14; - LOC3(trans, 2, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 22; - LOC3(trans, 2, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 44; - LOC3(trans, 2, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 75; - LOC3(trans, 2, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 108; - LOC3(trans, 2, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 12; - LOC3(trans, 2, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 24; - LOC3(trans, 2, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 37; - LOC3(trans, 2, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 62; - LOC3(trans, 2, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 90; - LOC3(trans, 2, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 21; - LOC3(trans, 2, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 38; - LOC3(trans, 2, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 66; - LOC3(trans, 2, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 99; - LOC3(trans, 2, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 46; - LOC3(trans, 2, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 64; - LOC3(trans, 2, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 98; - LOC3(trans, 2, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 77; - LOC3(trans, 2, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 92; - LOC3(trans, 2, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 110; - LOC3(trans, 3, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 18; - LOC3(trans, 3, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 27; - LOC3(trans, 3, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 45; - LOC3(trans, 3, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 80; - LOC3(trans, 3, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 114; - LOC3(trans, 3, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 29; - LOC3(trans, 3, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 39; - LOC3(trans, 3, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 63; - LOC3(trans, 3, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 95; - LOC3(trans, 3, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 47; - LOC3(trans, 3, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 65; - LOC3(trans, 3, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 97; - LOC3(trans, 3, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 81; - LOC3(trans, 3, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 96; - LOC3(trans, 3, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 116; - LOC3(trans, 4, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 33; - LOC3(trans, 4, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 51; - LOC3(trans, 4, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 76; - LOC3(trans, 4, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 115; - LOC3(trans, 4, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 53; - LOC3(trans, 4, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 57; - LOC3(trans, 4, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 91; - LOC3(trans, 4, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 78; - LOC3(trans, 4, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 93; - LOC3(trans, 4, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 117; - LOC3(trans, 5, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 54; - LOC3(trans, 5, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 70; - LOC3(trans, 5, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 109; - LOC3(trans, 5, 1, 
0, TRANSDIM, TRANSDIM, TRANSDIM) = 72; - LOC3(trans, 5, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 85; - LOC3(trans, 5, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 111; - LOC3(trans, 6, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 82; - LOC3(trans, 6, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 103; - LOC3(trans, 6, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 105; - LOC3(trans, 7, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 118; - + LOC3(trans, 0, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 1; + LOC3(trans, 0, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 4; + LOC3(trans, 0, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 10; + LOC3(trans, 0, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 20; + LOC3(trans, 0, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 35; + LOC3(trans, 0, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 56; + LOC3(trans, 0, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 84; + LOC3(trans, 0, 0, 7, TRANSDIM, TRANSDIM, TRANSDIM) = 120; + LOC3(trans, 0, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 3; + LOC3(trans, 0, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 6; + LOC3(trans, 0, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 17; + LOC3(trans, 0, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 32; + LOC3(trans, 0, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 48; + LOC3(trans, 0, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 67; + LOC3(trans, 0, 1, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 100; + LOC3(trans, 0, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 9; + LOC3(trans, 0, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 16; + LOC3(trans, 0, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 23; + LOC3(trans, 0, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 42; + LOC3(trans, 0, 2, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 73; + LOC3(trans, 0, 2, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 106; + LOC3(trans, 0, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 19; + LOC3(trans, 0, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 31; + LOC3(trans, 0, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 43; + LOC3(trans, 0, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 79; + LOC3(trans, 0, 3, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 112; + LOC3(trans, 0, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 34; + LOC3(trans, 0, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 49; + LOC3(trans, 0, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 74; + LOC3(trans, 0, 4, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 113; + LOC3(trans, 0, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 55; + LOC3(trans, 0, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 68; + LOC3(trans, 0, 5, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 107; + LOC3(trans, 0, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 83; + LOC3(trans, 0, 6, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 101; + LOC3(trans, 0, 7, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 119; + LOC3(trans, 1, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 2; + LOC3(trans, 1, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 7; + LOC3(trans, 1, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 15; + LOC3(trans, 1, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 28; + LOC3(trans, 1, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 50; + LOC3(trans, 1, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 69; + LOC3(trans, 1, 0, 6, TRANSDIM, TRANSDIM, TRANSDIM) = 102; + LOC3(trans, 1, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 5; + LOC3(trans, 1, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 11; + LOC3(trans, 1, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 26; + LOC3(trans, 1, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 41; + LOC3(trans, 1, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 59; + LOC3(trans, 1, 1, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 87; + LOC3(trans, 1, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 13; + LOC3(trans, 1, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 25; + LOC3(trans, 1, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 36; + LOC3(trans, 1, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 60; + LOC3(trans, 1, 2, 4, 
TRANSDIM, TRANSDIM, TRANSDIM) = 88; + LOC3(trans, 1, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 30; + LOC3(trans, 1, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 40; + LOC3(trans, 1, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 61; + LOC3(trans, 1, 3, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 94; + LOC3(trans, 1, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 52; + LOC3(trans, 1, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 58; + LOC3(trans, 1, 4, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 89; + LOC3(trans, 1, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 71; + LOC3(trans, 1, 5, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 86; + LOC3(trans, 1, 6, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 104; + LOC3(trans, 2, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 8; + LOC3(trans, 2, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 14; + LOC3(trans, 2, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 22; + LOC3(trans, 2, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 44; + LOC3(trans, 2, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 75; + LOC3(trans, 2, 0, 5, TRANSDIM, TRANSDIM, TRANSDIM) = 108; + LOC3(trans, 2, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 12; + LOC3(trans, 2, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 24; + LOC3(trans, 2, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 37; + LOC3(trans, 2, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 62; + LOC3(trans, 2, 1, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 90; + LOC3(trans, 2, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 21; + LOC3(trans, 2, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 38; + LOC3(trans, 2, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 66; + LOC3(trans, 2, 2, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 99; + LOC3(trans, 2, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 46; + LOC3(trans, 2, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 64; + LOC3(trans, 2, 3, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 98; + LOC3(trans, 2, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 77; + LOC3(trans, 2, 4, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 92; + LOC3(trans, 2, 5, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 110; + LOC3(trans, 3, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 18; + LOC3(trans, 3, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 27; + LOC3(trans, 3, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 45; + LOC3(trans, 3, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 80; + LOC3(trans, 3, 0, 4, TRANSDIM, TRANSDIM, TRANSDIM) = 114; + LOC3(trans, 3, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 29; + LOC3(trans, 3, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 39; + LOC3(trans, 3, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 63; + LOC3(trans, 3, 1, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 95; + LOC3(trans, 3, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 47; + LOC3(trans, 3, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 65; + LOC3(trans, 3, 2, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 97; + LOC3(trans, 3, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 81; + LOC3(trans, 3, 3, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 96; + LOC3(trans, 3, 4, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 116; + LOC3(trans, 4, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 33; + LOC3(trans, 4, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 51; + LOC3(trans, 4, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 76; + LOC3(trans, 4, 0, 3, TRANSDIM, TRANSDIM, TRANSDIM) = 115; + LOC3(trans, 4, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 53; + LOC3(trans, 4, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 57; + LOC3(trans, 4, 1, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 91; + LOC3(trans, 4, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 78; + LOC3(trans, 4, 2, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 93; + LOC3(trans, 4, 3, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 117; + LOC3(trans, 5, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 54; + LOC3(trans, 5, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 70; + LOC3(trans, 5, 0, 2, TRANSDIM, TRANSDIM, TRANSDIM) = 109; + LOC3(trans, 5, 1, 0, 
TRANSDIM, TRANSDIM, TRANSDIM) = 72; + LOC3(trans, 5, 1, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 85; + LOC3(trans, 5, 2, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 111; + LOC3(trans, 6, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 82; + LOC3(trans, 6, 0, 1, TRANSDIM, TRANSDIM, TRANSDIM) = 103; + LOC3(trans, 6, 1, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 105; + LOC3(trans, 7, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 118; + + int *dev_int_buffer; + int **dev_int_ptr_buffer; + QUICKDouble *dev_dbl_buffer; + QUICKDouble **dev_dbl_ptr_buffer; + int2 **dev_int2_ptr_buffer; + unsigned char **dev_char_ptr_buffer; + unsigned char *dev_char_buffer; + QUICKAtomicType **dev_grad_ptr_buffer; + + hipMalloc((void **)&dev_int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); + hipMalloc((void **)&dev_int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*)); + hipMalloc((void **)&dev_dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble)); + hipMalloc((void **)&dev_dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); + hipMalloc((void **)&dev_int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); + hipMalloc((void **)&dev_char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); + hipMalloc((void **)&dev_char_buffer, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char)); + hipMalloc((void **)&dev_grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*)); + + + hipMemcpy(dev_int_buffer, int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dev_int_ptr_buffer, int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*), hipMemcpyHostToDevice); + hipMemcpy(dev_dbl_buffer, dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble), hipMemcpyHostToDevice); + hipMemcpy(dev_dbl_ptr_buffer, dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*), hipMemcpyHostToDevice); + hipMemcpy(dev_int2_ptr_buffer, int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*), hipMemcpyHostToDevice); + hipMemcpy(dev_char_ptr_buffer, char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned + char*), hipMemcpyHostToDevice); + hipMemcpy(dev_char_buffer, &trans, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char), hipMemcpyHostToDevice); + hipMemcpy(dev_grad_ptr_buffer, grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*), + hipMemcpyHostToDevice); - int *dev_int_buffer; - int **dev_int_ptr_buffer; - QUICKDouble *dev_dbl_buffer; - QUICKDouble **dev_dbl_ptr_buffer; - int2 **dev_int2_ptr_buffer; - unsigned char **dev_char_ptr_buffer; - unsigned char *dev_char_buffer; - QUICKAtomicType **dev_grad_ptr_buffer; - - hipMalloc((void **)&dev_int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); - hipMalloc((void **)&dev_int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*)); - hipMalloc((void **)&dev_dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble)); - hipMalloc((void **)&dev_dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); - hipMalloc((void **)&dev_int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); - hipMalloc((void **)&dev_char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); - hipMalloc((void 
**)&dev_char_buffer, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char));
-    hipMalloc((void **)&dev_grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*));
-
-
-    hipMemcpy(dev_int_buffer, int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int), hipMemcpyHostToDevice);
-    hipMemcpy(dev_int_ptr_buffer, int_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int*), hipMemcpyHostToDevice);
-    hipMemcpy(dev_dbl_buffer, dbl_buffer, ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble), hipMemcpyHostToDevice);
-    hipMemcpy(dev_dbl_ptr_buffer, dbl_ptr_buffer, ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*), hipMemcpyHostToDevice);
-    hipMemcpy(dev_int2_ptr_buffer, int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*), hipMemcpyHostToDevice);
-    hipMemcpy(dev_char_ptr_buffer, char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned
-char*), hipMemcpyHostToDevice);
-    hipMemcpy(dev_char_buffer, &trans, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char), hipMemcpyHostToDevice);
-    hipMemcpy(dev_grad_ptr_buffer, grad_ptr_buffer, ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKAtomicType*),
-hipMemcpyHostToDevice);
-
-    if (gpu->maxL >= 3) {
+    if (gpu->maxL >= 3) {
         // Part f-3
 #ifdef GPU_SPDF
-        QUICK_SAFE_CALL((getGrad_kernel_ffff<<<gpu->blocks*ERI_GRAD_FFFF_BPSM, ERI_GRAD_FFFF_TPB,
-sizeof(int)*ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB+
-            sizeof(QUICKDouble)*ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB+sizeof(QUICKDouble*)*ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(int*)*ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB+
-            sizeof(int2*)*ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned
-char*)*ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char)*ERI_GRAD_FFFF_SMEM_CHAR_SIZE+
-            sizeof(QUICKAtomicType*)*ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB>>>(dev_int_buffer,
-dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer, dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer,
-dev_grad_ptr_buffer,gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell)))
-
-#endif
-    }
-
-    hipDeviceSynchronize();
-
+        QUICK_SAFE_CALL((getGrad_kernel_ffff<<<gpu->blocks*ERI_GRAD_FFFF_BPSM, ERI_GRAD_FFFF_TPB,
+            sizeof(int)*ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB+
+            sizeof(QUICKDouble)*ERI_GRAD_FFFF_SMEM_DBL_SIZE*ERI_GRAD_FFFF_TPB+sizeof(QUICKDouble*)*ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(int*)*ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE*ERI_GRAD_FFFF_TPB+
+            sizeof(int2*)*ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned
+            char*)*ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char)*ERI_GRAD_FFFF_SMEM_CHAR_SIZE+
+            sizeof(QUICKAtomicType*)*ERI_GRAD_FFFF_SMEM_PTR_SIZE*ERI_GRAD_FFFF_TPB>>>(dev_int_buffer,
+            dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer, dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer,
+            dev_grad_ptr_buffer,gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell)))
-    free(int_buffer);
-    free(int_ptr_buffer);
-    free(dbl_buffer);
-    free(dbl_ptr_buffer);
-    free(int2_ptr_buffer);
-    free(char_ptr_buffer);
-    free(grad_ptr_buffer);
+#endif
+    }
-    hipFree(dev_int_buffer);
-    hipFree(dev_int_ptr_buffer);
-    hipFree(dev_dbl_buffer);
-    hipFree(dev_dbl_ptr_buffer);
-    hipFree(dev_int2_ptr_buffer);
-    hipFree(dev_char_ptr_buffer);
-    hipFree(dev_char_buffer);
-    hipFree(dev_grad_ptr_buffer);
+    hipDeviceSynchronize();
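The third launch argument above is the dynamic shared-memory byte count: the per-thread staging buffers summed, plus the per-block trans table. As a readability sketch only (an assumed refactoring, not how the patch writes it), the same launch with the size computed up front:

    // Sketch: the macros come from gpu_get2e_grad_ffff.cuh; smemBytes is hypothetical.
    size_t smemBytes =
          sizeof(int)              * ERI_GRAD_FFFF_SMEM_INT_SIZE      * ERI_GRAD_FFFF_TPB
        + sizeof(QUICKDouble)      * ERI_GRAD_FFFF_SMEM_DBL_SIZE      * ERI_GRAD_FFFF_TPB
        + sizeof(QUICKDouble*)     * ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE  * ERI_GRAD_FFFF_TPB
        + sizeof(int*)             * ERI_GRAD_FFFF_SMEM_INT_PTR_SIZE  * ERI_GRAD_FFFF_TPB
        + sizeof(int2*)            * ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE * ERI_GRAD_FFFF_TPB
        + sizeof(unsigned char*)   * ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE * ERI_GRAD_FFFF_TPB
        + sizeof(unsigned char)    * ERI_GRAD_FFFF_SMEM_CHAR_SIZE
        + sizeof(QUICKAtomicType*) * ERI_GRAD_FFFF_SMEM_PTR_SIZE      * ERI_GRAD_FFFF_TPB;

    getGrad_kernel_ffff<<<gpu->blocks*ERI_GRAD_FFFF_BPSM, ERI_GRAD_FFFF_TPB, smemBytes>>>(
            dev_int_buffer, dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer,
            dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer,
            dev_grad_ptr_buffer, gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell);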
+    free(int_buffer);
+    free(int_ptr_buffer);
+    free(dbl_buffer);
+    free(dbl_ptr_buffer);
+    free(int2_ptr_buffer);
+    free(char_ptr_buffer);
+    free(grad_ptr_buffer);
+
+    hipFree(dev_int_buffer);
+    hipFree(dev_int_ptr_buffer);
+    hipFree(dev_dbl_buffer);
+    hipFree(dev_dbl_ptr_buffer);
+    hipFree(dev_int2_ptr_buffer);
+    hipFree(dev_char_ptr_buffer);
+    hipFree(dev_char_buffer);
+    hipFree(dev_grad_ptr_buffer);
 }

 // interface to call uscf gradient Kernels
 void get_oshell_eri_grad_ffff(_gpu_type gpu)
 {
-    // roctxRangePush("Gradient 2e");
-
     // compute one electron gradients in the meantime
     //get_oneen_grad_();
@@ -528,11 +515,10 @@ void get_oshell_eri_grad_ffff(_gpu_type gpu)
     hipDeviceSynchronize();

     // roctxRangePop();
-
 }

-void upload_para_to_const_ffff(){
-
+
+void upload_para_to_const_ffff() {
     unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM];

     // Data to trans
     {
@@ -662,13 +648,13 @@ void upload_para_to_const_ffff(){
     status = hipMemcpyToSymbol(HIP_SYMBOL(devTrans), trans, sizeof(unsigned char)*TRANSDIM*TRANSDIM*TRANSDIM);
     PRINTERROR(status, " hipMemcpyToSymbol, Trans copy to constants failed")
-
 }

-void upload_sim_to_constant_ffff(_gpu_type gpu){
+
+void upload_sim_to_constant_ffff(_gpu_type gpu) {
     hipError_t status;

-    status = hipMemcpyToSymbol(HIP_SYMBOL(devSim), &gpu->gpu_sim, sizeof(gpu_simulation_type));
-    PRINTERROR(status, " hipMemcpyToSymbol, sim copy to constants failed")
+    status = hipMemcpyToSymbol(HIP_SYMBOL(devSim), &gpu->gpu_sim, sizeof(gpu_simulation_type));
+    PRINTERROR(status, " hipMemcpyToSymbol, sim copy to constants failed")

     upload_para_to_const_ffff();
 }
diff --git a/src/gpu/hip/gpu_get2e_grad_ffff.cuh b/src/gpu/hip/gpu_get2e_grad_ffff.cuh
index a93da0957..ad05239c0 100644
--- a/src/gpu/hip/gpu_get2e_grad_ffff.cuh
+++ b/src/gpu/hip/gpu_get2e_grad_ffff.cuh
@@ -1616,28 +1616,6 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm
     //printf("FILE: %s, LINE: %d, FUNCTION: %s, DEV_SIM_DBL_HYB_COEFF \n", __FILE__, __LINE__, __func__);
 #endif

-#ifdef USE_LEGACY_ATOMICS
-
-    GRADADD(DEV_SIM_PTR_GRAD[AStart], AGradx);
-    GRADADD(DEV_SIM_PTR_GRAD[AStart + 1], AGrady);
-    GRADADD(DEV_SIM_PTR_GRAD[AStart + 2], AGradz);
-
-
-    GRADADD(DEV_SIM_PTR_GRAD[BStart], BGradx);
-    GRADADD(DEV_SIM_PTR_GRAD[BStart + 1], BGrady);
-    GRADADD(DEV_SIM_PTR_GRAD[BStart + 2], BGradz);
-
-
-    GRADADD(DEV_SIM_PTR_GRAD[CStart], CGradx);
-    GRADADD(DEV_SIM_PTR_GRAD[CStart + 1], CGrady);
-    GRADADD(DEV_SIM_PTR_GRAD[CStart + 2], CGradz);
-
-
-    GRADADD(DEV_SIM_PTR_GRAD[DStart], (-AGradx-BGradx-CGradx));
-    GRADADD(DEV_SIM_PTR_GRAD[DStart + 1], (-AGrady-BGrady-CGrady));
-    GRADADD(DEV_SIM_PTR_GRAD[DStart + 2], (-AGradz-BGradz-CGradz));
-
-#else
     atomicAdd(&DEV_SIM_PTR_GRAD[AStart], AGradx);
     atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 1], AGrady);
     atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 2], AGradz);
@@ -1656,7 +1634,6 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm
     atomicAdd(&DEV_SIM_PTR_GRAD[DStart], (-AGradx-BGradx-CGradx));
     atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 1], (-AGrady-BGrady-CGrady));
     atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 2], (-AGradz-BGradz-CGradz));
-#endif

     return;
 }
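With the fixed-point GRADADD path removed, gradient accumulation is a plain atomicAdd on doubles. Native double-precision atomicAdd only exists on NVIDIA hardware from SM 6.0 (Pascal) onward; on older targets, full precision requires the well-known compare-and-swap loop. A sketch of that emulation (the wrapper name is hypothetical; the emulation this patch actually relies on lives in the common GPU headers, not in this hunk):

    // Full-double-precision atomicAdd emulation for SM < 6.0 (the CUDA C++
    // Programming Guide pattern). Unlike the removed OSCALE/GRADADD fixed-point
    // scheme, the summand is never truncated.
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
    static __device__ double atomicAddDouble(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;

        do {
            assumed = old;
            // retry until no other thread has updated *address in between
            old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);

        return __longlong_as_double(old);
    }
    #else
    #define atomicAddDouble(address, val) atomicAdd((address), (val))
    #endif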
diff --git a/src/gpu/hip/gpu_getxc.cu b/src/gpu/hip/gpu_getxc.cu
index e77af774b..997a38785 100644
--- a/src/gpu/hip/gpu_getxc.cu
+++ b/src/gpu/hip/gpu_getxc.cu
@@ -368,21 +368,12 @@ __global__ void get_sswgrad_kernel(){

     //declare smem grad vector
-#ifdef USE_LEGACY_ATOMICS
-    extern __shared__ QUICKULL smem_buffer[];
-    QUICKULL* smemGrad=(QUICKULL*)smem_buffer;
-
-    // initialize smem grad
-    for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x)
-        smemGrad[i]=0ull;
-#else
     extern __shared__ QUICKDouble smem_buffer[];
     QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer;

     // initialize smem grad
     for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x)
         smemGrad[i]=0.0;
-#endif

     __syncthreads();
@@ -405,11 +396,7 @@ __global__ void get_sswgrad_kernel(){

     // update gmem grad vector
     for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x)
-#ifdef USE_LEGACY_ATOMICS
-        atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]);
-#else
-        atomicAdd(&devSim_dft.grad[i],smemGrad[i]);
-#endif
+        atomicAdd(&devSim_dft.grad[i],smemGrad[i]);

     __syncthreads();
 }
@@ -428,15 +415,6 @@ __global__ void get_sswnumgrad_kernel(){

     unsigned int natom = devSim_dft.natom;

-#ifdef USE_LEGACY_ATOMICS
-    //declare smem grad vector
-    extern __shared__ QUICKULL smem_buffer[];
-    QUICKULL* smemGrad=(QUICKULL*)smem_buffer;
-
-    // initialize smem grad
-    for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x)
-        smemGrad[i]=0ull;
-#else
     //declare smem grad vector
     extern __shared__ QUICKDouble smem_buffer[];
     QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer;
@@ -444,7 +422,6 @@ __global__ void get_sswnumgrad_kernel(){
     // initialize smem grad
     for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x)
         smemGrad[i]=0.0;
-#endif

     __syncthreads();
@@ -497,8 +474,6 @@ __global__ void get_sswnumgrad_kernel(){

             QUICKDouble dpx = (sswt1-sswt2) * gradfac;

-            // GRADADD(smemGrad[iatom*3], (sswt1-sswt2) * gradfac);
-
             xatm += SSW_NUMGRAD_DELTA;
             if(iatom == gatm-1)
                 xparent = xatm;
@@ -521,8 +496,6 @@ __global__ void get_sswnumgrad_kernel(){

             QUICKDouble dpy = (sswt1-sswt2) * gradfac;

-            //GRADADD(smemGrad[iatom*3+1], (sswt1-sswt2) * gradfac);
-
             yatm += SSW_NUMGRAD_DELTA;
             if(iatom == gatm-1)
                 yparent = yatm;
@@ -545,21 +518,13 @@ __global__ void get_sswnumgrad_kernel(){

             QUICKDouble dpz = (sswt1-sswt2) * gradfac;

-            //GRADADD(smemGrad[iatom*3+2], (sswt1-sswt2) * gradfac);
-
             zatm += SSW_NUMGRAD_DELTA;
             if(iatom == gatm-1)
                 zparent = zatm;

-#ifdef USE_LEGACY_ATOMICS
-            GRADADD(smemGrad[iatom*3], dpx);
-            GRADADD(smemGrad[iatom*3+1], dpy);
-            GRADADD(smemGrad[iatom*3+2], dpz);
-#else
             atomicAdd(&smemGrad[iatom*3], dpx);
             atomicAdd(&smemGrad[iatom*3+1], dpy);
             atomicAdd(&smemGrad[iatom*3+2], dpz);
-#endif

             /*
             printf("sswgrad %f %f %f %d %d %f %f %f \n", gridx, gridy, gridz, iatom,
             1, dpx, devSim_dft.exc_ssd[idx], devSim_dft.quadwt[idx]);
@@ -576,11 +541,7 @@ __global__ void get_sswnumgrad_kernel(){

     // update gmem grad vector
     for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x)
-#ifdef USE_LEGACY_ATOMICS
-        atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]);
-#else
-        atomicAdd(&devSim_dft.grad[i],smemGrad[i]);
-#endif
+        atomicAdd(&devSim_dft.grad[i],smemGrad[i]);

     __syncthreads();
@@ -976,11 +937,10 @@ __device__ QUICKDouble get_uw_ssd(const QUICKDouble gridx, const QUICKDouble gri
 }

-#ifdef USE_LEGACY_ATOMICS
-__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKULL* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom)
-#else
-__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom)
-#endif
+__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy,
+        const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble
+        quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const
int natom) { QUICKDouble sumUW= 0.0; QUICKDouble parent_uw = 0.0; @@ -1004,11 +964,7 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons for(int i=0; i devSim_dft.DMCutoff ){ -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[i*3+j], LOCUWSSD(uw_ssd,j,i,3,natom)*Exc*quadwt*uw*(-p/sumUW)); -#else atomicAdd(&smemGrad[i*3+j],LOCUWSSD(uw_ssd,j,i,3,natom)*Exc*quadwt*uw*(-p/sumUW)); -#endif } } } @@ -1020,26 +976,19 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons get_uw_ssd(gridx, gridy, gridz, uw_ssd, iparent, iparent-1, natom); - for(int i=0; i devSim_dft.DMCutoff ){ -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[i*3+j], LOCUWSSD(uw_ssd,j,i,3,natom)*(1.0/sumUW)*Exc*quadwt*parent_uw); -#else atomicAdd(&smemGrad[i*3+j],LOCUWSSD(uw_ssd,j,i,3,natom)*(1.0/sumUW)*Exc*quadwt*parent_uw); -#endif } } } } -#ifdef USE_LEGACY_ATOMICS -__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKULL* smemGrad, int iparent, int gid) -#else -__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid) -#endif +__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, + QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int + iparent, int gid) { /* This subroutine calculates the derivatives of weight found in @@ -1187,17 +1136,10 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, //printf("gridx: %f gridy: %f gridz: %f Exc: %e quadwt: %e\n",wtgradjx, wtgradjy, wtgradjz, Exc, quadwt); #endif - // We should now have the derivatives of the SS weights. Now just add it to the temporary gradient vector in shared memory. - -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[jstart], wtgradjx * Exc * quadwt); - GRADADD(smemGrad[jstart + 1], wtgradjy * Exc * quadwt); - GRADADD(smemGrad[jstart + 2], wtgradjz * Exc * quadwt); -#else + // We should now have the derivatives of the SS weights. Now just add it to the temporary gradient vector in shared memory. atomicAdd(&smemGrad[jstart], wtgradjx * Exc * quadwt); atomicAdd(&smemGrad[jstart + 1], wtgradjy * Exc * quadwt); atomicAdd(&smemGrad[jstart + 2], wtgradjz * Exc * quadwt); -#endif } } @@ -1207,15 +1149,9 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, #endif // update the temporary gradient vector -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[istart], wtgradix * Exc * quadwt); - GRADADD(smemGrad[istart + 1], wtgradiy * Exc * quadwt); - GRADADD(smemGrad[istart + 2], wtgradiz * Exc * quadwt); -#else atomicAdd(&smemGrad[istart], wtgradix * Exc * quadwt); atomicAdd(&smemGrad[istart + 1], wtgradiy * Exc * quadwt); atomicAdd(&smemGrad[istart + 2], wtgradiz * Exc * quadwt); -#endif } diff --git a/src/gpu/hip/gpu_getxc.h b/src/gpu/hip/gpu_getxc.h index a98c19f7b..7efc7615b 100644 --- a/src/gpu/hip/gpu_getxc.h +++ b/src/gpu/hip/gpu_getxc.h @@ -1,21 +1,21 @@ #include "hip/hip_runtime.h" /* - !---------------------------------------------------------------------! - ! Written by Madu Manathunga on 12/03/2020 ! - ! ! - ! Copyright (C) 2020-2021 Merz lab ! - ! Copyright (C) 2020-2021 Götz lab ! - ! ! - ! This Source Code Form is subject to the terms of the Mozilla Public ! - ! License, v. 2.0. If a copy of the MPL was not distributed with this ! - ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! 
- !_____________________________________________________________________! - - !---------------------------------------------------------------------! - ! This source file contains preprocessable functions required for ! - ! QUICK GPU version. ! - !---------------------------------------------------------------------! -*/ + !---------------------------------------------------------------------! + ! Written by Madu Manathunga on 12/03/2020 ! + ! ! + ! Copyright (C) 2020-2021 Merz lab ! + ! Copyright (C) 2020-2021 Götz lab ! + ! ! + ! This Source Code Form is subject to the terms of the Mozilla Public ! + ! License, v. 2.0. If a copy of the MPL was not distributed with this ! + ! file, You can obtain one at http://mozilla.org/MPL/2.0/. ! + !_____________________________________________________________________! + + !---------------------------------------------------------------------! + ! This source file contains preprocessable functions required for ! + ! QUICK GPU version. ! + !---------------------------------------------------------------------! + */ #ifdef OSHELL #define NSPIN 2 @@ -23,6 +23,7 @@ #define NSPIN 1 #endif + //----------------------------------------------- // Calculate the density and gradients of density at // each grid point. @@ -33,642 +34,574 @@ __global__ void get_oshell_density_kernel() __global__ void get_cshell_density_kernel() #endif { - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble density = 0.0; - QUICKDouble gax = 0.0; - QUICKDouble gay = 0.0; - QUICKDouble gaz = 0.0; + QUICKDouble density = 0.0; + QUICKDouble gax = 0.0; + QUICKDouble gay = 0.0; + QUICKDouble gaz = 0.0; #ifdef OSHELL - QUICKDouble densityb = 0.0; - QUICKDouble gbx = 0.0; - QUICKDouble gby = 0.0; - QUICKDouble gbz = 0.0; + QUICKDouble densityb = 0.0; + QUICKDouble gbx = 0.0; + QUICKDouble gby = 0.0; + QUICKDouble gbz = 0.0; #endif - for(int i=bfloc_st; i < bfloc_end; i++){ - - int ibas = (int) devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; + for(int i=bfloc_st; i < bfloc_end; i++) { + int ibas = (int) devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz) >= devSim_dft.XCCutoff ) { + if (abs(phi+dphidx+dphidy+dphidz) >= devSim_dft.XCCutoff ) { - QUICKDouble denseii = LOC2(devSim_dft.dense, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; + QUICKDouble denseii = LOC2(devSim_dft.dense, ibas, ibas, devSim_dft.nbasis, 
devSim_dft.nbasis) * phi; #ifdef OSHELL - QUICKDouble densebii = LOC2(devSim_dft.denseb, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; + QUICKDouble densebii = LOC2(devSim_dft.denseb, ibas, ibas, devSim_dft.nbasis, devSim_dft.nbasis) * phi; #endif #ifdef OSHELL - density = density + denseii * phi; - densityb = densityb + densebii * phi; + density = density + denseii * phi; + densityb = densityb + densebii * phi; #else - density = density + denseii * phi / 2.0; + density = density + denseii * phi / 2.0; #endif - gax = gax + denseii * dphidx; - gay = gay + denseii * dphidy; - gaz = gaz + denseii * dphidz; + gax = gax + denseii * dphidx; + gay = gay + denseii * dphidy; + gaz = gaz + denseii * dphidz; #ifdef OSHELL - gbx = gbx + densebii * dphidx; - gby = gby + densebii * dphidy; - gbz = gbz + densebii * dphidz; + gbx = gbx + densebii * dphidx; + gby = gby + densebii * dphidy; + gbz = gbz + densebii * dphidz; #endif - for(int j=i+1; j< bfloc_end; j++){ + for(int j=i+1; j< bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - - QUICKDouble denseij = LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble denseij = LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #ifdef OSHELL - QUICKDouble densebij = LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble densebij = LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); #endif #ifdef OSHELL - density = density + 2.0 * denseij * phi * phi2; - densityb = densityb + 2.0 * densebij * phi * phi2; + density = density + 2.0 * denseij * phi * phi2; + densityb = densityb + 2.0 * densebij * phi * phi2; #else - density = density + denseij * phi * phi2; + density = density + denseij * phi * phi2; #endif - gax = gax + denseij * ( phi * dphidx2 + phi2 * dphidx ); - gay = gay + denseij * ( phi * dphidy2 + phi2 * dphidy ); - gaz = gaz + denseij * ( phi * dphidz2 + phi2 * dphidz ); + gax = gax + denseij * ( phi * dphidx2 + phi2 * dphidx ); + gay = gay + denseij * ( phi * dphidy2 + phi2 * dphidy ); + gaz = gaz + denseij * ( phi * dphidz2 + phi2 * dphidz ); #ifdef OSHELL - gbx = gbx + densebij * ( phi * dphidx2 + phi2 * dphidx ); - gby = gby + densebij * ( phi * dphidy2 + phi2 * dphidy ); - gbz = gbz + densebij * ( phi * dphidz2 + phi2 * dphidz ); + gbx = gbx + densebij * ( phi * dphidx2 + phi2 * dphidx ); + gby = gby + densebij * ( phi * dphidy2 + phi2 * dphidy ); + gbz = gbz + densebij * ( phi * dphidz2 + phi2 * dphidz ); #endif - } + } + } } - } #ifdef OSHELL - devSim_dft.densa[gid] = density; - devSim_dft.densb[gid] = densityb; - devSim_dft.gax[gid] = 2.0 * gax; - devSim_dft.gbx[gid] = 2.0 * gbx; - devSim_dft.gay[gid] = 2.0 * gay; - devSim_dft.gby[gid] = 2.0 * gby; - devSim_dft.gaz[gid] = 2.0 * gaz; - devSim_dft.gbz[gid] = 2.0 * gbz; + devSim_dft.densa[gid] = density; + devSim_dft.densb[gid] = densityb; + devSim_dft.gax[gid] = 2.0 * gax; + devSim_dft.gbx[gid] = 2.0 * gbx; + devSim_dft.gay[gid] = 2.0 * gay; + devSim_dft.gby[gid] = 2.0 * gby; + devSim_dft.gaz[gid] = 2.0 * gaz; + devSim_dft.gbz[gid] = 2.0 * gbz; #else - devSim_dft.densa[gid] = density; - 
devSim_dft.densb[gid] = density; - devSim_dft.gax[gid] = gax; - devSim_dft.gbx[gid] = gax; - devSim_dft.gay[gid] = gay; - devSim_dft.gby[gid] = gay; - devSim_dft.gaz[gid] = gaz; - devSim_dft.gbz[gid] = gaz; + devSim_dft.densa[gid] = density; + devSim_dft.densb[gid] = density; + devSim_dft.gax[gid] = gax; + devSim_dft.gbx[gid] = gax; + devSim_dft.gay[gid] = gay; + devSim_dft.gby[gid] = gay; + devSim_dft.gaz[gid] = gaz; + devSim_dft.gbz[gid] = gaz; #endif - } + } } + #ifdef OSHELL __global__ void oshell_getxc_kernel() #else __global__ void cshell_getxc_kernel() #endif { - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble density = devSim_dft.densa[gid]; + QUICKDouble densityb = devSim_dft.densb[gid]; + QUICKDouble gax = devSim_dft.gax[gid]; + QUICKDouble gay = devSim_dft.gay[gid]; + QUICKDouble gaz = devSim_dft.gaz[gid]; + QUICKDouble gbx = devSim_dft.gbx[gid]; + QUICKDouble gby = devSim_dft.gby[gid]; + QUICKDouble gbz = devSim_dft.gbz[gid]; - QUICKDouble weight = devSim_dft.weight[gid]; - QUICKDouble density = devSim_dft.densa[gid]; - QUICKDouble densityb = devSim_dft.densb[gid]; - QUICKDouble gax = devSim_dft.gax[gid]; - QUICKDouble gay = devSim_dft.gay[gid]; - QUICKDouble gaz = devSim_dft.gaz[gid]; - QUICKDouble gbx = devSim_dft.gbx[gid]; - QUICKDouble gby = devSim_dft.gby[gid]; - QUICKDouble gbz = devSim_dft.gbz[gid]; + if(density >devSim_dft.DMCutoff){ - if(density >devSim_dft.DMCutoff){ - - QUICKDouble dfdr; - QUICKDouble xdot, ydot, zdot; - QUICKDouble _tmp ; + QUICKDouble dfdr; + QUICKDouble xdot, ydot, zdot; + QUICKDouble _tmp ; #ifdef OSHELL - QUICKDouble dfdrb; - QUICKDouble xdotb, ydotb, zdotb; + QUICKDouble dfdrb; + QUICKDouble xdotb, ydotb, zdotb; - QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); - QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); - QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); + QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); + QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); + QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); #else - QUICKDouble dot; - QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); - - if (devSim_dft.method == B3LYP) { - _tmp = b3lyp_e(2.0*density, sigma) * weight; - }else if(devSim_dft.method == BLYP){ - _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) - + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)) * weight; - } - - - if (devSim_dft.method == B3LYP) { - dot = b3lypf(2.0*density, sigma, &dfdr); - xdot = dot * gax; - ydot = dot * gay; - zdot = dot * gaz; - }else if(devSim_dft.method == BLYP){ - QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; - 
QUICKDouble dfdr2; - - becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); - lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); - dfdr += dfdr2; - dfdgaa += dfdgaa2; - dfdgab += dfdgab2; - //Calculate the first term in the dot product shown above,i.e.: - //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) - xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; - ydot = 2.0 * dfdgaa * gay + dfdgab * gby; - zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; - }else if(devSim_dft.method == LIBXC){ -#endif - //Prepare in/out for libxc call - double d_rhoa = (double) density; - double d_rhob = (double) densityb; - - // array d_sigma stores gaa, gab and gbb respectively - QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0}; - // array d_vrho stores dfdra and dfdrb respectively - QUICKDouble d_vrho[2] = {0.0, 0.0}; - // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively - QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0}; - QUICKDouble d_zk = 0.0; + QUICKDouble dot; + QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); + + if (devSim_dft.method == B3LYP) { + _tmp = b3lyp_e(2.0*density, sigma) * weight; + }else if(devSim_dft.method == BLYP){ + _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) + + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)) * weight; + } + + + if (devSim_dft.method == B3LYP) { + dot = b3lypf(2.0*density, sigma, &dfdr); + xdot = dot * gax; + ydot = dot * gay; + zdot = dot * gaz; + }else if(devSim_dft.method == BLYP){ + QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; + QUICKDouble dfdr2; + + becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); + lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); + dfdr += dfdr2; + dfdgaa += dfdgaa2; + dfdgab += dfdgab2; + //Calculate the first term in the dot product shown above,i.e.: + //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) + xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; + ydot = 2.0 * dfdgaa * gay + dfdgab * gby; + zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; + }else if(devSim_dft.method == LIBXC){ +#endif + //Prepare in/out for libxc call + double d_rhoa = (double) density; + double d_rhob = (double) densityb; + + // array d_sigma stores gaa, gab and gbb respectively + QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0}; + // array d_vrho stores dfdra and dfdrb respectively + QUICKDouble d_vrho[2] = {0.0, 0.0}; + // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively + QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0}; + QUICKDouble d_zk = 0.0; #ifdef OSHELL - d_sigma[0] = gaa; - d_sigma[1] = gab; - d_sigma[2] = gbb; + d_sigma[0] = gaa; + d_sigma[1] = gab; + d_sigma[2] = gbb; #else - d_sigma[0] = sigma; + d_sigma[0] = sigma; #endif - int nof_functionals = devSim_dft.nauxfunc; - gpu_libxc_info** glinfo = devSim_dft.glinfo; - - for(int i=0; igpu_worker){ - case GPU_WORK_LDA: - gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN); - break; - - case GPU_WORK_GGA_X: - - gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); - break; - - case GPU_WORK_GGA_C: - gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); - break; - } - d_zk += (tmp_d_zk*tmp_glinfo->mix_coeff); - d_vrho[0] += (tmp_d_vrho[0]*tmp_glinfo->mix_coeff); - d_vsigma[0] += (tmp_d_vsigma[0]*tmp_glinfo->mix_coeff); -#ifdef OSHELL - d_vrho[1] += 
(tmp_d_vrho[1] * tmp_glinfo->mix_coeff); - d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); - d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); -#endif + int nof_functionals = devSim_dft.nauxfunc; + gpu_libxc_info** glinfo = devSim_dft.glinfo; - } + for(int i=0; igpu_worker){ + case GPU_WORK_LDA: + gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN); + break; - xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; - ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; - zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; -#else - xdot = 4.0 * d_vsigma[0] * gax; - ydot = 4.0 * d_vsigma[0] * gay; - zdot = 4.0 * d_vsigma[0] * gaz; -#endif + case GPU_WORK_GGA_X: -#ifndef OSHELL - } + gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); + break; + + case GPU_WORK_GGA_C: + gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN); + break; + } + d_zk += (tmp_d_zk*tmp_glinfo->mix_coeff); + d_vrho[0] += (tmp_d_vrho[0]*tmp_glinfo->mix_coeff); + d_vsigma[0] += (tmp_d_vsigma[0]*tmp_glinfo->mix_coeff); +#ifdef OSHELL + d_vrho[1] += (tmp_d_vrho[1] * tmp_glinfo->mix_coeff); + d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); + d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); #endif -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) - val1 = 0ull - val1; - QUICKADD(devSim_dft.DFT_calculated[0].Eelxc, val1); + } - _tmp = weight*density; - val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) - val1 = 0ull - val1; - QUICKADD(devSim_dft.DFT_calculated[0].aelec, val1); + _tmp = ((QUICKDouble) (d_zk * (d_rhoa + d_rhob)) * weight); + dfdr = (QUICKDouble) d_vrho[0]; +#ifdef OSHELL + dfdrb= (QUICKDouble) d_vrho[1]; + xdot = 2.0 * d_vsigma[0] * gax + d_vsigma[1] * gbx; + ydot = 2.0 * d_vsigma[0] * gay + d_vsigma[1] * gby; + zdot = 2.0 * d_vsigma[0] * gaz + d_vsigma[1] * gbz; - _tmp = weight*densityb; - val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) - val1 = 0ull - val1; - QUICKADD(devSim_dft.DFT_calculated[0].belec, val1); + xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; + ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; + zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; #else - atomicAdd(&devSim_dft.DFT_calculated[0].Eelxc, _tmp); - atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight*density); - atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight*densityb); + xdot = 4.0 * d_vsigma[0] * gax; + ydot = 4.0 * d_vsigma[0] * gay; + zdot = 4.0 * d_vsigma[0] * gaz; #endif - for (int i = bfloc_st; i< bfloc_end; ++i) { +#ifndef OSHELL + } +#endif - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; + atomicAdd(&devSim_dft.DFT_calculated[0].Eelxc, _tmp); + atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight*density); + atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight*densityb); - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { - for (int j = bfloc_st; j < bfloc_end; j++) { + for (int i = bfloc_st; i< bfloc_end; ++i) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; - int jbas = 
devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - QUICKDouble _tmp = (phi * phi2 * dfdr + xdot * (phi*dphidx2 + phi2*dphidx) \ - + ydot * (phi*dphidy2 + phi2*dphidy) + zdot * (phi*dphidz2 + phi2*dphidz))*weight; + QUICKDouble _tmp = (phi * phi2 * dfdr + xdot * (phi*dphidx2 + phi2*dphidx) \ + + ydot * (phi*dphidy2 + phi2*dphidy) + zdot * (phi*dphidz2 + phi2*dphidz))*weight; -#ifdef USE_LEGACY_ATOMICS - QUICKULL val1 = (QUICKULL) (fabs( _tmp * OSCALE) + (QUICKDouble)0.5); - if ( _tmp * weight < (QUICKDouble)0.0) val1 = 0ull - val1; - QUICKADD(LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), val1); -#else - atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); -#endif + atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); #ifdef OSHELL - QUICKDouble _tmpb = (phi * phi2 * dfdrb + xdotb * (phi*dphidx2 + phi2*dphidx) - + ydotb * (phi*dphidy2 + phi2*dphidy) + zdotb * (phi*dphidz2 + phi2*dphidz))*weight; + QUICKDouble _tmpb = (phi * phi2 * dfdrb + xdotb * (phi*dphidx2 + phi2*dphidx) + + ydotb * (phi*dphidy2 + phi2*dphidy) + zdotb * (phi*dphidz2 + phi2*dphidz))*weight; -#ifdef USE_LEGACY_ATOMICS - QUICKULL val2 = (QUICKULL) (fabs( _tmpb * OSCALE) + (QUICKDouble)0.5); - if ( _tmpb * weight < (QUICKDouble)0.0) val2 = 0ull - val2; - QUICKADD(LOC2(devSim_dft.obULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), val2); -#else - atomicAdd(&LOC2(devSim_dft.ob, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb); + atomicAdd(&LOC2(devSim_dft.ob, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb); #endif -#endif - } + } + } + } } - } } - } - } + #ifdef OSHELL __global__ void oshell_getxcgrad_kernel() #else __global__ void cshell_getxcgrad_kernel() #endif { - -#ifdef USE_LEGACY_ATOMICS - //declare smem grad vector - extern __shared__ QUICKULL smem_buffer[]; - QUICKULL* smemGrad=(QUICKULL*)smem_buffer; - - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0ull; -#else - //declare smem grad vector - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; - - // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; -#endif - __syncthreads(); - - unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; - - for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { - - int bin_id = devSim_dft.bin_locator[gid]; - int bfloc_st = devSim_dft.basf_locator[bin_id]; - int bfloc_end = devSim_dft.basf_locator[bin_id+1]; - - - QUICKDouble gridx = devSim_dft.gridx[gid]; - QUICKDouble gridy = devSim_dft.gridy[gid]; - QUICKDouble gridz = devSim_dft.gridz[gid]; - QUICKDouble weight = devSim_dft.weight[gid]; - QUICKDouble density = devSim_dft.densa[gid]; - QUICKDouble densityb = devSim_dft.densb[gid]; - QUICKDouble gax = devSim_dft.gax[gid]; - 
QUICKDouble gay = devSim_dft.gay[gid]; - QUICKDouble gaz = devSim_dft.gaz[gid]; - QUICKDouble gbx = devSim_dft.gbx[gid]; - QUICKDouble gby = devSim_dft.gby[gid]; - QUICKDouble gbz = devSim_dft.gbz[gid]; + //declare smem grad vector + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + + // initialize smem grad + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + smemGrad[i]=0.0; + + __syncthreads(); + + unsigned int offset = blockIdx.x*blockDim.x+threadIdx.x; + int totalThreads = blockDim.x*gridDim.x; + + for (QUICKULL gid = offset; gid < devSim_dft.npoints; gid += totalThreads) { + int bin_id = devSim_dft.bin_locator[gid]; + int bfloc_st = devSim_dft.basf_locator[bin_id]; + int bfloc_end = devSim_dft.basf_locator[bin_id+1]; + + QUICKDouble gridx = devSim_dft.gridx[gid]; + QUICKDouble gridy = devSim_dft.gridy[gid]; + QUICKDouble gridz = devSim_dft.gridz[gid]; + QUICKDouble weight = devSim_dft.weight[gid]; + QUICKDouble density = devSim_dft.densa[gid]; + QUICKDouble densityb = devSim_dft.densb[gid]; + QUICKDouble gax = devSim_dft.gax[gid]; + QUICKDouble gay = devSim_dft.gay[gid]; + QUICKDouble gaz = devSim_dft.gaz[gid]; + QUICKDouble gbx = devSim_dft.gbx[gid]; + QUICKDouble gby = devSim_dft.gby[gid]; + QUICKDouble gbz = devSim_dft.gbz[gid]; #ifdef CEW - QUICKDouble dfdr_cew = 0.0; - if(devSim_dft.use_cew) dfdr_cew = devSim_dft.cew_vrecip[gid]; + QUICKDouble dfdr_cew = 0.0; + if(devSim_dft.use_cew) dfdr_cew = devSim_dft.cew_vrecip[gid]; #endif - if(density >devSim_dft.DMCutoff){ + if(density >devSim_dft.DMCutoff){ - QUICKDouble dfdr; - QUICKDouble xdot, ydot, zdot; - QUICKDouble _tmp ; + QUICKDouble dfdr; + QUICKDouble xdot, ydot, zdot; + QUICKDouble _tmp ; #ifdef OSHELL - QUICKDouble dfdrb; - QUICKDouble xdotb, ydotb, zdotb; + QUICKDouble dfdrb; + QUICKDouble xdotb, ydotb, zdotb; - QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); - QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); - QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); + QUICKDouble gaa = (gax * gax + gay * gay + gaz * gaz); + QUICKDouble gab = (gax * gbx + gay * gby + gaz * gbz); + QUICKDouble gbb = (gbx * gbx + gby * gby + gbz * gbz); #else - QUICKDouble dot; - QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz); - - if (devSim_dft.method == B3LYP) { - _tmp = b3lyp_e(2.0*density, sigma); - }else if(devSim_dft.method == BLYP){ - _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz) - + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)); - } - - - if (devSim_dft.method == B3LYP) { - dot = b3lypf(2.0*density, sigma, &dfdr); - xdot = dot * gax; - ydot = dot * gay; - zdot = dot * gaz; - }else if(devSim_dft.method == BLYP){ - QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2; - QUICKDouble dfdr2; - - becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab); - lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2); - dfdr += dfdr2; - dfdgaa += dfdgaa2; - dfdgab += dfdgab2; - - //Calculate the first term in the dot product shown above,i.e.: - //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) doT Grad(Phimu Phinu)) - xdot = 2.0 * dfdgaa * gax + dfdgab * gbx; - ydot = 2.0 * dfdgaa * gay + dfdgab * gby; - zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz; - - }else if(devSim_dft.method == LIBXC){ -#endif - //Prepare in/out for libxc call - QUICKDouble d_rhoa = (QUICKDouble) density; - QUICKDouble d_rhob = (QUICKDouble) densityb; - // array d_sigma stores gaa, gab and gbb respectively - QUICKDouble d_sigma[3] = 
{0.0, 0.0, 0.0};
-            // array d_vrho stores dfdra and dfdrb respectively
-            QUICKDouble d_vrho[2] = {0.0, 0.0};
-            // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively
-            QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0};
-            QUICKDouble d_zk = 0.0;
+                QUICKDouble dot;
+                QUICKDouble sigma = 4.0 * (gax * gax + gay * gay + gaz * gaz);
+
+                if (devSim_dft.method == B3LYP) {
+                    _tmp = b3lyp_e(2.0*density, sigma);
+                }else if(devSim_dft.method == BLYP){
+                    _tmp = (becke_e(density, densityb, gax, gay, gaz, gbx, gby, gbz)
+                            + lyp_e(density, densityb, gax, gay, gaz, gbx, gby, gbz));
+                }
+
+
+                if (devSim_dft.method == B3LYP) {
+                    dot = b3lypf(2.0*density, sigma, &dfdr);
+                    xdot = dot * gax;
+                    ydot = dot * gay;
+                    zdot = dot * gaz;
+                }else if(devSim_dft.method == BLYP){
+                    QUICKDouble dfdgaa, dfdgab, dfdgaa2, dfdgab2;
+                    QUICKDouble dfdr2;
+
+                    becke(density, gax, gay, gaz, gbx, gby, gbz, &dfdr, &dfdgaa, &dfdgab);
+                    lyp(density, densityb, gax, gay, gaz, gbx, gby, gbz, &dfdr2, &dfdgaa2, &dfdgab2);
+                    dfdr += dfdr2;
+                    dfdgaa += dfdgaa2;
+                    dfdgab += dfdgab2;
+
+                    //Calculate the first term in the dot product shown above, i.e.:
+                    //(2 df/dgaa Grad(rho a) + df/dgab Grad(rho b)) dot Grad(Phimu Phinu))
+                    xdot = 2.0 * dfdgaa * gax + dfdgab * gbx;
+                    ydot = 2.0 * dfdgaa * gay + dfdgab * gby;
+                    zdot = 2.0 * dfdgaa * gaz + dfdgab * gbz;
+
+                }else if(devSim_dft.method == LIBXC){
+#endif
+                //Prepare in/out for libxc call
+                QUICKDouble d_rhoa = (QUICKDouble) density;
+                QUICKDouble d_rhob = (QUICKDouble) densityb;
+                // array d_sigma stores gaa, gab and gbb respectively
+                QUICKDouble d_sigma[3] = {0.0, 0.0, 0.0};
+                // array d_vrho stores dfdra and dfdrb respectively
+                QUICKDouble d_vrho[2] = {0.0, 0.0};
+                // array d_vsigma carries dfdgaa, dfdgab and dfdgbb respectively
+                QUICKDouble d_vsigma[3] = {0.0, 0.0, 0.0};
+                QUICKDouble d_zk = 0.0;
 #ifdef OSHELL
-            d_sigma[0] = gaa;
-            d_sigma[1] = gab;
-            d_sigma[2] = gbb;
+                d_sigma[0] = gaa;
+                d_sigma[1] = gab;
+                d_sigma[2] = gbb;
 #else
-            d_sigma[0] = sigma;
+                d_sigma[0] = sigma;
 #endif
-            int nof_functionals = devSim_dft.nauxfunc;
-            gpu_libxc_info** glinfo = devSim_dft.glinfo;
+                int nof_functionals = devSim_dft.nauxfunc;
+                gpu_libxc_info** glinfo = devSim_dft.glinfo;
-            for(int i=0; i<nof_functionals; i++){
-                gpu_libxc_info* tmp_glinfo = glinfo[i];
-
-                QUICKDouble tmp_d_zk = 0.0;
-                QUICKDouble tmp_d_vrho[2] = {0.0, 0.0};
-                QUICKDouble tmp_d_vsigma[3] = {0.0, 0.0, 0.0};
+                for(int i=0; i<nof_functionals; i++){
+                    gpu_libxc_info* tmp_glinfo = glinfo[i];
+
+                    QUICKDouble tmp_d_zk = 0.0;
+                    QUICKDouble tmp_d_vrho[2] = {0.0, 0.0};
+                    QUICKDouble tmp_d_vsigma[3] = {0.0, 0.0, 0.0};
-                switch(tmp_glinfo->gpu_worker){
-                    case GPU_WORK_LDA:
-                        gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN);
-                        break;
+                    switch(tmp_glinfo->gpu_worker){
+                        case GPU_WORK_LDA:
+                            gpu_work_lda_c(tmp_glinfo, d_rhoa, d_rhob, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, NSPIN);
+                            break;
-                    case GPU_WORK_GGA_X:
-                        gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN);
-                        break;
+                        case GPU_WORK_GGA_X:
+                            gpu_work_gga_x(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN);
+                            break;
-                    case GPU_WORK_GGA_C:
-                        gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN);
-                        break;
-                }
-                d_zk += (tmp_d_zk * tmp_glinfo->mix_coeff);
-                d_vrho[0] += (tmp_d_vrho[0] * tmp_glinfo->mix_coeff);
-                d_vsigma[0] += (tmp_d_vsigma[0] * tmp_glinfo->mix_coeff);
+                        case GPU_WORK_GGA_C:
+                            gpu_work_gga_c(tmp_glinfo, d_rhoa, d_rhob, (QUICKDouble*)&d_sigma, &tmp_d_zk, (QUICKDouble*)&tmp_d_vrho, (QUICKDouble*)&tmp_d_vsigma, NSPIN);
+                            break;
+                    }
+                    d_zk += (tmp_d_zk * tmp_glinfo->mix_coeff);
+                    d_vrho[0] += (tmp_d_vrho[0] * tmp_glinfo->mix_coeff);
+                    d_vsigma[0] += (tmp_d_vsigma[0] * tmp_glinfo->mix_coeff);
#ifdef
OSHELL - d_vrho[1] += (tmp_d_vrho[1] * tmp_glinfo->mix_coeff); - d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); - d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); + d_vrho[1] += (tmp_d_vrho[1] * tmp_glinfo->mix_coeff); + d_vsigma[1] += (tmp_d_vsigma[1] * tmp_glinfo->mix_coeff); + d_vsigma[2] += (tmp_d_vsigma[2] * tmp_glinfo->mix_coeff); #endif - } + } - _tmp = ((QUICKDouble) (d_zk * (d_rhoa + d_rhob))); - dfdr = (QUICKDouble) d_vrho[0]; + _tmp = ((QUICKDouble) (d_zk * (d_rhoa + d_rhob))); + dfdr = (QUICKDouble) d_vrho[0]; #ifdef OSHELL - dfdrb= (QUICKDouble) d_vrho[1]; + dfdrb= (QUICKDouble) d_vrho[1]; - xdot = 2.0 * d_vsigma[0] * gax + d_vsigma[1] * gbx; - ydot = 2.0 * d_vsigma[0] * gay + d_vsigma[1] * gby; - zdot = 2.0 * d_vsigma[0] * gaz + d_vsigma[1] * gbz; + xdot = 2.0 * d_vsigma[0] * gax + d_vsigma[1] * gbx; + ydot = 2.0 * d_vsigma[0] * gay + d_vsigma[1] * gby; + zdot = 2.0 * d_vsigma[0] * gaz + d_vsigma[1] * gbz; - xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; - ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; - zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; + xdotb = 2.0 * d_vsigma[2] * gbx + d_vsigma[1] * gax; + ydotb = 2.0 * d_vsigma[2] * gby + d_vsigma[1] * gay; + zdotb = 2.0 * d_vsigma[2] * gbz + d_vsigma[1] * gaz; #else - xdot = 4.0 * d_vsigma[0] * gax; - ydot = 4.0 * d_vsigma[0] * gay; - zdot = 4.0 * d_vsigma[0] * gaz; + xdot = 4.0 * d_vsigma[0] * gax; + ydot = 4.0 * d_vsigma[0] * gay; + zdot = 4.0 * d_vsigma[0] * gaz; #endif #ifndef OSHELL - } + } #endif #ifdef CEW - devSim_dft.exc[gid] = _tmp + (dfdr_cew * (density+densityb)); + devSim_dft.exc[gid] = _tmp + (dfdr_cew * (density+densityb)); #else - devSim_dft.exc[gid] = _tmp; + devSim_dft.exc[gid] = _tmp; #endif - QUICKDouble sumGradx=0.0, sumGrady=0.0, sumGradz=0.0; + QUICKDouble sumGradx=0.0, sumGrady=0.0, sumGradz=0.0; - for (int i = bfloc_st; i< bfloc_end; i++) { - int ibas = devSim_dft.basf[i]; - QUICKDouble phi, dphidx, dphidy, dphidz; - pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + for (int i = bfloc_st; i< bfloc_end; i++) { + int ibas = devSim_dft.basf[i]; + QUICKDouble phi, dphidx, dphidy, dphidz; + pteval_new(gridx, gridy, gridz, &phi, &dphidx, &dphidy, &dphidz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { + if (abs(phi+dphidx+dphidy+dphidz)> devSim_dft.XCCutoff ) { - QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; + QUICKDouble dxdx, dxdy, dxdz, dydy, dydz, dzdz; - pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); + pt2der_new(gridx, gridy, gridz, &dxdx, &dxdy, &dxdz, &dydy, &dydz, &dzdz, devSim_dft.primf, devSim_dft.primf_locator, ibas, i); - int Istart = (devSim_dft.ncenter[ibas]-1) * 3; + int Istart = (devSim_dft.ncenter[ibas]-1) * 3; - for (int j = bfloc_st; j < bfloc_end; j++) { + for (int j = bfloc_st; j < bfloc_end; j++) { + int jbas = devSim_dft.basf[j]; + QUICKDouble phi2, dphidx2, dphidy2, dphidz2; - int jbas = devSim_dft.basf[j]; - QUICKDouble phi2, dphidx2, dphidy2, dphidz2; + pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); - pteval_new(gridx, gridy, gridz, &phi2, &dphidx2, &dphidy2, &dphidz2, devSim_dft.primf, devSim_dft.primf_locator, jbas, j); + QUICKDouble denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); - QUICKDouble 
denseij = (QUICKDouble) LOC2(devSim_dft.dense, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2 + + xdot * (dxdx * phi2 + dphidx * dphidx2) + + ydot * (dxdy * phi2 + dphidx * dphidy2) + + zdot * (dxdz * phi2 + dphidx * dphidz2)); - QUICKDouble Gradx = - 2.0 * denseij * weight * (dfdr * dphidx * phi2 - + xdot * (dxdx * phi2 + dphidx * dphidx2) - + ydot * (dxdy * phi2 + dphidx * dphidy2) - + zdot * (dxdz * phi2 + dphidx * dphidz2)); + QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2 + + xdot * (dxdy * phi2 + dphidy * dphidx2) + + ydot * (dydy * phi2 + dphidy * dphidy2) + + zdot * (dydz * phi2 + dphidy * dphidz2)); - QUICKDouble Grady = - 2.0 * denseij * weight * (dfdr * dphidy * phi2 - + xdot * (dxdy * phi2 + dphidy * dphidx2) - + ydot * (dydy * phi2 + dphidy * dphidy2) - + zdot * (dydz * phi2 + dphidy * dphidz2)); - - QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2 - + xdot * (dxdz * phi2 + dphidz * dphidx2) - + ydot * (dydz * phi2 + dphidz * dphidy2) - + zdot * (dzdz * phi2 + dphidz * dphidz2)); + QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2 + + xdot * (dxdz * phi2 + dphidz * dphidx2) + + ydot * (dydz * phi2 + dphidz * dphidy2) + + zdot * (dzdz * phi2 + dphidz * dphidz2)); #ifdef OSHELL - QUICKDouble densebij = (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); - - Gradx += - 2.0 * densebij * weight * (dfdrb * dphidx * phi2 - + xdotb * (dxdx * phi2 + dphidx * dphidx2) - + ydotb * (dxdy * phi2 + dphidx * dphidy2) - + zdotb * (dxdz * phi2 + dphidx * dphidz2)); - - Grady += - 2.0 * densebij * weight * (dfdrb * dphidy * phi2 - + xdotb * (dxdy * phi2 + dphidy * dphidx2) - + ydotb * (dydy * phi2 + dphidy * dphidy2) - + zdotb * (dydz * phi2 + dphidy * dphidz2)); - - Gradz += - 2.0 * densebij * weight * (dfdrb * dphidz * phi2 - + xdotb * (dxdz * phi2 + dphidz * dphidx2) - + ydotb * (dydz * phi2 + dphidz * dphidy2) - + zdotb * (dzdz * phi2 + dphidz * dphidz2)); + QUICKDouble densebij = (QUICKDouble) LOC2(devSim_dft.denseb, ibas, jbas, devSim_dft.nbasis, devSim_dft.nbasis); + + Gradx += - 2.0 * densebij * weight * (dfdrb * dphidx * phi2 + + xdotb * (dxdx * phi2 + dphidx * dphidx2) + + ydotb * (dxdy * phi2 + dphidx * dphidy2) + + zdotb * (dxdz * phi2 + dphidx * dphidz2)); + + Grady += - 2.0 * densebij * weight * (dfdrb * dphidy * phi2 + + xdotb * (dxdy * phi2 + dphidy * dphidx2) + + ydotb * (dydy * phi2 + dphidy * dphidy2) + + zdotb * (dydz * phi2 + dphidy * dphidz2)); + + Gradz += - 2.0 * densebij * weight * (dfdrb * dphidz * phi2 + + xdotb * (dxdz * phi2 + dphidz * dphidx2) + + ydotb * (dydz * phi2 + dphidz * dphidy2) + + zdotb * (dzdz * phi2 + dphidz * dphidz2)); #endif #ifdef CEW - if(devSim_dft.use_cew){ + if(devSim_dft.use_cew){ #ifdef OSHELL - denseij += densebij; + denseij += densebij; #endif - Gradx -= 2.0 * denseij * weight * dfdr_cew * dphidx * phi2; - Grady -= 2.0 * denseij * weight * dfdr_cew * dphidy * phi2; - Gradz -= 2.0 * denseij * weight * dfdr_cew * dphidz * phi2; + Gradx -= 2.0 * denseij * weight * dfdr_cew * dphidx * phi2; + Grady -= 2.0 * denseij * weight * dfdr_cew * dphidy * phi2; + Gradz -= 2.0 * denseij * weight * dfdr_cew * dphidz * phi2; - } + } #endif -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], Gradx); - GRADADD(smemGrad[Istart+1], Grady); - GRADADD(smemGrad[Istart+2], Gradz); -#else - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - 
atomicAdd(&smemGrad[Istart+2], Gradz); -#endif - sumGradx += Gradx; - sumGrady += Grady; - sumGradz += Gradz; - - } - } - } + atomicAdd(&smemGrad[Istart], Gradx); + atomicAdd(&smemGrad[Istart+1], Grady); + atomicAdd(&smemGrad[Istart+2], Gradz); + sumGradx += Gradx; + sumGrady += Grady; + sumGradz += Gradz; + } + } + } - int Istart = (devSim_dft.gatm[gid]-1) * 3; + int Istart = (devSim_dft.gatm[gid]-1) * 3; -#ifdef USE_LEGACY_ATOMICS - GRADADD(smemGrad[Istart], -sumGradx); - GRADADD(smemGrad[Istart+1], -sumGrady); - GRADADD(smemGrad[Istart+2], -sumGradz); -#else - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); -#endif - } - //Set weights for sswder calculation - if(density < devSim_dft.DMCutoff){ + atomicAdd(&smemGrad[Istart], -sumGradx); + atomicAdd(&smemGrad[Istart+1], -sumGrady); + atomicAdd(&smemGrad[Istart+2], -sumGradz); + } + //Set weights for sswder calculation + if(density < devSim_dft.DMCutoff){ devSim_dft.dweight_ssd[gid] = 0; - } + } - if(devSim_dft.sswt[gid] == 1){ + if(devSim_dft.sswt[gid] == 1){ devSim_dft.dweight_ssd[gid] = 0; - } - - } + } - __syncthreads(); + } - // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) -#ifdef USE_LEGACY_ATOMICS - atomicAdd(&devSim_dft.gradULL[i],smemGrad[i]); -#else - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); -#endif + __syncthreads(); - __syncthreads(); + // update gmem grad vector + for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) + atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + __syncthreads(); } #undef NSPIN diff --git a/src/gpu/hip/gpu_oei.h b/src/gpu/hip/gpu_oei.h index 344f5f011..f7c2de91b 100644 --- a/src/gpu/hip/gpu_oei.h +++ b/src/gpu/hip/gpu_oei.h @@ -63,15 +63,7 @@ __device__ void addint_oei(unsigned int I, unsigned int J, unsigned int II, unsi // LOC2(devSim.KLMN, 2, JJJ - 1, 3,devSim.nbasis)); // } -#if defined(USE_LEGACY_ATOMICS) - QUICKULL Yull = (QUICKULL) (fabs(Y * OSCALE) + (QUICKDouble) 0.5); - if (Y < (QUICKDouble)0.0) Yull = 0ull - Yull; - - // Now add the contribution into Fock matrix. 
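/* A note on the branch being deleted here: on pre-Pascal hardware the
 * contribution Y could not be accumulated with a native double-precision
 * atomicAdd(), so it was first converted to 64-bit fixed point,
 *
 *     QUICKULL Yull = (QUICKULL) (fabs(Y * OSCALE) + 0.5);  // scale, round to nearest
 *     if (Y < 0.0) Yull = 0ull - Yull;                      // two's-complement sign
 *
 * and added with integer atomics.  Anything smaller than the fixed-point
 * resolution (1e-6 and 1e-12 for energy and gradient accumulation,
 * respectively, per the commit message) is rounded away -- the truncation
 * behind the slow and possibly failing SCF convergence this patch fixes.
 * The plain atomicAdd() on the double-precision O matrix just below is
 * what remains after the change. */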
- QUICKADD(LOC2(devSim.oULL, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Yull); -#else atomicAdd(&LOC2(devSim.o, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Y); -#endif //printf("addint_oei: %d %d %f %f %f \n", III, JJJ, devSim.cons[III-1], devSim.cons[JJJ-1], LOCSTORE(store2, i-1, j-1, STOREDIM, STOREDIM)); } diff --git a/src/gpu/hip/gpu_type.h b/src/gpu/hip/gpu_type.h index b57521c78..2bdd8484b 100644 --- a/src/gpu/hip/gpu_type.h +++ b/src/gpu/hip/gpu_type.h @@ -28,81 +28,66 @@ */ template struct gpu_buffer_type; struct gpu_calculated_type { - int natom; // number of atom - int nbasis; // number of basis sets - gpu_buffer_type* o; // O matrix - gpu_buffer_type* ob; // beta O matrix - gpu_buffer_type* dense; // Density Matrix - gpu_buffer_type* denseb; // Beta Density Matrix -#ifdef USE_LEGACY_ATOMICS - gpu_buffer_type* oULL; // Unsigned long long int type O matrix - gpu_buffer_type* obULL; // Unsigned long long int type Ob matrix -#endif - gpu_buffer_type* distance; // distance matrix + int natom; // number of atom + int nbasis; // number of basis sets + gpu_buffer_type* o; // O matrix + gpu_buffer_type* ob; // beta O matrix + gpu_buffer_type* dense; // Density Matrix + gpu_buffer_type* denseb; // Beta Density Matrix + gpu_buffer_type* distance; // distance matrix }; // struct to hold large temporary device arrays -struct gpu_scratch{ - +struct gpu_scratch { gpu_buffer_type* store; // holds temporary primitive integrals in OEI and ERI algorithms gpu_buffer_type* store2; // holds temporary primitive integrals in OEI and ERI algorithms gpu_buffer_type* storeAA; // holds weighted temporary primitive integrals in OEI and ERI gradient algorithms gpu_buffer_type* storeBB; // holds weighted temporary primitive integrals in OEI and ERI gradient algorithms gpu_buffer_type* storeCC; // holds weighted temporary primitive integrals in OEI and ERI gradient algorithms gpu_buffer_type* YVerticalTemp; // holds boys function values - }; -struct gpu_timer_type{ - - double t_2elb; // time for eri load balancing in mgpu version - double t_xclb; // time for xc load balancing in mgpu version - double t_xcrb; // time for xc load re-balancing in mgpu version - double t_xcpg; // grid pruning time - +struct gpu_timer_type { + double t_2elb; // time for eri load balancing in mgpu version + double t_xclb; // time for xc load balancing in mgpu version + double t_xcrb; // time for xc load re-balancing in mgpu version + double t_xcpg; // grid pruning time }; struct gpu_cutoff_type { - int natom; - int nbasis; - int nshell; + int natom; + int nbasis; + int nshell; // the following are for pre-sorting cutoff - int sqrQshell; - gpu_buffer_type* sorted_YCutoffIJ; + int sqrQshell; + gpu_buffer_type* sorted_YCutoffIJ; // Cutoff matrix - gpu_buffer_type* cutMatrix; - gpu_buffer_type* YCutoff; - gpu_buffer_type* cutPrim; + gpu_buffer_type* cutMatrix; + gpu_buffer_type* YCutoff; + gpu_buffer_type* cutPrim; // Cutoff criteria - QUICKDouble integralCutoff; - QUICKDouble coreIntegralCutoff; - QUICKDouble primLimit; - QUICKDouble DMCutoff; - QUICKDouble XCCutoff; - QUICKDouble gradCutoff; + QUICKDouble integralCutoff; + QUICKDouble coreIntegralCutoff; + QUICKDouble primLimit; + QUICKDouble DMCutoff; + QUICKDouble XCCutoff; + QUICKDouble gradCutoff; // One electron pre-sorting cutoff - gpu_buffer_type* sorted_OEICutoffIJ; - + gpu_buffer_type* sorted_OEICutoffIJ; }; struct DFT_calculated_type { -#ifdef USE_LEGACY_ATOMICS - QUICKULL Eelxc; // exchange correction energy - QUICKULL aelec; // alpha electron - QUICKULL belec; 
// beta electron -#else - QUICKDouble Eelxc; // exchange correction energy - QUICKDouble aelec; // alpha electron - QUICKDouble belec; // beta electron -#endif + QUICKDouble Eelxc; // exchange correction energy + QUICKDouble aelec; // alpha electron + QUICKDouble belec; // beta electron }; /*Madu Manathunga 11/21/2019*/ -struct XC_quadrature_type{ +struct XC_quadrature_type { int npoints; //Total number of packed grid points int nbins; //Total number of bins int ntotbf; //Total number of basis functions @@ -114,15 +99,15 @@ struct XC_quadrature_type{ gpu_buffer_type* gridz; //Z coordinate of a grid point gpu_buffer_type* sswt; //A version of weight required for gradients gpu_buffer_type* weight; //Scuzeria weight of a grid point - gpu_buffer_type* gatm; //To which atom does a given grid point belongs to? - gpu_buffer_type* bin_counter; //Keeps track of bin borders - gpu_buffer_type* dweight_ssd; //Dummy weight of grid points for sswder - gpu_buffer_type* basf; //Basis function indices of all grid points - gpu_buffer_type* primf; //Primitive function inidices of all grid points - gpu_buffer_type* primfpbin; //Number of primitive functions per bin - gpu_buffer_type* basf_locator; //Helps accessing b.f. indices of a grid point - gpu_buffer_type* primf_locator; //Helps accessing p.f. indices of a b.f. - gpu_buffer_type* bin_locator; //Helps accessing bin of a grid point + gpu_buffer_type* gatm; //To which atom does a given grid point belongs to? + gpu_buffer_type* bin_counter; //Keeps track of bin borders + gpu_buffer_type* dweight_ssd; //Dummy weight of grid points for sswder + gpu_buffer_type* basf; //Basis function indices of all grid points + gpu_buffer_type* primf; //Primitive function inidices of all grid points + gpu_buffer_type* primfpbin; //Number of primitive functions per bin + gpu_buffer_type* basf_locator; //Helps accessing b.f. indices of a grid point + gpu_buffer_type* primf_locator; //Helps accessing p.f. indices of a b.f. + gpu_buffer_type* bin_locator; //Helps accessing bin of a grid point //Temporary variables gpu_buffer_type* densa; @@ -140,7 +125,7 @@ struct XC_quadrature_type{ gpu_buffer_type* dphidx; // x gradient of a basis function at a grid point gpu_buffer_type* dphidy; // y gradient of a basis function at a grid point gpu_buffer_type* dphidz; // z gradient of a basis function at a grid point - gpu_buffer_type* phi_loc; // stores locations of phi array for each grid point + gpu_buffer_type* phi_loc; // stores locations of phi array for each grid point //Variables for ssw derivative calculation int npoints_ssd; //Total number of input points for ssd @@ -150,7 +135,7 @@ struct XC_quadrature_type{ gpu_buffer_type* gridz_ssd; //Z coordinate of a grid point gpu_buffer_type* exc_ssd; gpu_buffer_type* quadwt; //quadrature weight - gpu_buffer_type* gatm_ssd; //To which atom does a given grid point belongs to? + gpu_buffer_type* gatm_ssd; //To which atom does a given grid point belongs to? gpu_buffer_type* uw_ssd; //Holds unnormalized weights during ssd calculation //Variables for grid weight calculation @@ -160,26 +145,23 @@ struct XC_quadrature_type{ //Variables for obtaining octree info gpu_buffer_type* gpweight; //keeps track of significant grid points for octree pruning - gpu_buffer_type* cfweight; //keeps track of significant b.f. for octree pruning - gpu_buffer_type* pfweight; //keeps track of significant p.f. for octree pruning + gpu_buffer_type* cfweight; //keeps track of significant b.f. 
for octree pruning + gpu_buffer_type* pfweight; //keeps track of significant p.f. for octree pruning // mpi variables - gpu_buffer_type* mpi_bxccompute; + gpu_buffer_type* mpi_bxccompute; // shared memory size int smem_size; //size of shared memory buffer in xc kernels }; -struct lri_data_type{ - +struct lri_data_type { int zeta; gpu_buffer_type* cc; gpu_buffer_type* vrecip; - }; struct gpu_simulation_type { - // basic molecule information and method information QUICK_METHOD method; DFT_calculated_type* DFT_calculated;
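/* Context for the accumulator types above: with USE_LEGACY_ATOMICS gone,
 * every accumulator (O matrices, energies, gradients) is a plain
 * QUICKDouble updated via atomicAdd().  Hardware atomicAdd() on doubles
 * only exists on compute capability 6.0 (Pascal) and newer; for older
 * GPUs the commit message states that the patch substitutes a software
 * emulation at full double precision.  That emulation is not part of this
 * excerpt; the sketch below is the canonical compare-and-swap pattern from
 * the CUDA C Programming Guide that such a fallback follows (QUICK's
 * actual version may differ in naming and placement). */
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*) address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        /* retry the 64-bit compare-and-swap until no other thread has
         * modified *address between our read and our write; comparing the
         * integer bit patterns also terminates correctly on NaN */
        old = atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);
}
#endif
/* Unlike the removed OSCALE fixed-point scheme, this is exact to the last
 * bit of the double-precision sum; the cost is that the CAS loop serializes
 * under heavy contention, the speed-for-correctness trade-off the commit
 * message accepts for pre-Pascal GPUs. */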