Remove improper legacy atomic support for double precision arithmetic and replace with emulation at full double precision for pre-Pascal NVIDIA GPUs (previously toggled via USE_LEGACY_ATOMICS)

Note that the old code was leading to slow and possibly failing SCF convergence, which was only exposed during testing with tighter density matrix convergence thresholds and integral cut-offs. This is likely due to the truncation used for energy and gradient calculations (1e-6 and 1e-12, respectively).
ohearnk committed Aug 20, 2024
1 parent 72782c8 commit 01315f6
Showing 31 changed files with 2,112 additions and 4,010 deletions.
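For context: pre-Pascal NVIDIA GPUs (compute capability below 6.0) have no native double-precision atomicAdd, which is why the removed code fell back to scaled 64-bit integer atomics. The full-precision emulation this commit adopts is presumably along the lines of the well-known atomicCAS loop from the CUDA C Programming Guide, sketched below; the name atomicAddEmu and the architecture guard are illustrative, not taken from the commit.

```cuda
// Minimal sketch of double-precision atomicAdd emulation for SM < 6.0,
// following the standard atomicCAS loop from the CUDA C Programming Guide.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAddEmu(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*) address;
    unsigned long long int old = *address_as_ull;
    unsigned long long int assumed;

    do {
        assumed = old;
        // Reinterpret the bits, add in full double precision, and publish the
        // result only if no other thread updated the location in the meantime.
        old = atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);

    return __longlong_as_double(old);  // value before the add, as atomicAdd returns
}
#endif
```

Unlike the scaled-integer scheme it replaces, this loop loses no precision: the addition itself is an ordinary IEEE double add, and only the compare-and-swap operates on the integer bit pattern.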
21 changes: 0 additions & 21 deletions quick-cmake/QUICKCudaConfig.cmake
@@ -61,50 +61,42 @@ if(CUDA)
message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2 and SM5.3")
message(STATUS "BE AWARE: CUDA 7.5 does not support GTX-1080, Titan-XP, DGX-1, V100 or other Pascal/Volta based GPUs.")
list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif(${CUDA_VERSION} VERSION_EQUAL 8.0)
message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0 and SM6.1,")
message(STATUS "BE AWARE: CUDA 8.0 does not support V100, GV100, Titan-V or later GPUs")
list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 9.0) AND (${CUDA_VERSION} VERSION_LESS 10.0))
message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1 and SM7.0")
list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 10.0) AND (${CUDA_VERSION} VERSION_LESS 11.0))
message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0 and SM7.5")
list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif((${CUDA_VERSION} VERSION_EQUAL 11.0))
message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5 and SM8.0")
list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 11.1) AND (${CUDA_VERSION} VERSION_LESS_EQUAL 11.7))
message(STATUS "Configuring QUICK for SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0 and SM8.6")
list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif((${CUDA_VERSION} VERSION_EQUAL 11.8))
message(STATUS "Configuring QUICK for SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0, SM8.6, SM8.9 and SM9.0")
list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS} ${SM89FLAGS} ${SM90FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 12.0) AND (${CUDA_VERSION} VERSION_LESS 12.5))
message(STATUS "Configuring QUICK for SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0, SM8.6, SM8.9 and SM9.0")
list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS} ${SM89FLAGS} ${SM90FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)

else()
@@ -118,15 +110,13 @@ if(CUDA)
if("${QUICK_USER_ARCH}" MATCHES "kepler")
message(STATUS "Configuring QUICK for SM3.5")
list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)
set(FOUND "TRUE")
endif()

if("${QUICK_USER_ARCH}" MATCHES "maxwell")
message(STATUS "Configuring QUICK for SM5.0")
list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS})
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
set(DISABLE_OPTIMIZER_CONSTANTS TRUE)
set(FOUND "TRUE")
endif()
@@ -280,10 +270,6 @@ if(CUDA)
if(DISABLE_OPTIMIZER_CONSTANTS)
set(CUDA_DEVICE_CODE_FLAGS -Xptxas --disable-optimizer-constants)
endif()

if(USE_LEGACY_ATOMICS)
list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS)
endif()

if(NOT INSIDE_AMBER)
# --------------------------------------------------------------------
@@ -328,16 +314,10 @@ if(HIP)
# add_compile_definitions(QUICK_PLATFORM_AMD_WARP64)
# endif()

# HIP codes currently do not support f-functions with -DUSE_LEGACY_ATOMICS targets (gfx906 and gfx908)
if(ENABLEF AND (("${QUICK_USER_ARCH}" STREQUAL "") OR ("${QUICK_USER_ARCH}" MATCHES "gfx906") OR ("${QUICK_USER_ARCH}" MATCHES "gfx908")))
message(FATAL_ERROR "Error: Unsupported HIP options (ENABLEF with -DUSE_LEGACY_ATOMICS). ${PROJECT_NAME} support for f-functions requires newer HIP architecture targets not using LEGACY_ATOMICS. Please specify architectures with QUICK_USER_ARCH not needing LEGACY_ATOMICS (post-gfx908) or disable f-function support.")
endif()

if( NOT "${QUICK_USER_ARCH}" STREQUAL "")
set(FOUND "FALSE")
if("${QUICK_USER_ARCH}" MATCHES "gfx908")
message(STATUS "Configuring QUICK for gfx908")
list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS)
set(FOUND "TRUE")
endif()

@@ -351,7 +331,6 @@ if(HIP)
message(FATAL_ERROR "Invalid value for QUICK_USER_ARCH. Possible values are gfx908, gfx90a.")
endif()
else()
list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS)
set(QUICK_USER_ARCH "gfx908")
message(STATUS "AMD GPU architecture not specified. Code will be optimized for gfx908.")
endif()
85 changes: 0 additions & 85 deletions src/gpu/cuda/gpu.cu
@@ -1378,36 +1378,8 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou
gpu->gpu_calculated->o = new gpu_buffer_type<QUICKDouble>(gpu->nbasis, gpu->nbasis);
gpu->gpu_calculated->dense = new gpu_buffer_type<QUICKDouble>(dense, gpu->nbasis, gpu->nbasis);

#ifdef USE_LEGACY_ATOMICS
gpu->gpu_calculated->o->DeleteGPU();
gpu->gpu_calculated->oULL = new gpu_buffer_type<QUICKULL>(gpu->nbasis, gpu->nbasis);
gpu->gpu_calculated->oULL->Upload();
gpu->gpu_sim.oULL = gpu->gpu_calculated->oULL->_devData;
#else
gpu->gpu_calculated->o->Upload();
gpu->gpu_sim.o = gpu->gpu_calculated->o->_devData;
#endif

/*
oULL is the unsigned long long int type of O matrix. The reason to do so is because
Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type,
an comprimise way is to multiple a very large number (OSCALE), first and divided it
after atomic operator.
*/
/*
for (int i = 0; i<gpu->nbasis; i++) {
for (int j = 0; j<gpu->nbasis; j++) {
QUICKULL valUII = (QUICKULL) (fabs ( LOC2( gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis)*OSCALE + (QUICKDouble)0.5));
if (LOC2( gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis)<(QUICKDouble)0.0)
{
valUII = 0ull - valUII;
}
LOC2( gpu->gpu_calculated->oULL->_hostData, i, j, gpu->nbasis, gpu->nbasis) = valUII;
}
}
*/

gpu->gpu_calculated->dense->Upload();
gpu->gpu_sim.dense = gpu->gpu_calculated->dense->_devData;
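The deleted comment above documents the scheme this commit removes: accumulate into an unsigned 64-bit integer after scaling by OSCALE, then rescale on download. A hypothetical reconstruction of that scheme, with an assumed OSCALE value (QUICK defines its own):

```cuda
// Illustrative reconstruction of the removed fixed-point accumulation; the
// OSCALE value here is an assumption for the sketch, not QUICK's constant.
#define OSCALE ((double) 1.0e12)

__device__ void legacyAtomicAdd(unsigned long long int* acc, double val)
{
    // Scale, round to the nearest integer tick, and encode the sign via
    // unsigned wraparound (0ull - enc represents a negative contribution).
    unsigned long long int enc =
        (unsigned long long int) (fabs(val) * OSCALE + 0.5);
    if (val < 0.0)
        enc = 0ull - enc;
    atomicAdd(acc, enc);  // 64-bit integer atomicAdd exists on every targeted GPU
}

double legacyDecode(unsigned long long int enc)
{
    // Top-bit-set values are wrapped negatives; the XOR mirrors the decoding
    // loop in gpu_addint_ further down this diff.
    if (enc >= 0x8000000000000000ull)
        return -((double) (enc ^ 0xffffffffffffffffull)) / OSCALE;

    return ((double) enc) / OSCALE;
}
```

The truncation blamed in the commit message falls out of this encoding: every contribution is quantized to 1/OSCALE, and anything smaller than 0.5/OSCALE vanishes before accumulation, which plausibly explains the slow or failing SCF convergence under tighter thresholds.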
@@ -1443,34 +1415,8 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb

gpu->gpu_calculated->ob = new gpu_buffer_type<QUICKDouble>(gpu->nbasis, gpu->nbasis);

#ifdef USE_LEGACY_ATOMICS
gpu->gpu_calculated->ob->DeleteGPU();
gpu->gpu_calculated->obULL = new gpu_buffer_type<QUICKULL>(gpu->nbasis, gpu->nbasis);
gpu->gpu_calculated->obULL->Upload();
gpu->gpu_sim.obULL = gpu->gpu_calculated->obULL->_devData;
#else
gpu->gpu_calculated->ob->Upload();
gpu->gpu_sim.ob = gpu->gpu_calculated->ob->_devData;
#endif

/*
obULL is the unsigned long long int type of Ob matrix. The reason to do so is because
Atomic Operator for CUDA 2.0 is only available for integer. So for double precision type,
an comprimise way is to multiple a very large number (OSCALE), first and divided it
after atomic operator.
*/
/*for (int i = 0; i<gpu->nbasis; i++) {
for (int j = 0; j<gpu->nbasis; j++) {
QUICKULL valUII = (QUICKULL) (fabs ( LOC2( gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis)*OSCALE + (QUICKDouble)0.5));
if (LOC2( gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis)<(QUICKDouble)0.0)
{
valUII = 0ull - valUII;
}
LOC2( gpu->gpu_calculated->obULL->_hostData, i, j, gpu->nbasis, gpu->nbasis) = valUII;
}
}*/

gpu_upload_beta_density_matrix_(denseb);

@@ -1883,12 +1829,6 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff)

gpu->grad = new gpu_buffer_type<QUICKDouble>(3 * gpu->natom);

#ifdef USE_LEGACY_ATOMICS
gpu->gradULL = new gpu_buffer_type<QUICKULL>(3 * gpu->natom);
gpu->gpu_sim.gradULL = gpu->gradULL->_devData;
gpu->gradULL->Upload();
#endif

//gpu->grad->DeleteGPU();
gpu->gpu_sim.grad = gpu->grad->_devData;
gpu->grad->Upload();
@@ -2910,26 +2850,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName)

PRINTDEBUG("COMPLETE KERNEL")

#ifdef USE_LEGACY_ATOMICS
gpu->gpu_calculated->oULL->Download();

for (int i = 0; i< gpu->nbasis; i++) {
for (int j = i; j< gpu->nbasis; j++) {
QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis);
QUICKDouble valDB;

if (valULL >= 0x8000000000000000ull) {
valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull);
}
else
{
valDB = (QUICKDouble) valULL;
}
LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE;
LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis) = (QUICKDouble)valDB*ONEOVEROSCALE;
}
}
#else
gpu->gpu_calculated->o->Download();

for (int i = 0; i< gpu->nbasis; i++) {
@@ -2938,7 +2858,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName)
= LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis);
}
}
#endif
gpu->gpu_calculated->o->Download(o);

#ifdef DEBUG
@@ -2960,10 +2879,6 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName)
delete gpu->gpu_cutoff->YCutoff;
delete gpu->gpu_cutoff->cutPrim;

#ifdef USE_LEGACY_ATOMICS
delete gpu->gpu_calculated->oULL;
#endif

PRINTDEBUG("COMPLETE RUNNING ADDINT")

}
10 changes: 0 additions & 10 deletions src/gpu/cuda/gpu.h
@@ -293,13 +293,8 @@ void bind_eri_texture(_gpu_type gpu);
void unbind_eri_texture();

//__device__ void gpu_shell(unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL);
#ifdef USE_LEGACY_ATOMICS
__device__ void addint(QUICKULL* oULL, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis);
__device__ __forceinline__ void addint_oshell(QUICKULL* oULL, QUICKULL* obULL,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis);
#else
__device__ void addint(QUICKDouble* o, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis);
__device__ __forceinline__ void addint_oshell(QUICKDouble* o, QUICKDouble* ob,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis);
#endif
__device__ __forceinline__ void addint_lri(QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis);
__device__ void FmT_sp(const int MaxM, const QUICKDouble X, QUICKDouble* vals);
__device__ void FmT_spd(const int MaxM, const QUICKDouble X, QUICKDouble* vals);
@@ -621,13 +616,8 @@ __device__ int lefthrr_lri23(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz,
int KLMNAx, int KLMNAy, int KLMNAz,
int KLMNBx, int KLMNBy, int KLMNBz,
int IJTYPE,QUICKDouble* coefAngularL, unsigned char* angularL);
#ifdef USE_LEGACY_ATOMICS
__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKULL* smemGrad, int iparent, int gid);
__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKULL* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom);
#else
__device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid);
__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom);
#endif

__device__ QUICKDouble get_unnormalized_weight(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int iatm);
__device__ QUICKDouble SSW( QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int atm);
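With the #ifdef USE_LEGACY_ATOMICS branches gone, addint, addint_oshell, sswder, and sswanader all accumulate straight into QUICKDouble buffers. Only the QUICKDouble* signatures above come from this commit; the body below is an assumed minimal sketch of what such an accumulation looks like:

```cuda
// Illustrative only. QUICKDouble and the LOC2 2-D indexing macro are defined
// in QUICK's headers; atomicAdd on double is native on SM >= 6.0 and emulated
// via an atomicCAS loop on older devices.
__device__ void addint_sketch(QUICKDouble* o, QUICKDouble Y,
        int III, int JJJ, int nbasis)
{
    // Accumulate one integral contribution into the operator matrix in full
    // double precision (indices in this code are 1-based, hence the -1).
    atomicAdd(&LOC2(o, JJJ - 1, III - 1, nbasis, nbasis), Y);
}
```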
16 changes: 8 additions & 8 deletions src/gpu/cuda/gpu_MP2.cu
@@ -378,7 +378,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5);
if (val1d < (QUICKDouble) 0.0)
val1 = 0ull - val1;
QUICKADD(LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1);
atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1);
// }

// ATOMIC ADD VALUE 2
@@ -393,7 +393,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5);
if (val2d < (QUICKDouble) 0.0)
val2 = 0ull - val2;
QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2);
atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2);
// }
}

@@ -406,7 +406,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
}
if (DENSELJ * Y < (QUICKDouble) 0.0)
val3 = 0ull - val3;
QUICKADD(LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3);
atomicAdd(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3);
//}

// ATOMIC ADD VALUE 4
@@ -415,7 +415,7 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
// if (abs(val4d) > devSim_MP2.integralCutoff) {
QUICKULL val4 = (QUICKULL) (fabs(val4d * OSCALE) + (QUICKDouble) 0.5);
if (val4d < (QUICKDouble) 0.0) val4 = 0ull - val4;
QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4);
atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4);
//}
}

@@ -428,13 +428,13 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
if ((III != JJJ && III < KKK)
|| (III == JJJ && III == KKK && III < LLL)
|| (III == KKK && III < JJJ && JJJ < LLL)) {
QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1,
atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1,
devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5);
}

// ATOMIC ADD VALUE 5 - 2
if (III != JJJ && JJJ == KKK) {
QUICKADD(LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1,
atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1,
devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5);
}
//}
@@ -449,12 +449,12 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned
if (val6d < (QUICKDouble) 0.0)
val6 = 0ull - val6;

QUICKADD(LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1,
atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1,
devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6);

// ATOMIC ADD VALUE 6 - 2
if (JJJ == LLL && III != KKK) {
QUICKADD(LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1,
atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1,
devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6);
}
}
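The recurring 0ull - val idiom above is unsigned wraparound arithmetic: adding 2^64 - val is the same as subtracting val modulo 2^64, which is how the fixed-point representation encodes negative contributions. A tiny standalone check of the idiom (illustrative, not part of the commit):

```cuda
#include <cstdio>

int main()
{
    unsigned long long acc = 1000ull;
    unsigned long long val = 250ull;

    acc += 0ull - val;      // wraps modulo 2^64, equivalent to acc -= val
    printf("%llu\n", acc);  // prints 750

    return 0;
}
```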