diff --git a/quick-cmake/QUICKCudaConfig.cmake b/quick-cmake/QUICKCudaConfig.cmake index 8d5f26ca..4dc877d7 100644 --- a/quick-cmake/QUICKCudaConfig.cmake +++ b/quick-cmake/QUICKCudaConfig.cmake @@ -61,42 +61,50 @@ if(CUDA) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2 and SM5.3") message(STATUS "BE AWARE: CUDA 7.5 does not support GTX-1080, Titan-XP, DGX-1, V100 or other Pascal/Volta based GPUs.") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif(${CUDA_VERSION} VERSION_EQUAL 8.0) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0 and SM6.1,") message(STATUS "BE AWARE: CUDA 8.0 does not support V100, GV100, Titan-V or later GPUs") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 9.0) AND (${CUDA_VERSION} VERSION_LESS 10.0)) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1 and SM7.0") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 10.0) AND (${CUDA_VERSION} VERSION_LESS 11.0)) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0 and SM7.5") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_EQUAL 11.0)) message(STATUS "Configuring QUICK for SM3.0, SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5 and SM8.0") list(APPEND CUDA_NVCC_FLAGS ${SM30FLAGS} ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 11.1) AND (${CUDA_VERSION} VERSION_LESS_EQUAL 11.7)) message(STATUS "Configuring QUICK for SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0 and SM8.6") list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_EQUAL 11.8)) message(STATUS "Configuring QUICK for SM3.5, SM3.7, SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0, SM8.6, SM8.9 and SM9.0") list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS} ${SM37FLAGS} ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS} ${SM89FLAGS} ${SM90FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) elseif((${CUDA_VERSION} VERSION_GREATER_EQUAL 12.0) AND (${CUDA_VERSION} VERSION_LESS 12.5)) message(STATUS "Configuring QUICK for SM5.0, SM5.2, SM5.3, SM6.0, SM6.1, SM7.0, SM7.5, SM8.0, SM8.6, SM8.9 and SM9.0") list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS} ${SM52FLAGS} ${SM53FLAGS} ${SM60FLAGS} ${SM61FLAGS} ${SM70FLAGS} ${SM75FLAGS} ${SM80FLAGS} ${SM86FLAGS} ${SM89FLAGS} ${SM90FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) else() @@ -110,6 +118,7 @@ if(CUDA) if("${QUICK_USER_ARCH}" MATCHES "kepler") message(STATUS "Configuring QUICK for SM3.5") list(APPEND CUDA_NVCC_FLAGS ${SM35FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) set(FOUND "TRUE") endif() @@ -117,6 +126,7 @@ if(CUDA) if("${QUICK_USER_ARCH}" MATCHES "maxwell") message(STATUS "Configuring QUICK for SM5.0") list(APPEND CUDA_NVCC_FLAGS ${SM50FLAGS}) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) set(DISABLE_OPTIMIZER_CONSTANTS TRUE) set(FOUND "TRUE") endif() @@ -270,6 +280,10 @@ if(CUDA) if(DISABLE_OPTIMIZER_CONSTANTS) set(CUDA_DEVICE_CODE_FLAGS -Xptxas --disable-optimizer-constants) endif() + + if(USE_LEGACY_ATOMICS) + list(APPEND CUDA_NVCC_FLAGS -DUSE_LEGACY_ATOMICS) + endif() if(NOT INSIDE_AMBER) # -------------------------------------------------------------------- @@ -318,6 +332,7 @@ if(HIP) set(FOUND "FALSE") if("${QUICK_USER_ARCH}" MATCHES "gfx908") message(STATUS "Configuring QUICK for gfx908") + list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS) set(FOUND "TRUE") endif() @@ -332,6 +347,7 @@ if(HIP) endif() else() set(QUICK_USER_ARCH "gfx908") + list(APPEND AMD_HIP_FLAGS -DUSE_LEGACY_ATOMICS) message(STATUS "AMD GPU architecture not specified. Code will be optimized for gfx908.") endif() diff --git a/src/gpu/cuda/gpu.cu b/src/gpu/cuda/gpu.cu index 21e7036a..f7e19566 100644 --- a/src/gpu/cuda/gpu.cu +++ b/src/gpu/cuda/gpu.cu @@ -57,7 +57,7 @@ extern "C" void gpu_startup_(int* ierr) #if defined DEBUG || defined DEBUGTIME gpu->debugFile = debugFile; #endif - PRINTDEBUG("CREATE NEW GPU") + PRINTDEBUG("CREATE NEW GPU"); } @@ -66,7 +66,7 @@ extern "C" void gpu_startup_(int* ierr) //----------------------------------------------- extern "C" void gpu_init_(int* ierr) { - PRINTDEBUG("BEGIN TO INIT") + PRINTDEBUG("BEGIN TO INIT"); int device = -1; int gpuCount = 0; cudaError_t status; @@ -185,7 +185,7 @@ extern "C" void gpu_init_(int* ierr) gpu->sswGradThreadsPerBlock = SM_2X_SSW_GRAD_THREADS_PER_BLOCK; } - PRINTDEBUG("FINISH INIT") + PRINTDEBUG("FINISH INIT"); } @@ -288,7 +288,7 @@ extern "C" void gpu_deallocate_scratch_(bool* deallocate_gradient_scratch) //----------------------------------------------- extern "C" void gpu_shutdown_(int* ierr) { - PRINTDEBUG("BEGIN TO SHUTDOWN") + PRINTDEBUG("BEGIN TO SHUTDOWN"); delete gpu; cudaDeviceReset( ); @@ -313,7 +313,7 @@ extern "C" void gpu_setup_(int* natom, int* nbasis, int* nElec, int* imult, int* cudaEventCreate(&start); cudaEventCreate(&end); cudaEventRecord(start, 0); - PRINTDEBUG("BEGIN TO SETUP") + PRINTDEBUG("BEGIN TO SETUP"); #if defined(MPIV_GPU) fprintf(gpu->debugFile,"mpirank %i natoms %i \n", gpu->mpirank, *natom ); @@ -356,7 +356,7 @@ extern "C" void gpu_setup_(int* natom, int* nbasis, int* nElec, int* imult, int* cudaEventDestroy(start); cudaEventDestroy(end); - PRINTDEBUG("FINISH SETUP") + PRINTDEBUG("FINISH SETUP"); #endif @@ -431,7 +431,7 @@ extern "C" void gpu_upload_xyz_(QUICKDouble* atom_xyz) cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD COORDINATES") + PRINTDEBUG("BEGIN TO UPLOAD COORDINATES"); // gpu->gpu_basis->xyz = new gpu_buffer_type(atom_xyz, 3, gpu->natom); // gpu->gpu_basis->xyz->Upload( ); gpu->gpu_calculated->distance = new gpu_buffer_type(gpu->natom, gpu->natom); @@ -465,7 +465,7 @@ extern "C" void gpu_upload_xyz_(QUICKDouble* atom_xyz) cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING COORDINATES") + PRINTDEBUG("COMPLETE UPLOADING COORDINATES"); } @@ -474,7 +474,7 @@ extern "C" void gpu_upload_xyz_(QUICKDouble* atom_xyz) //----------------------------------------------- extern "C" void gpu_upload_atom_and_chg_(int* atom, QUICKDouble* atom_chg) { - PRINTDEBUG("BEGIN TO UPLOAD ATOM AND CHARGE") + PRINTDEBUG("BEGIN TO UPLOAD ATOM AND CHARGE"); gpu->iattype = new gpu_buffer_type(atom, gpu->natom); gpu->chg = new gpu_buffer_type(atom_chg, gpu->natom); @@ -484,7 +484,7 @@ extern "C" void gpu_upload_atom_and_chg_(int* atom, QUICKDouble* atom_chg) gpu->gpu_sim.chg = gpu->chg->_devData; gpu->gpu_sim.iattype = gpu->iattype->_devData; - PRINTDEBUG("COMPLETE UPLOADING ATOM AND CHARGE") + PRINTDEBUG("COMPLETE UPLOADING ATOM AND CHARGE"); } @@ -502,7 +502,7 @@ extern "C" void gpu_upload_cutoff_(QUICKDouble* cutMatrix, QUICKDouble* integral cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD CUTOFF") + PRINTDEBUG("BEGIN TO UPLOAD CUTOFF"); gpu->gpu_cutoff->integralCutoff = *integralCutoff; gpu->gpu_cutoff->coreIntegralCutoff = *coreIntegralCutoff; @@ -529,7 +529,7 @@ extern "C" void gpu_upload_cutoff_(QUICKDouble* cutMatrix, QUICKDouble* integral cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING CUTOFF") + PRINTDEBUG("COMPLETE UPLOADING CUTOFF"); } @@ -546,7 +546,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD CUTOFF") + PRINTDEBUG("BEGIN TO UPLOAD CUTOFF"); gpu->gpu_cutoff->natom = gpu->natom; gpu->gpu_cutoff->YCutoff = new gpu_buffer_type(YCutoff, gpu->nshell, gpu->nshell); @@ -613,7 +613,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) { @@ -649,7 +649,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP break; } flag = true; - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); if (b != 0) { @@ -702,7 +702,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 1") + PRINTDEBUG("FINISH STEP 1"); #ifdef DEBUG fprintf(gpu->debugFile,"a=%i b=%i\n", a, b); #endif @@ -731,7 +731,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP if (flag == true) break; } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) { @@ -786,7 +786,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); } } } @@ -806,7 +806,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP // } /* - PRINTDEBUG("WORKING on F Orbital") + PRINTDEBUG("WORKING on F Orbital"); gpu->gpu_basis->fStart = a; gpu->gpu_sim.fStart = a; @@ -844,7 +844,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 1") + PRINTDEBUG("FINISH STEP 1"); printf("a=%i b=%i\n", a, b); for (int i = 0; i < b - 1; i ++) { @@ -872,7 +872,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP break; } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) @@ -910,7 +910,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } flag = true; - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); } } @@ -996,7 +996,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } - PRINTDEBUG("FINISH STEP 1") + PRINTDEBUG("FINISH STEP 1"); #ifdef DEBUG fprintf(gpu->debugFile,"a=%i b=%i\n", a, b); #endif @@ -1026,7 +1026,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP break; } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) @@ -1064,7 +1064,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } flag = true; - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); } } } @@ -1182,7 +1182,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING CUTOFF") + PRINTDEBUG("COMPLETE UPLOADING CUTOFF"); } @@ -1369,13 +1369,20 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD O MATRIX") + PRINTDEBUG("BEGIN TO UPLOAD O MATRIX"); gpu->gpu_calculated->o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - gpu->gpu_calculated->dense = new gpu_buffer_type(dense, gpu->nbasis, gpu->nbasis); + gpu->gpu_calculated->dense = new gpu_buffer_type(dense, gpu->nbasis, gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->o->DeleteGPU(); + gpu->gpu_calculated->oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + gpu->gpu_calculated->oULL->Upload(); + gpu->gpu_sim.oULL = gpu->gpu_calculated->oULL->_devData; +#else gpu->gpu_calculated->o->Upload(); gpu->gpu_sim.o = gpu->gpu_calculated->o->_devData; +#endif gpu->gpu_calculated->dense->Upload(); gpu->gpu_sim.dense = gpu->gpu_calculated->dense->_devData; @@ -1391,7 +1398,7 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING O MATRIX") + PRINTDEBUG("COMPLETE UPLOADING O MATRIX"); } @@ -1407,12 +1414,19 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD BETA O MATRIX") + PRINTDEBUG("BEGIN TO UPLOAD BETA O MATRIX"); gpu->gpu_calculated->ob = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->ob->DeleteGPU(); + gpu->gpu_calculated->obULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + gpu->gpu_calculated->obULL->Upload(); + gpu->gpu_sim.obULL = gpu->gpu_calculated->obULL->_devData; +#else gpu->gpu_calculated->ob->Upload(); gpu->gpu_sim.ob = gpu->gpu_calculated->ob->_devData; +#endif gpu_upload_beta_density_matrix_(denseb); @@ -1426,7 +1440,7 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING BETA O MATRIX") + PRINTDEBUG("COMPLETE UPLOADING BETA O MATRIX"); } @@ -1466,7 +1480,7 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD BASIS") + PRINTDEBUG("BEGIN TO UPLOAD BASIS"); gpu->gpu_basis->nshell = *nshell; gpu->gpu_basis->nprim = *nprim; @@ -1808,7 +1822,7 @@ move p orbital to the end of the sequence. so the Qshell stands for the length o cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING BASIS") + PRINTDEBUG("COMPLETE UPLOADING BASIS"); } @@ -1821,12 +1835,18 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff) cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD GRAD") + PRINTDEBUG("BEGIN TO UPLOAD GRAD"); gpu->grad = new gpu_buffer_type(3 * gpu->natom); +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL = new gpu_buffer_type(3 * gpu->natom); + gpu->gpu_sim.gradULL = gpu->gradULL->_devData; + gpu->gradULL->Upload(); +#endif + //gpu->grad->DeleteGPU(); - gpu->gpu_sim.grad = gpu->grad->_devData; + gpu->gpu_sim.grad = gpu->grad->_devData; gpu->grad->Upload(); @@ -1843,7 +1863,7 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff) cudaEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING GRAD") + PRINTDEBUG("COMPLETE UPLOADING GRAD"); } @@ -1905,9 +1925,9 @@ extern "C" void gpu_upload_cew_vrecip_(int *ierr) //Computes grid weights before grid point packing extern "C" void gpu_get_ssw_(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble *gridz, QUICKDouble *wtang, QUICKDouble *rwt, QUICKDouble *rad3, QUICKDouble *sswt, QUICKDouble *weight, int *gatm, int *count){ - PRINTDEBUG("BEGIN TO COMPUTE SSW") + PRINTDEBUG("BEGIN TO COMPUTE SSW"); - gpu->gpu_xcq->npoints = *count; + gpu->gpu_xcq->npoints = *count; gpu->xc_threadsPerBlock = SM_2X_XC_THREADS_PER_BLOCK; gpu->gpu_xcq->gridx = new gpu_buffer_type(gridx, gpu->gpu_xcq->npoints); @@ -1963,14 +1983,14 @@ extern "C" void gpu_get_ssw_(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble SAFE_DELETE(gpu->gpu_xcq->sswt); SAFE_DELETE(gpu->gpu_xcq->weight); - PRINTDEBUG("END COMPUTE SSW") + PRINTDEBUG("END COMPUTE SSW"); } void prune_grid_sswgrad() { - PRINTDEBUG("BEGIN TO UPLOAD DFT GRID FOR SSWGRAD") + PRINTDEBUG("BEGIN TO UPLOAD DFT GRID FOR SSWGRAD"); #ifdef MPIV_GPU cudaEvent_t t_startp, t_endp; float t_timep; @@ -2095,7 +2115,7 @@ void prune_grid_sswgrad() upload_sim_to_constant_dft(gpu); - PRINTDEBUG("COMPLETE UPLOADING DFT GRID FOR SSWGRAD") + PRINTDEBUG("COMPLETE UPLOADING DFT GRID FOR SSWGRAD"); //Clean up temporary arrays free(tmp_gridx); @@ -2121,7 +2141,7 @@ void prune_grid_sswgrad() void gpu_get_octree_info(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble *gridz, QUICKDouble *sigrad2, unsigned char *gpweight, unsigned int *cfweight, unsigned int *pfweight, int *bin_locator, int count, double DMCutoff, double XCCutoff, int nbins) { - PRINTDEBUG("BEGIN TO OBTAIN PRIMITIVE & BASIS FUNCTION LISTS ") + PRINTDEBUG("BEGIN TO OBTAIN PRIMITIVE & BASIS FUNCTION LISTS "); gpu->gpu_xcq->npoints = count; gpu->xc_threadsPerBlock = SM_2X_XCGRAD_THREADS_PER_BLOCK; @@ -2229,14 +2249,14 @@ void gpu_get_octree_info(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble *gr cudaFree(d_cfweight); cudaFree(d_pfweight); - PRINTDEBUG("PRIMITIVE & BASIS FUNCTION LISTS OBTAINED") + PRINTDEBUG("PRIMITIVE & BASIS FUNCTION LISTS OBTAINED"); } #ifdef DEBUG void print_uploaded_dft_info() { - PRINTDEBUG("PRINTING UPLOADED DFT DATA") + PRINTDEBUG("PRINTING UPLOADED DFT DATA"); fprintf(gpu->debugFile,"Number of grid points: %i \n", gpu->gpu_xcq->npoints); fprintf(gpu->debugFile,"Bin size: %i \n", gpu->gpu_xcq->bin_size); @@ -2244,7 +2264,7 @@ void print_uploaded_dft_info() fprintf(gpu->debugFile,"Number of total basis functions: %i \n", gpu->gpu_xcq->ntotbf); fprintf(gpu->debugFile,"Number of total primitive functions: %i \n", gpu->gpu_xcq->ntotpf); - PRINTDEBUG("GRID POINTS & WEIGHTS") + PRINTDEBUG("GRID POINTS & WEIGHTS"); for(int i=0; igpu_xcq->npoints; i++){ fprintf(gpu->debugFile,"Grid: %i x=%f y=%f z=%f sswt=%f weight=%f gatm=%i dweight_ssd=%i \n",i, @@ -2253,7 +2273,7 @@ void print_uploaded_dft_info() gpu->gpu_xcq->dweight_ssd->_hostData[i]); } - PRINTDEBUG("BASIS & PRIMITIVE FUNCTION LISTS") + PRINTDEBUG("BASIS & PRIMITIVE FUNCTION LISTS"); for(int bin_id=0; bin_idgpu_xcq->nbins; bin_id++){ @@ -2270,7 +2290,7 @@ void print_uploaded_dft_info() } - PRINTDEBUG("RADIUS OF SIGNIFICANCE") + PRINTDEBUG("RADIUS OF SIGNIFICANCE"); for(int i=0; inbasis; i++){ fprintf(gpu->debugFile,"ibas=%i sigrad2=%f \n", i, gpu->gpu_basis->sigrad2->_hostData[i]); @@ -2284,7 +2304,7 @@ void print_uploaded_dft_info() } } - PRINTDEBUG("END PRINTING UPLOADED DFT DATA") + PRINTDEBUG("END PRINTING UPLOADED DFT DATA"); } #endif @@ -2294,7 +2314,7 @@ extern "C" void gpu_upload_dft_grid_(QUICKDouble *gridxb, QUICKDouble *gridyb, Q *bin_counter,int *gridb_count, int *nbins, int *nbtotbf, int *nbtotpf, int *isg, QUICKDouble *sigrad2, QUICKDouble *DMCutoff, QUICKDouble *XCCutoff){ - PRINTDEBUG("BEGIN TO UPLOAD DFT GRID") + PRINTDEBUG("BEGIN TO UPLOAD DFT GRID"); gpu->gpu_xcq->npoints = *gridb_count; gpu->gpu_xcq->nbins = *nbins; @@ -2419,7 +2439,7 @@ extern "C" void gpu_upload_dft_grid_(QUICKDouble *gridxb, QUICKDouble *gridyb, Q print_uploaded_dft_info(); #endif - PRINTDEBUG("COMPLETE UPLOADING DFT GRID") + PRINTDEBUG("COMPLETE UPLOADING DFT GRID"); // int nblocks = (int) ((*gridb_count / SM_2X_XC_THREADS_PER_BLOCK) + 1); // test_xc_upload <<>>( ); @@ -2432,7 +2452,7 @@ extern "C" void gpu_upload_dft_grid_(QUICKDouble *gridxb, QUICKDouble *gridyb, Q //----------------------------------------------- extern "C" void gpu_reupload_dft_grid_() { - PRINTDEBUG("BEGIN TO UPLOAD DFT GRID") + PRINTDEBUG("BEGIN TO UPLOAD DFT GRID"); gpu->gpu_xcq->gridx->ReallocateGPU(); gpu->gpu_xcq->gridy->ReallocateGPU(); @@ -2506,7 +2526,7 @@ extern "C" void gpu_reupload_dft_grid_() reupload_pteval( ); - PRINTDEBUG("COMPLETE UPLOADING DFT GRID") + PRINTDEBUG("COMPLETE UPLOADING DFT GRID"); } @@ -2515,7 +2535,7 @@ extern "C" void gpu_reupload_dft_grid_() //----------------------------------------------- extern "C" void gpu_delete_dft_dev_grid_() { - PRINTDEBUG("DEALLOCATING DFT GRID") + PRINTDEBUG("DEALLOCATING DFT GRID"); gpu->gpu_xcq->gridx->DeleteGPU(); gpu->gpu_xcq->gridy->DeleteGPU(); @@ -2542,13 +2562,13 @@ extern "C" void gpu_delete_dft_dev_grid_() delete_pteval(true); - PRINTDEBUG("FINISHED DEALLOCATING DFT GRID") + PRINTDEBUG("FINISHED DEALLOCATING DFT GRID"); } extern "C" void gpu_delete_dft_grid_() { - PRINTDEBUG("DEALLOCATING DFT GRID") + PRINTDEBUG("DEALLOCATING DFT GRID"); SAFE_DELETE(gpu->gpu_xcq->gridx); SAFE_DELETE(gpu->gpu_xcq->gridy); @@ -2578,13 +2598,13 @@ extern "C" void gpu_delete_dft_grid_() SAFE_DELETE(gpu->gpu_xcq->mpi_bxccompute); #endif delete_pteval(false); - PRINTDEBUG("FINISHED DEALLOCATING DFT GRID") + PRINTDEBUG("FINISHED DEALLOCATING DFT GRID"); } void gpu_delete_sswgrad_vars() { - PRINTDEBUG("DEALLOCATING SSWGRAD VARIABLES") + PRINTDEBUG("DEALLOCATING SSWGRAD VARIABLES"); SAFE_DELETE(gpu->gpu_xcq->gridx_ssd); SAFE_DELETE(gpu->gpu_xcq->gridy_ssd); @@ -2597,7 +2617,7 @@ void gpu_delete_sswgrad_vars() SAFE_DELETE(gpu->gpu_xcq->mpi_bxccompute); #endif - PRINTDEBUG("FINISHED DEALLOCATING SSWGRAD VARIABLES") + PRINTDEBUG("FINISHED DEALLOCATING SSWGRAD VARIABLES"); } @@ -2658,7 +2678,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) cudaEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO RUN ADD INT") + PRINTDEBUG("BEGIN TO RUN ADD INT"); FILE *intFile; int aBuffer[BUFFERSIZE], bBuffer[BUFFERSIZE]; @@ -2687,7 +2707,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) cudaStreamCreate( &stream[i] ); } - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); upload_sim_to_constant(gpu); @@ -2844,16 +2864,36 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) delete gpu->aoint_buffer[i]; } - PRINTDEBUG("COMPLETE KERNEL") - + PRINTDEBUG("COMPLETE KERNEL"); + +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } + else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + } + } +#else gpu->gpu_calculated->o->Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } +#endif gpu->gpu_calculated->o->Download(o); #ifdef DEBUG @@ -2866,7 +2906,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) cudaEventDestroy(end); #endif - PRINTDEBUG("DELETE TEMP VARIABLES") + PRINTDEBUG("DELETE TEMP VARIABLES"); delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; @@ -2874,9 +2914,11 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) delete gpu->gpu_cutoff->sorted_YCutoffIJ; delete gpu->gpu_cutoff->YCutoff; delete gpu->gpu_cutoff->cutPrim; +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gpu_calculated->oULL; +#endif - PRINTDEBUG("COMPLETE RUNNING ADDINT") - + PRINTDEBUG("COMPLETE RUNNING ADDINT"); } #endif @@ -2897,7 +2939,7 @@ char *trim(char *s) #ifdef COMPILE_GPU_AOINT extern "C" void gpu_aoint_(QUICKDouble* leastIntegralCutoff, QUICKDouble* maxIntegralCutoff, int* intNum, char* intFileName) { - PRINTDEBUG("BEGIN TO RUN AOINT") + PRINTDEBUG("BEGIN TO RUN AOINT"); ERI_entry a; FILE *intFile; @@ -3115,7 +3157,7 @@ extern "C" void gpu_aoint_(QUICKDouble* leastIntegralCutoff, QUICKDouble* maxInt #ifdef DEBUG fprintf(gpu->debugFile," TOTAL INT = %i \n", *intNum); #endif - PRINTDEBUG("END TO RUN AOINT KERNEL") + PRINTDEBUG("END TO RUN AOINT KERNEL"); #ifdef DEBUG clock_t end_cpu = clock(); diff --git a/src/gpu/cuda/gpu.h b/src/gpu/cuda/gpu.h index 0ae9732a..3c385e05 100644 --- a/src/gpu/cuda/gpu.h +++ b/src/gpu/cuda/gpu.h @@ -294,8 +294,6 @@ void bind_eri_texture(_gpu_type gpu); void unbind_eri_texture(); //__device__ void gpu_shell(unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL); -__device__ void addint(QUICKDouble* o, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); -__device__ __forceinline__ void addint_oshell(QUICKDouble* o, QUICKDouble* ob,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis); __device__ void FmT_sp(const int MaxM, const QUICKDouble X, QUICKDouble* vals); __device__ void FmT_spd(const int MaxM, const QUICKDouble X, QUICKDouble* vals); __device__ void FmT(const int MaxM, const QUICKDouble X, QUICKDouble* vals); @@ -617,7 +615,14 @@ __device__ int lefthrr_lri23(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz, int KLMNBx, int KLMNBy, int KLMNBz, int IJTYPE,QUICKDouble* coefAngularL, unsigned char* angularL); __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid); -__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom); +__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, + const QUICKDouble Exc, const QUICKDouble quadwt, +#if defined(USE_LEGACY_ATOMICS) + QUICKULL* const smemGrad, +#else + QUICKDouble* const smemGrad, +#endif + QUICKDouble* const uw_ssd, const int iparent, const int natom); __device__ QUICKDouble get_unnormalized_weight(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int iatm); __device__ QUICKDouble SSW( QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int atm); diff --git a/src/gpu/cuda/gpu_MP2.cu b/src/gpu/cuda/gpu_MP2.cu index e98c6348..0defbb87 100644 --- a/src/gpu/cuda/gpu_MP2.cu +++ b/src/gpu/cuda/gpu_MP2.cu @@ -66,19 +66,20 @@ void get2e_MP2(_gpu_type gpu) #endif } + __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel() { - unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; + unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x; + int totalThreads = blockDim.x * gridDim.x; QUICKULL jshell = (QUICKULL) devSim_MP2.sqrQshell; QUICKULL myInt = (QUICKULL) jshell * jshell / totalThreads; - if ((jshell * jshell - myInt * totalThreads) > offside) + if (jshell * jshell - myInt * totalThreads > offset) myInt++; for (QUICKULL i = 1; i <= myInt; i++) { - QUICKULL currentInt = totalThreads * (i - 1) + offside; + QUICKULL currentInt = totalThreads * (i - 1) + offset; QUICKULL a = (QUICKULL) currentInt / jshell; QUICKULL b = (QUICKULL) (currentInt - a * jshell); @@ -128,13 +129,16 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kerne if (ii <= kk) { int nshell = devSim_MP2.nshell; QUICKDouble DNMax = MAX( - MAX(4.0 * LOC2(devSim_MP2.cutMatrix, ii, jj, nshell, nshell), 4.0 * LOC2(devSim_MP2.cutMatrix, kk, ll, nshell, nshell)), - MAX(MAX(LOC2(devSim_MP2.cutMatrix, ii, ll, nshell, nshell), LOC2(devSim_MP2.cutMatrix, ii, kk, nshell, nshell)), - MAX(LOC2(devSim_MP2.cutMatrix, jj, kk, nshell, nshell), LOC2(devSim_MP2.cutMatrix, jj, ll, nshell, nshell)))); - - if ((LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell)) + MAX(4.0 * LOC2(devSim_MP2.cutMatrix, ii, jj, nshell, nshell), + 4.0 * LOC2(devSim_MP2.cutMatrix, kk, ll, nshell, nshell)), + MAX(MAX(LOC2(devSim_MP2.cutMatrix, ii, ll, nshell, nshell), + LOC2(devSim_MP2.cutMatrix, ii, kk, nshell, nshell)), + MAX(LOC2(devSim_MP2.cutMatrix, jj, kk, nshell, nshell), + LOC2(devSim_MP2.cutMatrix, jj, ll, nshell, nshell)))); + + if (LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell) > devSim_MP2.integralCutoff - && (LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell) * DNMax) + && LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell) * DNMax > devSim_MP2.integralCutoff) { int iii = devSim_MP2.sorted_Qnumber[II]; int jjj = devSim_MP2.sorted_Qnumber[JJ]; @@ -347,7 +351,13 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned for (int III = III1; III <= III2; III++) { for (int JJJ = MAX(III, JJJ1); JJJ <= JJJ2; JJJ++) { + QUICKDouble o_JI = 0.0; + for (int KKK = MAX(III, KKK1); KKK <= KKK2; KKK++) { + QUICKDouble o_KI = 0.0; + QUICKDouble o_JK = 0.0; + QUICKDouble o_JK_MM = 0.0; + for (int LLL = MAX(KKK, LLL1); LLL <= LLL2; LLL++) { if (III < KKK || (III == JJJ && III == LLL) @@ -368,102 +378,108 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned QUICKDouble DENSEJI = (QUICKDouble) LOC2(devSim_MP2.dense, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis); // ATOMIC ADD VALUE 1 - QUICKDouble _tmp = 2.0; - if (KKK == LLL) { - _tmp = 1.0; - } - - QUICKDouble val1d = _tmp * DENSELK * Y; - //if (abs(val1d) > devSim_MP2.integralCutoff) { - QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5); - if (val1d < (QUICKDouble) 0.0) - val1 = 0ull - val1; - atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); - // } + temp = (KKK == LLL) ? DENSELK * Y : 2.0 * DENSELK * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { + o_JI += temp; +// } // ATOMIC ADD VALUE 2 if (LLL != JJJ || III != KKK) { - _tmp = 2.0; - if (III == JJJ) { - _tmp = 1.0; - } - - QUICKDouble val2d = _tmp * DENSEJI * Y; - // if (abs(val2d) > devSim_MP2.integralCutoff) { - QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5); - if (val2d < (QUICKDouble) 0.0) - val2 = 0ull - val2; - atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); - // } + temp = (III == JJJ) ? DENSEJI * Y : 2.0 * DENSEJI * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +#endif +// } } // ATOMIC ADD VALUE 3 - QUICKDouble val3d = hybrid_coeff * 0.5 * DENSELJ * Y; - //if (abs(2 * val3d) > devSim_MP2.integralCutoff) { - QUICKULL val3 = (QUICKULL) (fabs(val3d * OSCALE) + (QUICKDouble) 0.5); - if (III == KKK && III < JJJ && JJJ < LLL) { - val3 = (QUICKULL) (fabs(2 * val3d * OSCALE) + (QUICKDouble) 0.5); - } - if (DENSELJ * Y < (QUICKDouble) 0.0) - val3 = 0ull - val3; - atomicAdd(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3); - //} + temp = (III == KKK && III < JJJ && JJJ < LLL) + ? -(hybrid_coeff * DENSELJ * Y) : -0.5 * hybrid_coeff * DENSELJ * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { + o_KI += temp; +// } // ATOMIC ADD VALUE 4 if (KKK != LLL) { - QUICKDouble val4d = hybrid_coeff * 0.5 * DENSEKJ * Y; - // if (abs(val4d) > devSim_MP2.integralCutoff) { - QUICKULL val4 = (QUICKULL) (fabs(val4d * OSCALE) + (QUICKDouble) 0.5); - if (val4d < (QUICKDouble) 0.0) val4 = 0ull - val4; - atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4); - //} + temp = -0.5 * hybrid_coeff * DENSEKJ * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim_MP2.o, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +# endif +// } } // ATOMIC ADD VALUE 5 - QUICKDouble val5d = hybrid_coeff * 0.5 * DENSELI * Y; - //if (abs(val5d) > devSim_MP2.integralCutoff) { - QUICKULL val5 = (QUICKULL) (fabs(val5d * OSCALE) + (QUICKDouble) 0.5); - if (val5d < (QUICKDouble) 0.0) val5 = 0ull - val5; - + temp = -0.5 * hybrid_coeff * DENSELI * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { if ((III != JJJ && III < KKK) || (III == JJJ && III == KKK && III < LLL) - || (III == KKK && III < JJJ && JJJ < LLL)) { - atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); + || (III == KKK && III < JJJ && JJJ < LLL)) { + o_JK_MM += temp; } // ATOMIC ADD VALUE 5 - 2 if (III != JJJ && JJJ == KKK) { - atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); + o_JK += temp; } - //} +// } // ATOMIC ADD VALUE 6 - if (III != JJJ) { - if (KKK != LLL) { - QUICKDouble val6d = hybrid_coeff * 0.5 * DENSEKI * Y; - // if (abs(val6d) > devSim_MP2.integralCutoff) { - - QUICKULL val6 = (QUICKULL) (fabs(val6d * OSCALE) + (QUICKDouble) 0.5); - if (val6d < (QUICKDouble) 0.0) - val6 = 0ull - val6; - - atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); - - // ATOMIC ADD VALUE 6 - 2 - if (JJJ == LLL && III != KKK) { - atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); - } + if (III != JJJ && KKK != LLL) { + temp = -0.5 * hybrid_coeff * DENSEKI * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +#endif + + // ATOMIC ADD VALUE 6 - 2 + if (JJJ == LLL && III != KKK) { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, LLL - 1, JJJ - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +#endif } - //} +// } } } - //} } + +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_KI, OSCALE); + GPUATOMICADD(&LOC2(devSim_MP2.oULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK_MM, OSCALE); + GPUATOMICADD(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, KKK - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_KI); + atomicAdd(&LOC2(devSim_MP2.o, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK_MM); + atomicAdd(&LOC2(devSim_MP2.o, JJJ - 1, KKK - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK); +#endif } + +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JI, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, JJJ - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JI); +#endif } } } diff --git a/src/gpu/cuda/gpu_cew_quad.h b/src/gpu/cuda/gpu_cew_quad.h index a0e1276c..8c51002a 100644 --- a/src/gpu/cuda/gpu_cew_quad.h +++ b/src/gpu/cuda/gpu_cew_quad.h @@ -96,8 +96,12 @@ void get_cew_accdens(_gpu_type gpu) { int Istart = (gpu -> gpu_xcq -> gatm -> _hostData[i]-1) * 3; - for(int j=0; j<3; j++) - gpu -> cew_grad->_hostData[Istart+j] += cewGrad[j]; + for (int j = 0; j < 3; j++) +#if defined(USE_LEGACY_ATOMICS) + gpu->grad->_hostData[Istart+j] += cewGrad[j]; +#else + gpu->cew_grad->_hostData[Istart+j] += cewGrad[j]; +#endif } delete gridpt; @@ -138,7 +142,11 @@ __global__ void getcew_quad_kernel() QUICKDouble _tmp = phi * phi2 * dfdr * weight; +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp, OSCALE); +#else atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); +#endif } } } @@ -154,13 +162,23 @@ __global__ void oshell_getcew_quad_grad_kernel() __global__ void cshell_getcew_quad_grad_kernel() #endif { +#if defined(USE_LEGACY_ATOMICS) + //declare smem grad vector + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad=(QUICKULL*)smem_buffer; + + // initialize smem grad + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0ull; +#else //declare smem grad vector extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0.0; +#endif __syncthreads(); @@ -222,9 +240,9 @@ __global__ void cshell_getcew_quad_grad_kernel() QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2); //printf("test quad grad %f %f %f %f %f %f %f %f %f %f\n", gridx, gridy, gridz, denseij, weight, dfdr, dphidx, dphidy, dphidz, phi2); - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); + GPUATOMICADD(&smemGrad[Istart], Gradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], Grady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], Gradz, GRADSCALE); sumGradx += Gradx; sumGrady += Grady; sumGradz += Gradz; @@ -234,9 +252,9 @@ __global__ void cshell_getcew_quad_grad_kernel() int Istart = (devSim_dft.gatm[gid]-1)*3; - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); + GPUATOMICADD(&smemGrad[Istart], -sumGradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], -sumGrady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], -sumGradz, GRADSCALE); } //Set weights for sswder calculation @@ -252,8 +270,12 @@ __global__ void cshell_getcew_quad_grad_kernel() __syncthreads(); // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else + atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif __syncthreads(); } diff --git a/src/gpu/cuda/gpu_get2e.cu b/src/gpu/cuda/gpu_get2e.cu index 12c1b59c..21c483be 100644 --- a/src/gpu/cuda/gpu_get2e.cu +++ b/src/gpu/cuda/gpu_get2e.cu @@ -774,6 +774,10 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) getAddInt_kerne int const batchSize = 20; ERI_entry a[batchSize]; int j = 0; + QUICKDouble temp; +#if defined(OSHELL) + QUICKDouble temp2; +#endif QUICKULL myInt = (QUICKULL) (bufferSize) / totalThreads; if ((bufferSize - myInt * totalThreads) > offside) myInt++; @@ -802,7 +806,152 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) getAddInt_kerne // } else if( devSim.method == LIBXC) { // hybrid_coeff = devSim.hyb_coeff; // } - addint(devSim.o, a[k].value, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); + +#if defined(OSHELL) + QUICKDouble DENSELK = (QUICKDouble) (LOC2(devSim.dense, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis) + + LOC2(devSim.denseb, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis)); + QUICKDouble DENSEJI = (QUICKDouble) (LOC2(devSim.dense, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis) + + LOC2(devSim.denseb, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis)); + + QUICKDouble DENSEKIA = (QUICKDouble) LOC2(devSim.dense, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEKJA = (QUICKDouble) LOC2(devSim.dense, KKK - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELJA = (QUICKDouble) LOC2(devSim.dense, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELIA = (QUICKDouble) LOC2(devSim.dense, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis); + + QUICKDouble DENSEKIB = (QUICKDouble) LOC2(devSim.denseb, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEKJB = (QUICKDouble) LOC2(devSim.denseb, KKK - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELJB = (QUICKDouble) LOC2(devSim.denseb, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELIB = (QUICKDouble) LOC2(devSim.denseb, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis); +#else + QUICKDouble DENSEKI = (QUICKDouble) LOC2(devSim.dense, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEKJ = (QUICKDouble) LOC2(devSim.dense, KKK - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELJ = (QUICKDouble) LOC2(devSim.dense, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELI = (QUICKDouble) LOC2(devSim.dense, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELK = (QUICKDouble) LOC2(devSim.dense, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEJI = (QUICKDouble) LOC2(devSim.dense, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis); +#endif + + // ATOMIC ADD VALUE 1 + temp = (KKK == LLL) ? DENSELK * a[k].value : 2.0 * DENSELK * a[k].value; + o_JI += temp; +#if defined(OSHELL) + ob_JI += temp; +#endif + + // ATOMIC ADD VALUE 2 + if (LLL != JJJ || III != KKK) { + temp = (III == JJJ) ? DENSEJI * a[k].value : 2.0 * DENSEJI * a[k].value; +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.ob, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#endif + } + + // ATOMIC ADD VALUE 3 +#if defined(OSHELL) + temp = (III == KKK && III < JJJ && JJJ < LLL) + ? -2.0 * devSim.hyb_coeff * DENSELJA * a[k].value : -(devSim.hyb_coeff * DENSELJA * a[k].value); + temp2 = (III == KKK && III < JJJ && JJJ < LLL) + ? -2.0 * devSim.hyb_coeff * DENSELJB * a[k].value : -(devSim.hyb_coeff * DENSELJB * a[k].value); + o_KI += temp; + ob_KI += temp2; +#else + temp = (III == KKK && III < JJJ && JJJ < LLL) + ? -(devSim.hyb_coeff * DENSELJ * a[k].value) : -0.5 * devSim.hyb_coeff * DENSELJ * a[k].value; + o_KI += temp; +#endif + + // ATOMIC ADD VALUE 4 + if (KKK != LLL) { +#if defined(OSHELL) + temp = -(devSim.hyb_coeff * DENSEKJA * a[k].value); + temp2 = -(devSim.hyb_coeff * DENSEKJB * a[k].value); +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp); + atomicAdd(&LOC2(devSim.ob, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif +#else + temp = -0.5 * devSim.hyb_coeff * DENSEKJ * a[k].value; +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#endif + } + + // ATOMIC ADD VALUE 5 +#if defined(OSHELL) + temp = -(devSim.hyb_coeff * DENSELIA * a[k].value); + temp2 = -(devSim.hyb_coeff * DENSELIB * a[k].value); +#else + temp = -0.5 * devSim.hyb_coeff * DENSELI * a[k].value; +#endif + if ((III != JJJ && III < KKK) + || (III == JJJ && III == KKK && III < LLL) + || (III == KKK && III < JJJ && JJJ < LLL)) { + o_JK_MM += temp; +#if defined(OSHELL) + ob_JK_MM += temp2; +#endif + } + + // ATOMIC ADD VALUE 5 - 2 + if (III != JJJ && JJJ == KKK) { + o_JK += temp; +#if defined(OSHELL) + ob_JK += temp2; +#endif + } + + // ATOMIC ADD VALUE 6 + if (III != JJJ && KKK != LLL) { +#if defined(OSHELL) + temp = -(devSim.hyb_coeff * DENSEKIA * a[k].value); + temp2 = -(devSim.hyb_coeff * DENSEKIB * a[k].value); +#else + temp = -0.5 * devSim.hyb_coeff * DENSEKI * a[k].value; +#endif +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else + atomicAdd(&LOC2(devSim.ob, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif +#endif + + // ATOMIC ADD VALUE 6 - 2 + if (JJJ == LLL && III != KKK) { +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else + atomicAdd(&LOC2(devSim.ob, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif +#endif + } + } } } diff --git a/src/gpu/cuda/gpu_get2e_getxc_drivers.h b/src/gpu/cuda/gpu_get2e_getxc_drivers.h index 35ce1678..e53f3387 100644 --- a/src/gpu/cuda/gpu_get2e_getxc_drivers.h +++ b/src/gpu/cuda/gpu_get2e_getxc_drivers.h @@ -27,55 +27,102 @@ extern "C" void gpu_get_oshell_eri_(bool *deltaO, QUICKDouble* o, QUICKDouble* o extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) #endif { - PRINTDEBUG("BEGIN TO RUN GET ERI") + PRINTDEBUG("BEGIN TO RUN GET ERI"); upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); #ifdef OSHELL - get_oshell_eri(gpu); + get_oshell_eri(gpu); #else get2e(gpu); #endif - PRINTDEBUG("COMPLETE KERNEL") - - gpu -> gpu_calculated -> o -> Download(); - cudaMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); + PRINTDEBUG("COMPLETE KERNEL"); + +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + cudaMemsetAsync(gpu->gpu_calculated->oULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } + else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData , i , j , gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData , j , i , gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + } + } - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); +# if defined(OSHELL) + gpu->gpu_calculated->obULL->Download(); + cudaMemsetAsync(gpu->gpu_calculated->obULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } + else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> Download(); - cudaMemsetAsync(gpu -> gpu_calculated -> ob -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); +# endif +#else + gpu->gpu_calculated->o->Download(); + cudaMemsetAsync(gpu->gpu_calculated->o->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); + } + } +# if defined(OSHELL) + gpu->gpu_calculated->ob->Download(); + cudaMemsetAsync(gpu->gpu_calculated->ob->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } +# endif #endif - gpu -> gpu_calculated -> o -> DownloadSum(o); - -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> DownloadSum(ob); + gpu->gpu_calculated->o->DownloadSum(o); +#if defined(OSHELL) + gpu->gpu_calculated->ob->DownloadSum(ob); #endif - PRINTDEBUG("DELETE TEMP VARIABLES") + PRINTDEBUG("DELETE TEMP VARIABLES"); - if(gpu -> gpu_sim.method == HF){ + if (gpu->gpu_sim.method == HF) { delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; - -#ifdef OSHELL +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gpu_calculated->oULL; +# if defined(OSHELL) + delete gpu->gpu_calculated->obULL; +# endif +#endif +#if defined(OSHELL) delete gpu->gpu_calculated->ob; delete gpu->gpu_calculated->denseb; #endif - - }else if(*deltaO != 0){ + } else if (*deltaO != 0) { delete gpu->gpu_calculated->dense; #ifdef OSHELL delete gpu->gpu_calculated->denseb; @@ -84,7 +131,7 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) delete gpu->gpu_cutoff->cutMatrix; - PRINTDEBUG("COMPLETE RUNNING GET2E") + PRINTDEBUG("COMPLETE RUNNING GET2E"); } @@ -94,9 +141,9 @@ extern "C" void gpu_get_oshell_eri_grad_(QUICKDouble* grad) extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) #endif { - PRINTDEBUG("BEGIN TO RUN GRAD") + PRINTDEBUG("BEGIN TO RUN GRAD"); upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); if(gpu -> gpu_sim.is_oshell == true){ get_oshell_eri_grad(gpu); @@ -116,24 +163,44 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) } #endif - PRINTDEBUG("COMPLETE KERNEL") + PRINTDEBUG("COMPLETE KERNEL"); + + if (gpu->gpu_sim.method == HF) { +#if defined(USE_LEGACY_ATOMICS) + gpu -> gradULL -> Download(); + + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } - if(gpu -> gpu_sim.method == HF){ - gpu -> grad -> Download(); + gpu->grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else + gpu->grad->Download(); +#endif } - if(gpu -> gpu_sim.method == HF){ - gpu -> grad -> DownloadSum(grad); + if (gpu -> gpu_sim.method == HF) { + gpu->grad->DownloadSum(grad); - delete gpu -> grad; + delete gpu->grad; +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gradULL; +#endif delete gpu->gpu_calculated->dense; -#ifdef OSHELL +#if defined(OSHELL) delete gpu->gpu_calculated->denseb; #endif } - PRINTDEBUG("COMPLETE RUNNING GRAD") + PRINTDEBUG("COMPLETE RUNNING GRAD"); } @@ -143,56 +210,143 @@ extern "C" void gpu_get_oshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICKDouble* belec, QUICKDouble *o) #endif { - PRINTDEBUG("BEGIN TO RUN GETXC") + PRINTDEBUG("BEGIN TO RUN GETXC"); gpu->DFT_calculated = new gpu_buffer_type(1, 1); +#if defined(USE_LEGACY_ATOMICS) + QUICKULL valUII = (QUICKULL) (fabs(*Eelxc * OSCALE) + 0.5); + if (*Eelxc < 0.0) { + valUII = 0ull - valUII; + } + gpu->DFT_calculated->_hostData[0].Eelxc = valUII; + + valUII = (QUICKULL) (fabs(*aelec * OSCALE) + 0.5); + if (*aelec < 0.0) { + valUII = 0ull - valUII; + } + gpu->DFT_calculated->_hostData[0].aelec = valUII; + + valUII = (QUICKULL) (fabs(*belec * OSCALE) + 0.5); + if (*belec < 0.0) { + valUII = 0ull - valUII; + } + gpu->DFT_calculated->_hostData[0].belec = valUII; +#else gpu->DFT_calculated->_hostData[0].Eelxc = 0.0; gpu->DFT_calculated->_hostData[0].aelec = 0.0; gpu->DFT_calculated->_hostData[0].belec = 0.0; +#endif gpu->DFT_calculated->Upload(); gpu->gpu_sim.DFT_calculated = gpu->DFT_calculated->_devData; upload_sim_to_constant_dft(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); getxc(gpu); gpu->DFT_calculated->Download(); - gpu -> gpu_calculated -> o -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); +# if defined(OSHELL) + gpu->gpu_calculated->obULL->Download(); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } -#endif +# endif + + QUICKULL valULL = gpu->DFT_calculated -> _hostData[0].Eelxc; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + *Eelxc = (QUICKDouble) valDB * ONEOVEROSCALE; - *Eelxc = gpu->DFT_calculated -> _hostData[0].Eelxc; - *aelec = gpu->DFT_calculated -> _hostData[0].aelec; - *belec = gpu->DFT_calculated -> _hostData[0].belec; + valULL = gpu->DFT_calculated->_hostData[0].aelec; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + *aelec = (QUICKDouble) valDB * ONEOVEROSCALE; - gpu -> gpu_calculated -> o -> DownloadSum(o); -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> DownloadSum(ob); + valULL = gpu->DFT_calculated->_hostData[0].belec; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + *belec = (QUICKDouble) valDB * ONEOVEROSCALE; +#else + gpu->gpu_calculated->o->Download(); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); + } + } + +# if defined(OSHELL) + gpu->gpu_calculated->ob->Download(); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->ob->_hostData, i, j,gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); + } + } +# endif + + *Eelxc = gpu->DFT_calculated->_hostData[0].Eelxc; + *aelec = gpu->DFT_calculated->_hostData[0].aelec; + *belec = gpu->DFT_calculated->_hostData[0].belec; +#endif + + gpu->gpu_calculated->o->DownloadSum(o); +#if defined(OSHELL) + gpu->gpu_calculated->ob->DownloadSum(ob); #endif - PRINTDEBUG("DELETE TEMP VARIABLES") + PRINTDEBUG("DELETE TEMP VARIABLES"); delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; - -#ifdef OSHELL +#if defined(OSHELL) delete gpu->gpu_calculated->ob; delete gpu->gpu_calculated->denseb; #endif +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gpu_calculated->oULL; +# if defined(OSHELL) + delete gpu->gpu_calculated->obULL; +# endif +#endif } @@ -202,8 +356,8 @@ extern "C" void gpu_get_oshell_xcgrad_(QUICKDouble *grad) extern "C" void gpu_get_cshell_xcgrad_(QUICKDouble *grad) #endif { -#if defined(CEW) - gpu -> cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); +#if defined(CEW) && !defined(USE_LEGACY_ATOMICS) + gpu->cew_grad = new gpu_buffer_type(3 * gpu->nextatom); #endif // calculate smem size @@ -214,16 +368,35 @@ extern "C" void gpu_get_cshell_xcgrad_(QUICKDouble *grad) memset(gpu->grad->_hostData, 0, gpu -> gpu_xcq -> smem_size); getxc_grad(gpu); - gpu -> grad -> Download(); +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->grad->_hostData[i] += (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else + gpu->grad->Download(); +#endif - gpu -> grad -> DownloadSum(grad); + gpu->grad->DownloadSum(grad); -#if defined(CEW) - gpu -> cew_grad->DownloadSum(grad); - delete gpu -> cew_grad; +#if defined(CEW) && !defined(USE_LEGACY_ATOMICS) + gpu->cew_grad->DownloadSum(grad); + delete gpu->cew_grad; #endif - delete gpu -> grad; + delete gpu->grad; +#ifdef USE_LEGACY_ATOMICS + delete gpu->gradULL; +#endif delete gpu->gpu_calculated->dense; #ifdef OSHELL @@ -253,14 +426,35 @@ extern "C" void gpu_get_oei_(QUICKDouble* o) getOEI(gpu); - gpu -> gpu_calculated -> o -> Download(); - cudaMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + + cudaMemsetAsync(gpu->gpu_calculated->oULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); for (int i = 0; i< gpu->nbasis; i++) { for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis); + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } +#else + gpu->gpu_calculated->o-> Download(); + cudaMemsetAsync(gpu->gpu_calculated->o->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis); + } + } +#endif /* for (int i = 0; i< gpu->nbasis; i++) { @@ -282,10 +476,18 @@ extern "C" void gpu_get_oei_(QUICKDouble* o) extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) { // upload point charge grad vector - if(gpu -> nextatom > 0) { - gpu -> ptchg_grad = new gpu_buffer_type(3 * gpu -> nextatom); - gpu -> ptchg_grad -> Upload(); - gpu -> gpu_sim.ptchg_grad = gpu -> ptchg_grad -> _devData; + if (gpu->nextatom > 0) { + gpu->ptchg_grad = new gpu_buffer_type(3 * gpu->nextatom); + +#if defined(USE_LEGACY_ATOMICS) + gpu->ptchg_gradULL = new gpu_buffer_type(3 * gpu->nextatom); + gpu->ptchg_gradULL->Upload(); + gpu->gpu_sim.ptchg_gradULL = gpu->ptchg_gradULL->_devData; + gpu->ptchg_grad->DeleteGPU(); +#else + gpu->ptchg_grad->Upload(); + gpu->gpu_sim.ptchg_grad = gpu->ptchg_grad->_devData; +#endif } upload_sim_to_constant_oei(gpu); @@ -293,8 +495,24 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_oei_grad(gpu); // download gradients +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + cudaMemsetAsync(gpu->gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->natom); + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->grad->Download(); - cudaMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); + cudaMemsetAsync(gpu->grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#endif gpu->grad->DownloadSum(grad); @@ -304,9 +522,27 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } */ // download point charge gradients - if(gpu -> nextatom > 0) { + if (gpu -> nextatom > 0) { +#if defined(USE_LEGACY_ATOMICS) + gpu->ptchg_gradULL->Download(); + + cudaMemsetAsync(gpu->ptchg_gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->nextatom); + + for (int i = 0; i < 3 * gpu->nextatom; i++) { + QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->ptchg_grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->ptchg_grad->Download(); - cudaMemsetAsync(gpu -> ptchg_grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->nextatom); + cudaMemsetAsync(gpu->ptchg_grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->nextatom); +#endif /* for(int i=0; i<3*gpu->nextatom; ++i){ printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); @@ -316,8 +552,11 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0 && !gpu->gpu_sim.use_cew) { - SAFE_DELETE(gpu -> ptchg_grad); + if (gpu->nextatom > 0 && !gpu->gpu_sim.use_cew) { + SAFE_DELETE(gpu->ptchg_grad); +#if defined(USE_LEGACY_ATOMICS) + SAFE_DELETE(gpu->ptchg_gradULL); +#endif } } @@ -348,14 +587,35 @@ extern "C" void gpu_get_lri_(QUICKDouble* o) upload_sim_to_constant_dft(gpu); getcew_quad(gpu); - gpu -> gpu_calculated -> o -> Download(); - cudaMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + + cudaMemsetAsync(gpu->gpu_calculated->oULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + } + } +#else + gpu->gpu_calculated->o->Download(); + cudaMemsetAsync(gpu->gpu_calculated->o->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } +#endif /* int idxf90=0; for (int i = 0; i< gpu->nbasis; i++) { @@ -383,8 +643,24 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_lri_grad(gpu); // download gradients +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + cudaMemsetAsync(gpu->gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->natom); + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->grad->Download(); - cudaMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); + cudaMemsetAsync(gpu->grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#endif gpu->grad->DownloadSum(grad); @@ -394,8 +670,24 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } */ // download point charge gradients - if(gpu -> nextatom > 0) { + if (gpu->nextatom > 0) { +#if defined(USE_LEGACY_ATOMICS) + gpu->ptchg_gradULL->Download(); + + for (int i = 0; i < 3 * gpu->nextatom; i++) { + QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->ptchg_grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->ptchg_grad->Download(); +#endif /* for(int i=0; i<3*gpu->nextatom; ++i){ printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); @@ -405,18 +697,25 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0) { - SAFE_DELETE(gpu -> ptchg_grad); + if (gpu->nextatom > 0) { + SAFE_DELETE(gpu->ptchg_grad); +#if defined(USE_LEGACY_ATOMICS) + SAFE_DELETE(gpu->ptchg_gradULL); +#endif } } extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad) { - gpu->cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); +#if defined(USE_LEGACY_ATOMICS) + memset(gpu->grad->_hostData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#else + gpu->cew_grad = new gpu_buffer_type(3 * gpu->nextatom); +#endif // calculate smem size - gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); + gpu->gpu_xcq->smem_size = sizeof(QUICKULL) * gpu->natom * 3; //compute xc quad potential upload_sim_to_constant_dft(gpu); @@ -424,13 +723,34 @@ extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad) getcew_quad_grad(gpu); // download gradients +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + cudaMemsetAsync(gpu->gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->natom); + + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + // make sure to add rather than assign. we already computed one part of the cew + // gradients on host asynchronously. + gpu->grad->_hostData[i] += (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->grad->Download(); - cudaMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); + cudaMemsetAsync(gpu->grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#endif gpu->grad->DownloadSum(grad); - gpu -> cew_grad ->DownloadSum(grad); - SAFE_DELETE(gpu -> cew_grad); +#if !defined(USE_LEGACY_ATOMICS) + gpu->cew_grad->DownloadSum(grad); + SAFE_DELETE(gpu->cew_grad); +#endif } #endif #endif diff --git a/src/gpu/cuda/gpu_get2e_grad_ffff.cu b/src/gpu/cuda/gpu_get2e_grad_ffff.cu index 8231bf84..7cbfe53a 100644 --- a/src/gpu/cuda/gpu_get2e_grad_ffff.cu +++ b/src/gpu/cuda/gpu_get2e_grad_ffff.cu @@ -311,7 +311,11 @@ void getGrad_ffff(_gpu_type gpu) int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ; char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute; char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN; +#if defined(USE_LEGACY_ATOMICS) + grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.gradULL; +#else grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.grad; +#endif } LOC3(trans, 0, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 1; diff --git a/src/gpu/cuda/gpu_get2e_grad_ffff.cuh b/src/gpu/cuda/gpu_get2e_grad_ffff.cuh index 8e6e751e..9b9d8e98 100644 --- a/src/gpu/cuda/gpu_get2e_grad_ffff.cuh +++ b/src/gpu/cuda/gpu_get2e_grad_ffff.cuh @@ -1615,26 +1615,21 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm //printf("FILE: %s, LINE: %d, FUNCTION: %s, DEV_SIM_DBL_HYB_COEFF \n", __FILE__, __LINE__, __func__); #endif - atomicAdd(&DEV_SIM_PTR_GRAD[AStart], AGradx); - atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 1], AGrady); - atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 2], AGradz); - - - atomicAdd(&DEV_SIM_PTR_GRAD[BStart], BGradx); - atomicAdd(&DEV_SIM_PTR_GRAD[BStart + 1], BGrady); - atomicAdd(&DEV_SIM_PTR_GRAD[BStart + 2], BGradz); - - - atomicAdd(&DEV_SIM_PTR_GRAD[CStart], CGradx); - atomicAdd(&DEV_SIM_PTR_GRAD[CStart + 1], CGrady); - atomicAdd(&DEV_SIM_PTR_GRAD[CStart + 2], CGradz); - - - atomicAdd(&DEV_SIM_PTR_GRAD[DStart], (-AGradx-BGradx-CGradx)); - atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 1], (-AGrady-BGrady-CGrady)); - atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 2], (-AGradz-BGradz-CGradz)); - - return; + GPUATOMICADD(&DEV_SIM_PTR_GRAD[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[AStart + 2], AGradz, GRADSCALE); + + GPUATOMICADD(&DEV_SIM_PTR_GRAD[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[BStart + 2], BGradz, GRADSCALE); + + GPUATOMICADD(&DEV_SIM_PTR_GRAD[CStart], CGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[CStart + 1], CGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[CStart + 2], CGradz, GRADSCALE); + + GPUATOMICADD(&DEV_SIM_PTR_GRAD[DStart], -AGradx - BGradx - CGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[DStart + 1], -AGrady - BGrady - CGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[DStart + 2], -AGradz - BGradz - CGradz, GRADSCALE); } diff --git a/src/gpu/cuda/gpu_getxc.cu b/src/gpu/cuda/gpu_getxc.cu index 59d8f49a..232648db 100644 --- a/src/gpu/cuda/gpu_getxc.cu +++ b/src/gpu/cuda/gpu_getxc.cu @@ -354,14 +354,23 @@ __global__ void get_ssw_kernel() { __global__ void get_sswgrad_kernel() { - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = blockDim.x * gridDim.x; +#if defined(USE_LEGACY_ATOMICS) + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) { + smemGrad[i] = 0ull; + } +#else + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) { smemGrad[i] = 0.0; } +#endif __syncthreads(); for (QUICKULL gid = offset; gid < devSim_dft.npoints_ssd; gid += totalThreads) { @@ -377,7 +386,11 @@ __global__ void get_sswgrad_kernel() { __syncthreads(); for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) { +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif } __syncthreads(); @@ -390,16 +403,27 @@ __global__ void get_sswgrad_kernel() { computing x, y and z gradients separately. */ __global__ void get_sswnumgrad_kernel() { - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x; unsigned int totalThreads = blockDim.x * gridDim.x; unsigned int natom = devSim_dft.natom; QUICKULL npoints_ssd = devSim_dft.npoints_ssd; +#if defined(USE_LEGACY_ATOMICS) + //declare smem grad vector + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + // initialize smem grad + for (int i = threadIdx.x; i < natom * 3; i += blockDim.x) { + smemGrad[i] = 0ull; + } +#else + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; for (int i = threadIdx.x; i < natom * 3; i += blockDim.x) { smemGrad[i] = 0.0; } +#endif __syncthreads(); @@ -504,9 +528,9 @@ __global__ void get_sswnumgrad_kernel() { if(iatom == gatm-1) zparent = zatm; - atomicAdd(&smemGrad[iatom*3], dpx); - atomicAdd(&smemGrad[iatom*3+1], dpy); - atomicAdd(&smemGrad[iatom*3+2], dpz); + GPUATOMICADD(&smemGrad[iatom * 3], dpx, GRADSCALE); + GPUATOMICADD(&smemGrad[iatom * 3 + 1], dpy, GRADSCALE); + GPUATOMICADD(&smemGrad[iatom * 3 + 2], dpz, GRADSCALE); /* printf("sswgrad %f %f %f %d %d %f %f %f \n", gridx, gridy, gridz, iatom, 1, dpx, devSim_dft.exc_ssd[idx], devSim_dft.quadwt[idx]); @@ -522,8 +546,12 @@ __global__ void get_sswnumgrad_kernel() { __syncthreads(); // update gmem grad vector - for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x) - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + for (int i = threadIdx.x; i< natom * 3; i += blockDim.x) +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else + atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif __syncthreads(); @@ -931,7 +959,12 @@ __device__ QUICKDouble get_uw_ssd(const QUICKDouble gridx, const QUICKDouble gri __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, - const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, + const QUICKDouble Exc, const QUICKDouble quadwt, +#if defined(USE_LEGACY_ATOMICS) + QUICKULL* const smemGrad, +#else + QUICKDouble* const smemGrad, +#endif QUICKDouble* const uw_ssd, const int iparent, const int natom) { QUICKDouble sumUW = 0.0; @@ -960,7 +993,8 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons for (int i = 0; i < natom; i++) { for (int j = 0; j < 3; j++) { if (abs(uw) > devSim_dft.DMCutoff) { - atomicAdd(&smemGrad[i * 3 + j], LOCUWSSD(uw_ssd, j, i, 3, natom) * Exc * quadwt * uw * (-p / sumUW)); + GPUATOMICADD(&smemGrad[i * 3 + j], + LOCUWSSD(uw_ssd, j, i, 3, natom) * Exc * quadwt * uw * (-p / sumUW), GRADSCALE); } } } @@ -977,7 +1011,8 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons for (int i = 0; i < natom; i++) { for (int j = 0; j < 3; j++) { if (abs(parent_uw) > devSim_dft.DMCutoff) { - atomicAdd(&smemGrad[i * 3 + j], LOCUWSSD(uw_ssd, j, i, 3, natom) * (1.0 / sumUW) * Exc * quadwt * parent_uw); + GPUATOMICADD(&smemGrad[i * 3 + j], + LOCUWSSD(uw_ssd, j, i, 3, natom) * (1.0 / sumUW) * Exc * quadwt * parent_uw, GRADSCALE); } } } @@ -985,8 +1020,13 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, - QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int - iparent, int gid) + QUICKDouble Exc, QUICKDouble quadwt, +#if defined(USE_LEGACY_ATOMICS) + QUICKULL* smemGrad, +#else + QUICKDouble* smemGrad, +#endif + int iparent, int gid) { /* This subroutine calculates the derivatives of weight found in @@ -1136,9 +1176,9 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, #endif // We should now have the derivatives of the SS weights. Now just add it to the temporary gradient vector in shared memory. - atomicAdd(&smemGrad[jstart], wtgradjx * Exc * quadwt); - atomicAdd(&smemGrad[jstart + 1], wtgradjy * Exc * quadwt); - atomicAdd(&smemGrad[jstart + 2], wtgradjz * Exc * quadwt); + GPUATOMICADD(&smemGrad[jstart], wtgradjx * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[jstart + 1], wtgradjy * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[jstart + 2], wtgradjz * Exc * quadwt, GRADSCALE); } } @@ -1148,9 +1188,9 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, #endif // update the temporary gradient vector - atomicAdd(&smemGrad[istart], wtgradix * Exc * quadwt); - atomicAdd(&smemGrad[istart + 1], wtgradiy * Exc * quadwt); - atomicAdd(&smemGrad[istart + 2], wtgradiz * Exc * quadwt); + GPUATOMICADD(&smemGrad[istart], wtgradix * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[istart + 1], wtgradiy * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[istart + 2], wtgradiz * Exc * quadwt, GRADSCALE); } diff --git a/src/gpu/cuda/gpu_getxc.h b/src/gpu/cuda/gpu_getxc.h index e002502b..20ce6eb0 100644 --- a/src/gpu/cuda/gpu_getxc.h +++ b/src/gpu/cuda/gpu_getxc.h @@ -290,9 +290,15 @@ __global__ void cshell_getxc_kernel() } #endif +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&devSim_dft.DFT_calculated[0].Eelxc, _tmp, OSCALE); + GPUATOMICADD(&devSim_dft.DFT_calculated[0].aelec, weight * density, OSCALE); + GPUATOMICADD(&devSim_dft.DFT_calculated[0].belec, weight * densityb, OSCALE); +#else atomicAdd(&devSim_dft.DFT_calculated[0].Eelxc, _tmp); - atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight*density); - atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight*densityb); + atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight * density); + atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight * densityb); +#endif for (int i = bfloc_st; i< bfloc_end; ++i) { int ibas = devSim_dft.basf[i]; @@ -309,13 +315,21 @@ __global__ void cshell_getxc_kernel() QUICKDouble _tmp = (phi * phi2 * dfdr + xdot * (phi*dphidx2 + phi2*dphidx) \ + ydot * (phi*dphidy2 + phi2*dphidy) + zdot * (phi*dphidz2 + phi2*dphidz))*weight; +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp, OSCALE); +#else atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); +#endif #ifdef OSHELL QUICKDouble _tmpb = (phi * phi2 * dfdrb + xdotb * (phi*dphidx2 + phi2*dphidx) + ydotb * (phi*dphidy2 + phi2*dphidy) + zdotb * (phi*dphidz2 + phi2*dphidz))*weight; +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_dft.obULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb, OSCALE); +#else atomicAdd(&LOC2(devSim_dft.ob, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb); +#endif #endif } } @@ -331,13 +345,23 @@ __global__ void oshell_getxcgrad_kernel() __global__ void cshell_getxcgrad_kernel() #endif { +#if defined(USE_LEGACY_ATOMICS) + //declare smem grad vector + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + // initialize smem grad + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0ull; +#else //declare smem grad vector extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0.0; +#endif __syncthreads(); @@ -567,9 +591,9 @@ __global__ void cshell_getxcgrad_kernel() } #endif - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); + GPUATOMICADD(&smemGrad[Istart], Gradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], Grady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], Gradz, GRADSCALE); sumGradx += Gradx; sumGrady += Grady; sumGradz += Gradz; @@ -579,9 +603,9 @@ __global__ void cshell_getxcgrad_kernel() int Istart = (devSim_dft.gatm[gid]-1) * 3; - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); + GPUATOMICADD(&smemGrad[Istart], -sumGradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], -sumGrady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], -sumGradz, GRADSCALE); } //Set weights for sswder calculation if(density < devSim_dft.DMCutoff){ @@ -597,8 +621,12 @@ __global__ void cshell_getxcgrad_kernel() __syncthreads(); // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else + atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif __syncthreads(); } diff --git a/src/gpu/cuda/gpu_type.h b/src/gpu/cuda/gpu_type.h index 120b6c1a..6c65eb43 100644 --- a/src/gpu/cuda/gpu_type.h +++ b/src/gpu/cuda/gpu_type.h @@ -34,6 +34,10 @@ struct gpu_calculated_type { gpu_buffer_type* ob; // beta O matrix gpu_buffer_type* dense; // Density Matrix gpu_buffer_type* denseb; // Beta Density Matrix +#if defined(USE_LEGACY_ATOMICS) + gpu_buffer_type* oULL; // Unsigned long long int type O matrix + gpu_buffer_type* obULL; // Unsigned long long int type Ob matrix +#endif gpu_buffer_type* distance; // distance matrix }; @@ -81,9 +85,15 @@ struct gpu_cutoff_type { }; struct DFT_calculated_type { +#if defined(USE_LEGACY_ATOMICS) + QUICKULL Eelxc; // exchange correction energy + QUICKULL aelec; // alpha electron + QUICKULL belec; // beta electron +#else QUICKDouble Eelxc; // exchange correction energy QUICKDouble aelec; // alpha electron QUICKDouble belec; // beta electron +#endif }; /*Madu Manathunga 11/21/2019*/ diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h index 77ea428e..6757574c 100644 --- a/src/gpu/gpu_common.h +++ b/src/gpu/gpu_common.h @@ -204,7 +204,11 @@ static FILE *debugFile = NULL; //typedef float QUICKDouble; typedef float QUICKSingle; #define QUICKULL unsigned long long int -#define QUICKAtomicType double +#if defined(USE_LEGACY_ATOMICS) + #define QUICKAtomicType unsigned long long int +#else + #define QUICKAtomicType double +#endif /* SM Version enum */ @@ -232,12 +236,18 @@ struct ERI_entry { }; -//TODO: rewrite MP2 code and remove this constant -/* energy scaling constants */ -#define OSCALE ((QUICKDouble) 1.0e12) +#if defined(USE_LEGACY_ATOMICS) + /* powers in the following constants represent the number of digits + * to keep in the mantissa (fractional part) when converting floating point values + * to/from integer values for use in GPU integer atomic operations */ - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + /* keep 12 digits in the mantissa (fractional part) for energy-related calculations */ + #define OSCALE (1.0e12) + #define ONEOVEROSCALE (1.0e-12) + /* keep 16 digits in the mantissa (fractional part) for gradient-related calculations */ + #define GRADSCALE (1.0e16) + #define ONEOVERGRADSCALE (1.0e-16) +#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 __device__ static inline double atomicAdd( double * address, double val ) { unsigned long long int *address_as_ull, old, assumed; @@ -257,5 +267,17 @@ __device__ static inline double atomicAdd( double * address, double val ) } #endif +#if defined(USE_LEGACY_ATOMICS) + #define GPUATOMICADD(address, val, scale) \ +{ \ + QUICKULL temp_ULL = (QUICKULL) (fabs((val) * (scale)) + 0.5); \ + if ((val) < 0.0) \ + temp_ULL = 0ull - temp_ULL; \ + atomicAdd((address), temp_ULL); \ +} +#else + #define GPUATOMICADD(address, val, scale) atomicAdd((address), (val)); +#endif + #endif diff --git a/src/gpu/gpu_get2e_subs.h b/src/gpu/gpu_get2e_subs.h index e62575ea..c420b923 100644 --- a/src/gpu/gpu_get2e_subs.h +++ b/src/gpu/gpu_get2e_subs.h @@ -893,9 +893,17 @@ __device__ __forceinline__ void iclass_spdf10 // ATOMIC ADD VALUE 2 if (LLL != JJJ || III != KKK) { temp = (III == JJJ) ? DENSEJI * Y : 2.0 * DENSEJI * Y; +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else atomicAdd(&LOC2(devSim.o, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp); +# endif #if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else atomicAdd(&LOC2(devSim.ob, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp); +# endif #endif } @@ -918,11 +926,20 @@ __device__ __forceinline__ void iclass_spdf10 #if defined(OSHELL) temp = -(devSim.hyb_coeff * DENSEKJA * Y); temp2 = -(devSim.hyb_coeff * DENSEKJB * Y); +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else atomicAdd(&LOC2(devSim.o, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp); atomicAdd(&LOC2(devSim.ob, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif #else temp = -0.5 * devSim.hyb_coeff * DENSEKJ * Y; +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else atomicAdd(&LOC2(devSim.o, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp); +# endif #endif } @@ -958,16 +975,32 @@ __device__ __forceinline__ void iclass_spdf10 #else temp = -0.5 * devSim.hyb_coeff * DENSEKI * Y; #endif +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else atomicAdd(&LOC2(devSim.o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp); +# endif #if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else atomicAdd(&LOC2(devSim.ob, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif #endif // ATOMIC ADD VALUE 6 - 2 if (JJJ == LLL && III != KKK) { +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else atomicAdd(&LOC2(devSim.o, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp); +# endif #if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else atomicAdd(&LOC2(devSim.ob, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif #endif } } @@ -975,19 +1008,39 @@ __device__ __forceinline__ void iclass_spdf10 } } +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis), o_KI, OSCALE); + GPUATOMICADD(&LOC2(devSim.oULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, devSim.nbasis, devSim.nbasis), o_JK_MM, OSCALE); + GPUATOMICADD(&LOC2(devSim.oULL, JJJ - 1, KKK - 1, devSim.nbasis, devSim.nbasis), o_JK, OSCALE); +# else atomicAdd(&LOC2(devSim.o, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis), o_KI); atomicAdd(&LOC2(devSim.o, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, devSim.nbasis, devSim.nbasis), o_JK_MM); atomicAdd(&LOC2(devSim.o, JJJ - 1, KKK - 1, devSim.nbasis, devSim.nbasis), o_JK); +# endif #if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis), ob_KI, OSCALE); + GPUATOMICADD(&LOC2(devSim.obULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, devSim.nbasis, devSim.nbasis), ob_JK_MM, OSCALE); + GPUATOMICADD(&LOC2(devSim.obULL, JJJ - 1, KKK - 1, devSim.nbasis, devSim.nbasis), ob_JK, OSCALE); +# else atomicAdd(&LOC2(devSim.ob, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis), ob_KI); atomicAdd(&LOC2(devSim.ob, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, devSim.nbasis, devSim.nbasis), ob_JK_MM); atomicAdd(&LOC2(devSim.ob, JJJ - 1, KKK - 1, devSim.nbasis, devSim.nbasis), ob_JK); +# endif #endif } +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), o_JI, OSCALE); +# else atomicAdd(&LOC2(devSim.o, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), o_JI); +# endif #if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), ob_JI, OSCALE); +# else atomicAdd(&LOC2(devSim.ob, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), ob_JI); +# endif #endif } } diff --git a/src/gpu/gpu_get2e_subs_grad.h b/src/gpu/gpu_get2e_subs_grad.h index 5247cef5..35cca44c 100644 --- a/src/gpu/gpu_get2e_subs_grad.h +++ b/src/gpu/gpu_get2e_subs_grad.h @@ -822,21 +822,21 @@ __device__ __forceinline__ void iclass_grad_spd } } - atomicAdd(&devSim.grad[AStart], AGradx); - atomicAdd(&devSim.grad[AStart + 1], AGrady); - atomicAdd(&devSim.grad[AStart + 2], AGradz); - - atomicAdd(&devSim.grad[BStart], BGradx); - atomicAdd(&devSim.grad[BStart + 1], BGrady); - atomicAdd(&devSim.grad[BStart + 2], BGradz); - - atomicAdd(&devSim.grad[CStart], CGradx); - atomicAdd(&devSim.grad[CStart + 1], CGrady); - atomicAdd(&devSim.grad[CStart + 2], CGradz); - - atomicAdd(&devSim.grad[DStart], (-AGradx - BGradx - CGradx)); - atomicAdd(&devSim.grad[DStart + 1], (-AGrady-BGrady-CGrady)); - atomicAdd(&devSim.grad[DStart + 2], (-AGradz-BGradz-CGradz)); + GPUATOMICADD(&devSim.gradULL[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 2], AGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 2], BGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[CStart], CGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 1], CGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 2], CGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[DStart], -AGradx - BGradx - CGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[DStart + 1], -AGrady - BGrady - CGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[DStart + 2], -AGradz - BGradz - CGradz, GRADSCALE); } @@ -1557,21 +1557,21 @@ __device__ __forceinline__ void iclass_grad_spdf8 //printf("FILE: %s, LINE: %d, FUNCTION: %s, devSim.hyb_coeff \n", __FILE__, __LINE__, __func__); #endif - atomicAdd(&devSim.grad[AStart], AGradx); - atomicAdd(&devSim.grad[AStart + 1], AGrady); - atomicAdd(&devSim.grad[AStart + 2], AGradz); - - atomicAdd(&devSim.grad[BStart], BGradx); - atomicAdd(&devSim.grad[BStart + 1], BGrady); - atomicAdd(&devSim.grad[BStart + 2], BGradz); - - atomicAdd(&devSim.grad[CStart], CGradx); - atomicAdd(&devSim.grad[CStart + 1], CGrady); - atomicAdd(&devSim.grad[CStart + 2], CGradz); - - atomicAdd(&devSim.grad[DStart], (-AGradx - BGradx - CGradx)); - atomicAdd(&devSim.grad[DStart + 1], (-AGrady - BGrady - CGrady)); - atomicAdd(&devSim.grad[DStart + 2], (-AGradz - BGradz - CGradz)); + GPUATOMICADD(&devSim.gradULL[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 2], AGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 2], BGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[CStart], CGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 1], CGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 2], CGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[DStart], -AGradx - BGradx - CGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[DStart + 1], -AGrady - BGrady - CGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[DStart + 2], -AGradz - BGradz - CGradz, GRADSCALE); } #endif diff --git a/src/gpu/gpu_lri_subs.h b/src/gpu/gpu_lri_subs.h index 502e187a..ed2b0c61 100644 --- a/src/gpu/gpu_lri_subs.h +++ b/src/gpu/gpu_lri_subs.h @@ -362,7 +362,11 @@ __device__ __forceinline__ void iclass_lri_spdf2 if (abs(Y) > devSim.coreIntegralCutoff) #endif { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, JJJ - 1, III - 1, nbasis, nbasis), Y, OSCALE); +#else atomicAdd(&LOC2(devSim.o, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Y); +#endif } } } diff --git a/src/gpu/gpu_lri_subs_grad.h b/src/gpu/gpu_lri_subs_grad.h index fda8b00e..a3cbb2ca 100644 --- a/src/gpu/gpu_lri_subs_grad.h +++ b/src/gpu/gpu_lri_subs_grad.h @@ -331,24 +331,24 @@ __device__ __forceinline__ void iclass_lri_grad } } - atomicAdd(&devSim.grad[AStart], AGradx); - atomicAdd(&devSim.grad[AStart + 1], AGrady); - atomicAdd(&devSim.grad[AStart + 2], AGradz); + GPUATOMICADD(&devSim.gradULL[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 2], AGradz, GRADSCALE); - atomicAdd(&devSim.grad[BStart], BGradx); - atomicAdd(&devSim.grad[BStart + 1], BGrady); - atomicAdd(&devSim.grad[BStart + 2], BGradz); + GPUATOMICADD(&devSim.gradULL[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 2], BGradz, GRADSCALE); if (iatom < devSim.natom) { - atomicAdd(&devSim.grad[CStart], -AGradx-BGradx); - atomicAdd(&devSim.grad[CStart + 1], -AGrady-BGrady); - atomicAdd(&devSim.grad[CStart + 2], -AGradz-BGradz); + GPUATOMICADD(&devSim.gradULL[CStart], -AGradx - BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 1], -AGrady - BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 2], -AGradz - BGradz, GRADSCALE); } else { CStart = (iatom - devSim.natom) * 3; - atomicAdd(&devSim.ptchg_grad[CStart], -AGradx - BGradx); - atomicAdd(&devSim.ptchg_grad[CStart + 1], -AGrady - BGrady); - atomicAdd(&devSim.ptchg_grad[CStart + 2], -AGradz - BGradz); - } + GPUATOMICADD(&devSim.ptchg_gradULL[CStart], -AGradx - BGradx, GRADSCALE); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart + 1], -AGrady - BGrady, GRADSCALE); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart + 2], -AGradz - BGradz, GRADSCALE); + } } @@ -576,26 +576,25 @@ __device__ __forceinline__ void iclass_lri_grad_spdf2 //printf("FILE: %s, LINE: %d, FUNCTION: %s, devSim.hyb_coeff \n", __FILE__, __LINE__, __func__); #endif - atomicAdd(&devSim.grad[AStart], AGradx); - atomicAdd(&devSim.grad[AStart + 1], AGrady); - atomicAdd(&devSim.grad[AStart + 2], AGradz); + GPUATOMICADD(&devSim.gradULL[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 2], AGradz, GRADSCALE); - atomicAdd(&devSim.grad[BStart], BGradx); - atomicAdd(&devSim.grad[BStart + 1], BGrady); - atomicAdd(&devSim.grad[BStart + 2], BGradz); + GPUATOMICADD(&devSim.gradULL[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 2], BGradz, GRADSCALE); if (iatom < devSim.natom) { - atomicAdd(&devSim.grad[CStart], -AGradx - BGradx); - atomicAdd(&devSim.grad[CStart + 1], -AGrady - BGrady); - atomicAdd(&devSim.grad[CStart + 2], -AGradz - BGradz); + GPUATOMICADD(&devSim.gradULL[CStart], -AGradx - BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 1], -AGrady - BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 2], -AGradz - BGradz, GRADSCALE); } else { CStart = (iatom - devSim.natom) * 3; - atomicAdd(&devSim.ptchg_grad[CStart], -AGradx - BGradx); - atomicAdd(&devSim.ptchg_grad[CStart + 1], -AGrady - BGrady); - atomicAdd(&devSim.ptchg_grad[CStart + 2], -AGradz - BGradz); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart], -AGradx - BGradx, GRADSCALE); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart + 1], -AGrady - BGrady, GRADSCALE); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart + 2], -AGradz - BGradz, GRADSCALE); } } -#endif #ifndef new_quick_2_gpu_get2e_subs_grad_h diff --git a/src/gpu/gpu_oei.h b/src/gpu/gpu_oei.h index 8abdb81f..c067b4a1 100644 --- a/src/gpu/gpu_oei.h +++ b/src/gpu/gpu_oei.h @@ -171,7 +171,12 @@ __device__ __forceinline__ void iclass_oei(unsigned int I, unsigned int J, unsig // LOC2(devSim.KLMN, 2, JJJ - 1, 3,devSim.nbasis)); // } + // Now add the contribution into Fock matrix. +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Y, OSCALE); +#else atomicAdd(&LOC2(devSim.o, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis), Y); +#endif //printf("addint_oei: %d %d %f %f %f \n", III, JJJ, devSim.cons[III-1], devSim.cons[JJJ-1], LOCSTORE(store2, i-1, j-1, STOREDIM, STOREDIM)); } diff --git a/src/gpu/gpu_oei_grad.h b/src/gpu/gpu_oei_grad.h index 23aeafee..8e6b91ea 100644 --- a/src/gpu/gpu_oei_grad.h +++ b/src/gpu/gpu_oei_grad.h @@ -154,23 +154,23 @@ __device__ void add_oei_grad(unsigned int I, unsigned int J, unsigned int II, un int BStart = (devSim.katom[JJ]-1) * 3; int CStart = iatom * 3; - atomicAdd(&devSim.grad[AStart], AGradx); - atomicAdd(&devSim.grad[AStart + 1], AGrady); - atomicAdd(&devSim.grad[AStart + 2], AGradz); - - atomicAdd(&devSim.grad[BStart], BGradx); - atomicAdd(&devSim.grad[BStart + 1], BGrady); - atomicAdd(&devSim.grad[BStart + 2], BGradz); - - if(iatom < devSim.natom){ - atomicAdd(&devSim.grad[CStart], (-AGradx-BGradx)); - atomicAdd(&devSim.grad[CStart + 1], (-AGrady-BGrady)); - atomicAdd(&devSim.grad[CStart + 2], (-AGradz-BGradz)); - }else{ - CStart = (iatom-devSim.natom) * 3; - atomicAdd(&devSim.ptchg_grad[CStart], (-AGradx-BGradx)); - atomicAdd(&devSim.ptchg_grad[CStart + 1], (-AGrady-BGrady)); - atomicAdd(&devSim.ptchg_grad[CStart + 2], (-AGradz-BGradz)); + GPUATOMICADD(&devSim.gradULL[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[AStart + 2], AGradz, GRADSCALE); + + GPUATOMICADD(&devSim.gradULL[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[BStart + 2], BGradz, GRADSCALE); + + if (iatom < devSim.natom) { + GPUATOMICADD(&devSim.gradULL[CStart], -AGradx - BGradx, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 1], -AGrady - BGrady, GRADSCALE); + GPUATOMICADD(&devSim.gradULL[CStart + 2], -AGradz - BGradz, GRADSCALE); + } else { + CStart = (iatom - devSim.natom) * 3; + GPUATOMICADD(&devSim.ptchg_gradULL[CStart], -AGradx - BGradx, GRADSCALE); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart + 1], -AGrady - BGrady, GRADSCALE); + GPUATOMICADD(&devSim.ptchg_gradULL[CStart + 2], -AGradz - BGradz, GRADSCALE); } } diff --git a/src/gpu/hip/gpu.cu b/src/gpu/hip/gpu.cu index 5e4d33b1..ca61ced0 100644 --- a/src/gpu/hip/gpu.cu +++ b/src/gpu/hip/gpu.cu @@ -57,7 +57,7 @@ extern "C" void gpu_startup_(int* ierr) #if defined DEBUG || defined DEBUGTIME gpu->debugFile = debugFile; #endif - PRINTDEBUG("CREATE NEW GPU") + PRINTDEBUG("CREATE NEW GPU"); } @@ -66,7 +66,7 @@ extern "C" void gpu_startup_(int* ierr) //----------------------------------------------- extern "C" void gpu_init_(int* ierr) { - PRINTDEBUG("BEGIN TO INIT") + PRINTDEBUG("BEGIN TO INIT"); int device = -1; int gpuCount = 0; hipError_t status; @@ -185,7 +185,7 @@ extern "C" void gpu_init_(int* ierr) gpu->sswGradThreadsPerBlock = SM_2X_SSW_GRAD_THREADS_PER_BLOCK; } - PRINTDEBUG("FINISH INIT") + PRINTDEBUG("FINISH INIT"); } @@ -288,7 +288,7 @@ extern "C" void gpu_deallocate_scratch_(bool* deallocate_gradient_scratch) //----------------------------------------------- extern "C" void gpu_shutdown_(int* ierr) { - PRINTDEBUG("BEGIN TO SHUTDOWN") + PRINTDEBUG("BEGIN TO SHUTDOWN"); delete gpu; hipDeviceReset( ); @@ -313,7 +313,7 @@ extern "C" void gpu_setup_(int* natom, int* nbasis, int* nElec, int* imult, int* hipEventCreate(&start); hipEventCreate(&end); hipEventRecord(start, 0); - PRINTDEBUG("BEGIN TO SETUP") + PRINTDEBUG("BEGIN TO SETUP"); #if defined(MPIV_GPU) fprintf(gpu->debugFile,"mpirank %i natoms %i \n", gpu->mpirank, *natom ); @@ -356,7 +356,7 @@ extern "C" void gpu_setup_(int* natom, int* nbasis, int* nElec, int* imult, int* hipEventDestroy(start); hipEventDestroy(end); - PRINTDEBUG("FINISH SETUP") + PRINTDEBUG("FINISH SETUP"); #endif @@ -431,7 +431,7 @@ extern "C" void gpu_upload_xyz_(QUICKDouble* atom_xyz) hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD COORDINATES") + PRINTDEBUG("BEGIN TO UPLOAD COORDINATES"); // gpu->gpu_basis->xyz = new gpu_buffer_type(atom_xyz, 3, gpu->natom); // gpu->gpu_basis->xyz->Upload( ); gpu->gpu_calculated->distance = new gpu_buffer_type(gpu->natom, gpu->natom); @@ -465,7 +465,7 @@ extern "C" void gpu_upload_xyz_(QUICKDouble* atom_xyz) hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING COORDINATES") + PRINTDEBUG("COMPLETE UPLOADING COORDINATES"); } @@ -474,7 +474,7 @@ extern "C" void gpu_upload_xyz_(QUICKDouble* atom_xyz) //----------------------------------------------- extern "C" void gpu_upload_atom_and_chg_(int* atom, QUICKDouble* atom_chg) { - PRINTDEBUG("BEGIN TO UPLOAD ATOM AND CHARGE") + PRINTDEBUG("BEGIN TO UPLOAD ATOM AND CHARGE"); gpu->iattype = new gpu_buffer_type(atom, gpu->natom); gpu->chg = new gpu_buffer_type(atom_chg, gpu->natom); @@ -484,7 +484,7 @@ extern "C" void gpu_upload_atom_and_chg_(int* atom, QUICKDouble* atom_chg) gpu->gpu_sim.chg = gpu->chg->_devData; gpu->gpu_sim.iattype = gpu->iattype->_devData; - PRINTDEBUG("COMPLETE UPLOADING ATOM AND CHARGE") + PRINTDEBUG("COMPLETE UPLOADING ATOM AND CHARGE"); } @@ -502,7 +502,7 @@ extern "C" void gpu_upload_cutoff_(QUICKDouble* cutMatrix, QUICKDouble* integral hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD CUTOFF") + PRINTDEBUG("BEGIN TO UPLOAD CUTOFF"); gpu->gpu_cutoff->integralCutoff = *integralCutoff; gpu->gpu_cutoff->coreIntegralCutoff = *coreIntegralCutoff; @@ -529,7 +529,7 @@ extern "C" void gpu_upload_cutoff_(QUICKDouble* cutMatrix, QUICKDouble* integral hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING CUTOFF") + PRINTDEBUG("COMPLETE UPLOADING CUTOFF"); } @@ -546,7 +546,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD CUTOFF") + PRINTDEBUG("BEGIN TO UPLOAD CUTOFF"); gpu->gpu_cutoff->natom = gpu->natom; gpu->gpu_cutoff->YCutoff = new gpu_buffer_type(YCutoff, gpu->nshell, gpu->nshell); @@ -613,7 +613,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) { @@ -649,7 +649,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP break; } flag = true; - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); if (b != 0) { @@ -702,7 +702,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 1") + PRINTDEBUG("FINISH STEP 1"); #ifdef DEBUG fprintf(gpu->debugFile,"a=%i b=%i\n", a, b); #endif @@ -731,7 +731,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP if (flag == true) break; } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) { @@ -786,7 +786,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); } } } @@ -806,7 +806,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP // } /* - PRINTDEBUG("WORKING on F Orbital") + PRINTDEBUG("WORKING on F Orbital"); gpu->gpu_basis->fStart = a; gpu->gpu_sim.fStart = a; @@ -844,7 +844,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } } - PRINTDEBUG("FINISH STEP 1") + PRINTDEBUG("FINISH STEP 1"); printf("a=%i b=%i\n", a, b); for (int i = 0; i < b - 1; i ++) { @@ -872,7 +872,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP break; } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) @@ -910,7 +910,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } flag = true; - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); } } @@ -996,7 +996,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } - PRINTDEBUG("FINISH STEP 1") + PRINTDEBUG("FINISH STEP 1"); #ifdef DEBUG fprintf(gpu->debugFile,"a=%i b=%i\n", a, b); #endif @@ -1026,7 +1026,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP break; } - PRINTDEBUG("FINISH STEP 2") + PRINTDEBUG("FINISH STEP 2"); flag = true; for (int i = 0; i < b - 1; i ++) @@ -1064,7 +1064,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP } flag = true; - PRINTDEBUG("FINISH STEP 3") + PRINTDEBUG("FINISH STEP 3"); } } } @@ -1182,7 +1182,7 @@ extern "C" void gpu_upload_cutoff_matrix_(QUICKDouble* YCutoff,QUICKDouble* cutP hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING CUTOFF") + PRINTDEBUG("COMPLETE UPLOADING CUTOFF"); } @@ -1369,10 +1369,20 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD O MATRIX") + PRINTDEBUG("BEGIN TO UPLOAD O MATRIX"); gpu->gpu_calculated->o = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); - gpu->gpu_calculated->dense = new gpu_buffer_type(dense, gpu->nbasis, gpu->nbasis); + gpu->gpu_calculated->dense = new gpu_buffer_type(dense, gpu->nbasis, gpu->nbasis); + +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->o->DeleteGPU(); + gpu->gpu_calculated->oULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + gpu->gpu_calculated->oULL->Upload(); + gpu->gpu_sim.oULL = gpu->gpu_calculated->oULL->_devData; +#else + gpu->gpu_calculated->o->Upload(); + gpu->gpu_sim.o = gpu->gpu_calculated->o->_devData; +#endif gpu->gpu_calculated->o->Upload(); gpu->gpu_sim.o = gpu->gpu_calculated->o->_devData; @@ -1391,7 +1401,7 @@ extern "C" void gpu_upload_calculated_(QUICKDouble* o, QUICKDouble* co, QUICKDou hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING O MATRIX") + PRINTDEBUG("COMPLETE UPLOADING O MATRIX"); } @@ -1407,12 +1417,19 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD BETA O MATRIX") + PRINTDEBUG("BEGIN TO UPLOAD BETA O MATRIX"); gpu->gpu_calculated->ob = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->ob->DeleteGPU(); + gpu->gpu_calculated->obULL = new gpu_buffer_type(gpu->nbasis, gpu->nbasis); + gpu->gpu_calculated->obULL->Upload(); + gpu->gpu_sim.obULL = gpu->gpu_calculated->obULL->_devData; +#else gpu->gpu_calculated->ob->Upload(); gpu->gpu_sim.ob = gpu->gpu_calculated->ob->_devData; +#endif gpu_upload_beta_density_matrix_(denseb); @@ -1426,7 +1443,7 @@ extern "C" void gpu_upload_calculated_beta_(QUICKDouble* ob, QUICKDouble* denseb hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING BETA O MATRIX") + PRINTDEBUG("COMPLETE UPLOADING BETA O MATRIX"); } @@ -1466,7 +1483,7 @@ extern "C" void gpu_upload_basis_(int* nshell, int* nprim, int* jshell, int* jba hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD BASIS") + PRINTDEBUG("BEGIN TO UPLOAD BASIS"); gpu->gpu_basis->nshell = *nshell; gpu->gpu_basis->nprim = *nprim; @@ -1808,7 +1825,7 @@ move p orbital to the end of the sequence. so the Qshell stands for the length o hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING BASIS") + PRINTDEBUG("COMPLETE UPLOADING BASIS"); } @@ -1821,12 +1838,18 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff) hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO UPLOAD GRAD") + PRINTDEBUG("BEGIN TO UPLOAD GRAD"); gpu->grad = new gpu_buffer_type(3 * gpu->natom); +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL = new gpu_buffer_type(3 * gpu->natom); + gpu->gpu_sim.gradULL = gpu->gradULL->_devData; + gpu->gradULL->Upload(); +#endif + //gpu->grad->DeleteGPU(); - gpu->gpu_sim.grad = gpu->grad->_devData; + gpu->gpu_sim.grad = gpu->grad->_devData; gpu->grad->Upload(); @@ -1843,7 +1866,7 @@ extern "C" void gpu_upload_grad_(QUICKDouble* gradCutoff) hipEventDestroy(end); #endif - PRINTDEBUG("COMPLETE UPLOADING GRAD") + PRINTDEBUG("COMPLETE UPLOADING GRAD"); } @@ -1905,9 +1928,9 @@ extern "C" void gpu_upload_cew_vrecip_(int *ierr) //Computes grid weights before grid point packing extern "C" void gpu_get_ssw_(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble *gridz, QUICKDouble *wtang, QUICKDouble *rwt, QUICKDouble *rad3, QUICKDouble *sswt, QUICKDouble *weight, int *gatm, int *count){ - PRINTDEBUG("BEGIN TO COMPUTE SSW") + PRINTDEBUG("BEGIN TO COMPUTE SSW"); - gpu->gpu_xcq->npoints = *count; + gpu->gpu_xcq->npoints = *count; gpu->xc_threadsPerBlock = SM_2X_XC_THREADS_PER_BLOCK; gpu->gpu_xcq->gridx = new gpu_buffer_type(gridx, gpu->gpu_xcq->npoints); @@ -1963,14 +1986,14 @@ extern "C" void gpu_get_ssw_(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble SAFE_DELETE(gpu->gpu_xcq->sswt); SAFE_DELETE(gpu->gpu_xcq->weight); - PRINTDEBUG("END COMPUTE SSW") + PRINTDEBUG("END COMPUTE SSW"); } void prune_grid_sswgrad() { - PRINTDEBUG("BEGIN TO UPLOAD DFT GRID FOR SSWGRAD") + PRINTDEBUG("BEGIN TO UPLOAD DFT GRID FOR SSWGRAD"); #ifdef MPIV_GPU hipEvent_t t_startp, t_endp; float t_timep; @@ -2095,7 +2118,7 @@ void prune_grid_sswgrad() upload_sim_to_constant_dft(gpu); - PRINTDEBUG("COMPLETE UPLOADING DFT GRID FOR SSWGRAD") + PRINTDEBUG("COMPLETE UPLOADING DFT GRID FOR SSWGRAD"); //Clean up temporary arrays free(tmp_gridx); @@ -2121,7 +2144,7 @@ void prune_grid_sswgrad() void gpu_get_octree_info(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble *gridz, QUICKDouble *sigrad2, unsigned char *gpweight, unsigned int *cfweight, unsigned int *pfweight, int *bin_locator, int count, double DMCutoff, double XCCutoff, int nbins) { - PRINTDEBUG("BEGIN TO OBTAIN PRIMITIVE & BASIS FUNCTION LISTS ") + PRINTDEBUG("BEGIN TO OBTAIN PRIMITIVE & BASIS FUNCTION LISTS "); gpu->gpu_xcq->npoints = count; gpu->xc_threadsPerBlock = SM_2X_XCGRAD_THREADS_PER_BLOCK; @@ -2229,14 +2252,14 @@ void gpu_get_octree_info(QUICKDouble *gridx, QUICKDouble *gridy, QUICKDouble *gr hipFree(d_cfweight); hipFree(d_pfweight); - PRINTDEBUG("PRIMITIVE & BASIS FUNCTION LISTS OBTAINED") + PRINTDEBUG("PRIMITIVE & BASIS FUNCTION LISTS OBTAINED"); } #ifdef DEBUG void print_uploaded_dft_info() { - PRINTDEBUG("PRINTING UPLOADED DFT DATA") + PRINTDEBUG("PRINTING UPLOADED DFT DATA"); fprintf(gpu->debugFile,"Number of grid points: %i \n", gpu->gpu_xcq->npoints); fprintf(gpu->debugFile,"Bin size: %i \n", gpu->gpu_xcq->bin_size); @@ -2244,7 +2267,7 @@ void print_uploaded_dft_info() fprintf(gpu->debugFile,"Number of total basis functions: %i \n", gpu->gpu_xcq->ntotbf); fprintf(gpu->debugFile,"Number of total primitive functions: %i \n", gpu->gpu_xcq->ntotpf); - PRINTDEBUG("GRID POINTS & WEIGHTS") + PRINTDEBUG("GRID POINTS & WEIGHTS"); for(int i=0; igpu_xcq->npoints; i++){ fprintf(gpu->debugFile,"Grid: %i x=%f y=%f z=%f sswt=%f weight=%f gatm=%i dweight_ssd=%i \n",i, @@ -2253,7 +2276,7 @@ void print_uploaded_dft_info() gpu->gpu_xcq->dweight_ssd->_hostData[i]); } - PRINTDEBUG("BASIS & PRIMITIVE FUNCTION LISTS") + PRINTDEBUG("BASIS & PRIMITIVE FUNCTION LISTS"); for(int bin_id=0; bin_idgpu_xcq->nbins; bin_id++){ @@ -2270,7 +2293,7 @@ void print_uploaded_dft_info() } - PRINTDEBUG("RADIUS OF SIGNIFICANCE") + PRINTDEBUG("RADIUS OF SIGNIFICANCE"); for(int i=0; inbasis; i++){ fprintf(gpu->debugFile,"ibas=%i sigrad2=%f \n", i, gpu->gpu_basis->sigrad2->_hostData[i]); @@ -2284,7 +2307,7 @@ void print_uploaded_dft_info() } } - PRINTDEBUG("END PRINTING UPLOADED DFT DATA") + PRINTDEBUG("END PRINTING UPLOADED DFT DATA"); } #endif @@ -2294,7 +2317,7 @@ extern "C" void gpu_upload_dft_grid_(QUICKDouble *gridxb, QUICKDouble *gridyb, Q *bin_counter,int *gridb_count, int *nbins, int *nbtotbf, int *nbtotpf, int *isg, QUICKDouble *sigrad2, QUICKDouble *DMCutoff, QUICKDouble *XCCutoff){ - PRINTDEBUG("BEGIN TO UPLOAD DFT GRID") + PRINTDEBUG("BEGIN TO UPLOAD DFT GRID"); gpu->gpu_xcq->npoints = *gridb_count; gpu->gpu_xcq->nbins = *nbins; @@ -2419,7 +2442,7 @@ extern "C" void gpu_upload_dft_grid_(QUICKDouble *gridxb, QUICKDouble *gridyb, Q print_uploaded_dft_info(); #endif - PRINTDEBUG("COMPLETE UPLOADING DFT GRID") + PRINTDEBUG("COMPLETE UPLOADING DFT GRID"); // int nblocks = (int) ((*gridb_count / SM_2X_XC_THREADS_PER_BLOCK) + 1); // test_xc_upload <<>>( ); @@ -2432,7 +2455,7 @@ extern "C" void gpu_upload_dft_grid_(QUICKDouble *gridxb, QUICKDouble *gridyb, Q //----------------------------------------------- extern "C" void gpu_reupload_dft_grid_() { - PRINTDEBUG("BEGIN TO UPLOAD DFT GRID") + PRINTDEBUG("BEGIN TO UPLOAD DFT GRID"); gpu->gpu_xcq->gridx->ReallocateGPU(); gpu->gpu_xcq->gridy->ReallocateGPU(); @@ -2506,7 +2529,7 @@ extern "C" void gpu_reupload_dft_grid_() reupload_pteval( ); - PRINTDEBUG("COMPLETE UPLOADING DFT GRID") + PRINTDEBUG("COMPLETE UPLOADING DFT GRID"); } @@ -2515,7 +2538,7 @@ extern "C" void gpu_reupload_dft_grid_() //----------------------------------------------- extern "C" void gpu_delete_dft_dev_grid_() { - PRINTDEBUG("DEALLOCATING DFT GRID") + PRINTDEBUG("DEALLOCATING DFT GRID"); gpu->gpu_xcq->gridx->DeleteGPU(); gpu->gpu_xcq->gridy->DeleteGPU(); @@ -2542,13 +2565,13 @@ extern "C" void gpu_delete_dft_dev_grid_() delete_pteval(true); - PRINTDEBUG("FINISHED DEALLOCATING DFT GRID") + PRINTDEBUG("FINISHED DEALLOCATING DFT GRID"); } extern "C" void gpu_delete_dft_grid_() { - PRINTDEBUG("DEALLOCATING DFT GRID") + PRINTDEBUG("DEALLOCATING DFT GRID"); SAFE_DELETE(gpu->gpu_xcq->gridx); SAFE_DELETE(gpu->gpu_xcq->gridy); @@ -2578,13 +2601,13 @@ extern "C" void gpu_delete_dft_grid_() SAFE_DELETE(gpu->gpu_xcq->mpi_bxccompute); #endif delete_pteval(false); - PRINTDEBUG("FINISHED DEALLOCATING DFT GRID") + PRINTDEBUG("FINISHED DEALLOCATING DFT GRID"); } void gpu_delete_sswgrad_vars() { - PRINTDEBUG("DEALLOCATING SSWGRAD VARIABLES") + PRINTDEBUG("DEALLOCATING SSWGRAD VARIABLES"); SAFE_DELETE(gpu->gpu_xcq->gridx_ssd); SAFE_DELETE(gpu->gpu_xcq->gridy_ssd); @@ -2597,7 +2620,7 @@ void gpu_delete_sswgrad_vars() SAFE_DELETE(gpu->gpu_xcq->mpi_bxccompute); #endif - PRINTDEBUG("FINISHED DEALLOCATING SSWGRAD VARIABLES") + PRINTDEBUG("FINISHED DEALLOCATING SSWGRAD VARIABLES"); } @@ -2658,7 +2681,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) hipEventRecord(start, 0); #endif - PRINTDEBUG("BEGIN TO RUN ADD INT") + PRINTDEBUG("BEGIN TO RUN ADD INT"); FILE *intFile; int aBuffer[BUFFERSIZE], bBuffer[BUFFERSIZE]; @@ -2687,7 +2710,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) hipStreamCreate( &stream[i] ); } - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); upload_sim_to_constant(gpu); @@ -2844,16 +2867,36 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) delete gpu->aoint_buffer[i]; } - PRINTDEBUG("COMPLETE KERNEL") - + PRINTDEBUG("COMPLETE KERNEL"); + +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } + else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + } + } +#else gpu->gpu_calculated->o->Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } +#endif gpu->gpu_calculated->o->Download(o); #ifdef DEBUG @@ -2866,7 +2909,7 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) hipEventDestroy(end); #endif - PRINTDEBUG("DELETE TEMP VARIABLES") + PRINTDEBUG("DELETE TEMP VARIABLES"); delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; @@ -2874,8 +2917,11 @@ extern "C" void gpu_addint_(QUICKDouble* o, int* intindex, char* intFileName) delete gpu->gpu_cutoff->sorted_YCutoffIJ; delete gpu->gpu_cutoff->YCutoff; delete gpu->gpu_cutoff->cutPrim; +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gpu_calculated->oULL; +#endif - PRINTDEBUG("COMPLETE RUNNING ADDINT") + PRINTDEBUG("COMPLETE RUNNING ADDINT"); } #endif @@ -2896,7 +2942,7 @@ char *trim(char *s) #ifdef COMPILE_GPU_AOINT extern "C" void gpu_aoint_(QUICKDouble* leastIntegralCutoff, QUICKDouble* maxIntegralCutoff, int* intNum, char* intFileName) { - PRINTDEBUG("BEGIN TO RUN AOINT") + PRINTDEBUG("BEGIN TO RUN AOINT"); ERI_entry a; FILE *intFile; @@ -3115,7 +3161,7 @@ extern "C" void gpu_aoint_(QUICKDouble* leastIntegralCutoff, QUICKDouble* maxInt #ifdef DEBUG fprintf(gpu->debugFile," TOTAL INT = %i \n", *intNum); #endif - PRINTDEBUG("END TO RUN AOINT KERNEL") + PRINTDEBUG("END TO RUN AOINT KERNEL"); #ifdef DEBUG clock_t end_cpu = clock(); diff --git a/src/gpu/hip/gpu.h b/src/gpu/hip/gpu.h index e310e4ce..c3b708d7 100644 --- a/src/gpu/hip/gpu.h +++ b/src/gpu/hip/gpu.h @@ -295,8 +295,6 @@ void bind_eri_texture(_gpu_type gpu); void unbind_eri_texture(); //__device__ void gpu_shell(unsigned int II, unsigned int JJ, unsigned int KK, unsigned int LL); -__device__ void addint(QUICKDouble* o, QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, int nbasis); -__device__ __forceinline__ void addint_oshell(QUICKDouble* o, QUICKDouble* ob,QUICKDouble Y, int III, int JJJ, int KKK, int LLL,QUICKDouble hybrid_coeff, QUICKDouble* dense, QUICKDouble* denseb,int nbasis); __device__ void FmT_sp(const int MaxM, const QUICKDouble X, QUICKDouble* vals); __device__ void FmT_spd(const int MaxM, const QUICKDouble X, QUICKDouble* vals); __device__ void FmT(const int MaxM, const QUICKDouble X, QUICKDouble* vals); @@ -618,7 +616,14 @@ __device__ int lefthrr_lri23(QUICKDouble RAx, QUICKDouble RAy, QUICKDouble RAz, int KLMNBx, int KLMNBy, int KLMNBz, int IJTYPE,QUICKDouble* coefAngularL, unsigned char* angularL); __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int iparent, int gid); -__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, QUICKDouble* const uw_ssd, const int iparent, const int natom); +__device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, + const QUICKDouble Exc, const QUICKDouble quadwt, +#if defined(USE_LEGACY_ATOMICS) + QUICKULL* const smemGrad, +#else + QUICKDouble* const smemGrad, +#endif + QUICKDouble* const uw_ssd, const int iparent, const int natom); __device__ QUICKDouble get_unnormalized_weight(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int iatm); __device__ QUICKDouble SSW( QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, int atm); diff --git a/src/gpu/hip/gpu_MP2.cu b/src/gpu/hip/gpu_MP2.cu index fc5d28c2..915f91d7 100644 --- a/src/gpu/hip/gpu_MP2.cu +++ b/src/gpu/hip/gpu_MP2.cu @@ -67,19 +67,20 @@ void get2e_MP2(_gpu_type gpu) #endif } + __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kernel() { - unsigned int offside = blockIdx.x*blockDim.x+threadIdx.x; - int totalThreads = blockDim.x*gridDim.x; + unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x; + int totalThreads = blockDim.x * gridDim.x; QUICKULL jshell = (QUICKULL) devSim_MP2.sqrQshell; QUICKULL myInt = (QUICKULL) jshell * jshell / totalThreads; - if ((jshell * jshell - myInt * totalThreads) > offside) + if (jshell * jshell - myInt * totalThreads > offset) myInt++; for (QUICKULL i = 1; i <= myInt; i++) { - QUICKULL currentInt = totalThreads * (i - 1) + offside; + QUICKULL currentInt = totalThreads * (i - 1) + offset; QUICKULL a = (QUICKULL) currentInt / jshell; QUICKULL b = (QUICKULL) (currentInt - a * jshell); @@ -129,13 +130,16 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) get2e_MP2_kerne if (ii <= kk) { int nshell = devSim_MP2.nshell; QUICKDouble DNMax = MAX( - MAX(4.0 * LOC2(devSim_MP2.cutMatrix, ii, jj, nshell, nshell), 4.0 * LOC2(devSim_MP2.cutMatrix, kk, ll, nshell, nshell)), - MAX(MAX(LOC2(devSim_MP2.cutMatrix, ii, ll, nshell, nshell), LOC2(devSim_MP2.cutMatrix, ii, kk, nshell, nshell)), - MAX(LOC2(devSim_MP2.cutMatrix, jj, kk, nshell, nshell), LOC2(devSim_MP2.cutMatrix, jj, ll, nshell, nshell)))); - - if ((LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell)) + MAX(4.0 * LOC2(devSim_MP2.cutMatrix, ii, jj, nshell, nshell), + 4.0 * LOC2(devSim_MP2.cutMatrix, kk, ll, nshell, nshell)), + MAX(MAX(LOC2(devSim_MP2.cutMatrix, ii, ll, nshell, nshell), + LOC2(devSim_MP2.cutMatrix, ii, kk, nshell, nshell)), + MAX(LOC2(devSim_MP2.cutMatrix, jj, kk, nshell, nshell), + LOC2(devSim_MP2.cutMatrix, jj, ll, nshell, nshell)))); + + if (LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell) > devSim_MP2.integralCutoff - && (LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell) * DNMax) + && LOC2(devSim_MP2.YCutoff, kk, ll, nshell, nshell) * LOC2(devSim_MP2.YCutoff, ii, jj, nshell, nshell) * DNMax > devSim_MP2.integralCutoff) { int iii = devSim_MP2.sorted_Qnumber[II]; int jjj = devSim_MP2.sorted_Qnumber[JJ]; @@ -348,7 +352,13 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned for (int III = III1; III <= III2; III++) { for (int JJJ = MAX(III, JJJ1); JJJ <= JJJ2; JJJ++) { + QUICKDouble o_JI = 0.0; + for (int KKK = MAX(III, KKK1); KKK <= KKK2; KKK++) { + QUICKDouble o_KI = 0.0; + QUICKDouble o_JK = 0.0; + QUICKDouble o_JK_MM = 0.0; + for (int LLL = MAX(KKK, LLL1); LLL <= LLL2; LLL++) { if (III < KKK || (III == JJJ && III == LLL) @@ -369,102 +379,108 @@ __device__ void iclass_MP2(int I, int J, int K, int L, unsigned int II, unsigned QUICKDouble DENSEJI = (QUICKDouble) LOC2(devSim_MP2.dense, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis); // ATOMIC ADD VALUE 1 - QUICKDouble _tmp = 2.0; - if (KKK == LLL) { - _tmp = 1.0; - } - - QUICKDouble val1d = _tmp * DENSELK * Y; - //if (abs(val1d) > devSim_MP2.integralCutoff) { - QUICKULL val1 = (QUICKULL) (fabs(val1d * OSCALE) + (QUICKDouble) 0.5); - if (val1d < (QUICKDouble) 0.0) - val1 = 0ull - val1; - atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val1); - // } + temp = (KKK == LLL) ? DENSELK * Y : 2.0 * DENSELK * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { + o_JI += temp; +// } // ATOMIC ADD VALUE 2 if (LLL != JJJ || III != KKK) { - _tmp = 2.0; - if (III == JJJ) { - _tmp = 1.0; - } - - QUICKDouble val2d = _tmp * DENSEJI * Y; - // if (abs(val2d) > devSim_MP2.integralCutoff) { - QUICKULL val2 = (QUICKULL) (fabs(val2d * OSCALE) + (QUICKDouble) 0.5); - if (val2d < (QUICKDouble) 0.0) - val2 = 0ull - val2; - atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), val2); - // } + temp = (III == JJJ) ? DENSEJI * Y : 2.0 * DENSEJI * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, LLL - 1, KKK - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +#endif +// } } // ATOMIC ADD VALUE 3 - QUICKDouble val3d = hybrid_coeff * 0.5 * DENSELJ * Y; - //if (abs(2 * val3d) > devSim_MP2.integralCutoff) { - QUICKULL val3 = (QUICKULL) (fabs(val3d * OSCALE) + (QUICKDouble) 0.5); - if (III == KKK && III < JJJ && JJJ < LLL) { - val3 = (QUICKULL) (fabs(2 * val3d * OSCALE) + (QUICKDouble) 0.5); - } - if (DENSELJ * Y < (QUICKDouble) 0.0) - val3 = 0ull - val3; - atomicAdd(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val3); - //} + temp = (III == KKK && III < JJJ && JJJ < LLL) + ? -(hybrid_coeff * DENSELJ * Y) : -0.5 * hybrid_coeff * DENSELJ * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { + o_KI += temp; +// } // ATOMIC ADD VALUE 4 if (KKK != LLL) { - QUICKDouble val4d = hybrid_coeff * 0.5 * DENSEKJ * Y; - // if (abs(val4d) > devSim_MP2.integralCutoff) { - QUICKULL val4 = (QUICKULL) (fabs(val4d * OSCALE) + (QUICKDouble) 0.5); - if (val4d < (QUICKDouble) 0.0) val4 = 0ull - val4; - atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val4); - //} + temp = -0.5 * hybrid_coeff * DENSEKJ * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim_MP2.o, LLL - 1, III - 1, devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +# endif +// } } // ATOMIC ADD VALUE 5 - QUICKDouble val5d = hybrid_coeff * 0.5 * DENSELI * Y; - //if (abs(val5d) > devSim_MP2.integralCutoff) { - QUICKULL val5 = (QUICKULL) (fabs(val5d * OSCALE) + (QUICKDouble) 0.5); - if (val5d < (QUICKDouble) 0.0) val5 = 0ull - val5; - + temp = -0.5 * hybrid_coeff * DENSELI * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { if ((III != JJJ && III < KKK) || (III == JJJ && III == KKK && III < LLL) - || (III == KKK && III < JJJ && JJJ < LLL)) { - atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,KKK) - 1, MIN(JJJ,KKK) - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); + || (III == KKK && III < JJJ && JJJ < LLL)) { + o_JK_MM += temp; } // ATOMIC ADD VALUE 5 - 2 if (III != JJJ && JJJ == KKK) { - atomicAdd(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val5); + o_JK += temp; } - //} +// } // ATOMIC ADD VALUE 6 - if (III != JJJ) { - if (KKK != LLL) { - QUICKDouble val6d = hybrid_coeff * 0.5 * DENSEKI * Y; - // if (abs(val6d) > devSim_MP2.integralCutoff) { - - QUICKULL val6 = (QUICKULL) (fabs(val6d * OSCALE) + (QUICKDouble) 0.5); - if (val6d < (QUICKDouble) 0.0) - val6 = 0ull - val6; - - atomicAdd(&LOC2(devSim_MP2.oULL, MAX(JJJ,LLL) - 1, MIN(JJJ,LLL) - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); - - // ATOMIC ADD VALUE 6 - 2 - if (JJJ == LLL && III != KKK) { - atomicAdd(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, - devSim_MP2.nbasis, devSim_MP2.nbasis), 0ull - val6); - } + if (III != JJJ && KKK != LLL) { + temp = -0.5 * hybrid_coeff * DENSEKI * Y; +// if (abs(temp) > devSim_MP2.integralCutoff) { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +#endif + + // ATOMIC ADD VALUE 6 - 2 + if (JJJ == LLL && III != KKK) { +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, LLL - 1, JJJ - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, LLL - 1, JJJ - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), temp); +#endif } - //} +// } } } - //} } + +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, KKK - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_KI, OSCALE); + GPUATOMICADD(&LOC2(devSim_MP2.oULL, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK_MM, OSCALE); + GPUATOMICADD(&LOC2(devSim_MP2.oULL, JJJ - 1, KKK - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, KKK - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_KI); + atomicAdd(&LOC2(devSim_MP2.o, MAX(JJJ, KKK) - 1, MIN(JJJ, KKK) - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK_MM); + atomicAdd(&LOC2(devSim_MP2.o, JJJ - 1, KKK - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JK); +#endif } + +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_MP2.oULL, JJJ - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JI, OSCALE); +#else + atomicAdd(&LOC2(devSim_MP2.o, JJJ - 1, III - 1, + devSim_MP2.nbasis, devSim_MP2.nbasis), o_JI); +#endif } } } diff --git a/src/gpu/hip/gpu_cew_quad.h b/src/gpu/hip/gpu_cew_quad.h index 3a4e0caf..882e9b0c 100644 --- a/src/gpu/hip/gpu_cew_quad.h +++ b/src/gpu/hip/gpu_cew_quad.h @@ -97,8 +97,12 @@ void get_cew_accdens(_gpu_type gpu) { int Istart = (gpu -> gpu_xcq -> gatm -> _hostData[i]-1) * 3; - for(int j=0; j<3; j++) - gpu -> cew_grad->_hostData[Istart+j] += cewGrad[j]; + for (int j = 0; j < 3; j++) +#if defined(USE_LEGACY_ATOMICS) + gpu->grad->_hostData[Istart + j] += cewGrad[j]; +#else + gpu->cew_grad->_hostData[Istart + j] += cewGrad[j]; +#endif } delete gridpt; @@ -139,7 +143,11 @@ __global__ void getcew_quad_kernel() QUICKDouble _tmp = phi * phi2 * dfdr * weight; +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp, OSCALE); +#else atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); +#endif } } } @@ -155,13 +163,23 @@ __global__ void oshell_getcew_quad_grad_kernel() __global__ void cshell_getcew_quad_grad_kernel() #endif { +#if defined(USE_LEGACY_ATOMICS) + //declare smem grad vector + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + // initialize smem grad + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0ull; +#else //declare smem grad vector extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0.0; +#endif __syncthreads(); @@ -223,9 +241,9 @@ __global__ void cshell_getcew_quad_grad_kernel() QUICKDouble Gradz = - 2.0 * denseij * weight * (dfdr * dphidz * phi2); //printf("test quad grad %f %f %f %f %f %f %f %f %f %f\n", gridx, gridy, gridz, denseij, weight, dfdr, dphidx, dphidy, dphidz, phi2); - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); + GPUATOMICADD(&smemGrad[Istart], Gradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], Grady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], Gradz, GRADSCALE); sumGradx += Gradx; sumGrady += Grady; sumGradz += Gradz; @@ -235,9 +253,9 @@ __global__ void cshell_getcew_quad_grad_kernel() int Istart = (devSim_dft.gatm[gid]-1)*3; - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); + GPUATOMICADD(&smemGrad[Istart], -sumGradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], -sumGrady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], -sumGradz, GRADSCALE); } //Set weights for sswder calculation @@ -253,8 +271,12 @@ __global__ void cshell_getcew_quad_grad_kernel() __syncthreads(); // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else + atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif __syncthreads(); } diff --git a/src/gpu/hip/gpu_get2e.cu b/src/gpu/hip/gpu_get2e.cu index b66fd8a0..32a3cfcc 100644 --- a/src/gpu/hip/gpu_get2e.cu +++ b/src/gpu/hip/gpu_get2e.cu @@ -803,7 +803,152 @@ __global__ void __launch_bounds__(SM_2X_2E_THREADS_PER_BLOCK, 1) getAddInt_kerne // } else if( devSim.method == LIBXC) { // hybrid_coeff = devSim.hyb_coeff; // } - addint(devSim.o, a[k].value, III, JJJ, KKK, LLL, devSim.hyb_coeff, devSim.dense, devSim.nbasis); + +#if defined(OSHELL) + QUICKDouble DENSELK = (QUICKDouble) (LOC2(devSim.dense, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis) + + LOC2(devSim.denseb, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis)); + QUICKDouble DENSEJI = (QUICKDouble) (LOC2(devSim.dense, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis) + + LOC2(devSim.denseb, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis)); + + QUICKDouble DENSEKIA = (QUICKDouble) LOC2(devSim.dense, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEKJA = (QUICKDouble) LOC2(devSim.dense, KKK - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELJA = (QUICKDouble) LOC2(devSim.dense, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELIA = (QUICKDouble) LOC2(devSim.dense, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis); + + QUICKDouble DENSEKIB = (QUICKDouble) LOC2(devSim.denseb, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEKJB = (QUICKDouble) LOC2(devSim.denseb, KKK - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELJB = (QUICKDouble) LOC2(devSim.denseb, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELIB = (QUICKDouble) LOC2(devSim.denseb, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis); +#else + QUICKDouble DENSEKI = (QUICKDouble) LOC2(devSim.dense, KKK - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEKJ = (QUICKDouble) LOC2(devSim.dense, KKK - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELJ = (QUICKDouble) LOC2(devSim.dense, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELI = (QUICKDouble) LOC2(devSim.dense, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSELK = (QUICKDouble) LOC2(devSim.dense, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis); + QUICKDouble DENSEJI = (QUICKDouble) LOC2(devSim.dense, JJJ - 1, III - 1, devSim.nbasis, devSim.nbasis); +#endif + + // ATOMIC ADD VALUE 1 + temp = (KKK == LLL) ? DENSELK * a[k].value : 2.0 * DENSELK * a[k].value; + o_JI += temp; +#if defined(OSHELL) + ob_JI += temp; +#endif + + // ATOMIC ADD VALUE 2 + if (LLL != JJJ || III != KKK) { + temp = (III == JJJ) ? DENSEJI * a[k].value : 2.0 * DENSEJI * a[k].value; +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.ob, LLL - 1, KKK - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#endif + } + + // ATOMIC ADD VALUE 3 +#if defined(OSHELL) + temp = (III == KKK && III < JJJ && JJJ < LLL) + ? -2.0 * devSim.hyb_coeff * DENSELJA * a[k].value : -(devSim.hyb_coeff * DENSELJA * a[k].value); + temp2 = (III == KKK && III < JJJ && JJJ < LLL) + ? -2.0 * devSim.hyb_coeff * DENSELJB * a[k].value : -(devSim.hyb_coeff * DENSELJB * a[k].value); + o_KI += temp; + ob_KI += temp2; +#else + temp = (III == KKK && III < JJJ && JJJ < LLL) + ? -(devSim.hyb_coeff * DENSELJ * a[k].value) : -0.5 * devSim.hyb_coeff * DENSELJ * a[k].value; + o_KI += temp; +#endif + + // ATOMIC ADD VALUE 4 + if (KKK != LLL) { +#if defined(OSHELL) + temp = -(devSim.hyb_coeff * DENSEKJA * a[k].value); + temp2 = -(devSim.hyb_coeff * DENSEKJB * a[k].value); +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp); + atomicAdd(&LOC2(devSim.ob, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif +#else + temp = -0.5 * devSim.hyb_coeff * DENSEKJ * a[k].value; +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, III - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#endif + } + + // ATOMIC ADD VALUE 5 +#if defined(OSHELL) + temp = -(devSim.hyb_coeff * DENSELIA * a[k].value); + temp2 = -(devSim.hyb_coeff * DENSELIB * a[k].value); +#else + temp = -0.5 * devSim.hyb_coeff * DENSELI * a[k].value; +#endif + if ((III != JJJ && III < KKK) + || (III == JJJ && III == KKK && III < LLL) + || (III == KKK && III < JJJ && JJJ < LLL)) { + o_JK_MM += temp; +#if defined(OSHELL) + ob_JK_MM += temp2; +#endif + } + + // ATOMIC ADD VALUE 5 - 2 + if (III != JJJ && JJJ == KKK) { + o_JK += temp; +#if defined(OSHELL) + ob_JK += temp2; +#endif + } + + // ATOMIC ADD VALUE 6 + if (III != JJJ && KKK != LLL) { +#if defined(OSHELL) + temp = -(devSim.hyb_coeff * DENSEKIA * a[k].value); + temp2 = -(devSim.hyb_coeff * DENSEKIB * a[k].value); +#else + temp = -0.5 * devSim.hyb_coeff * DENSEKI * a[k].value; +#endif +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else + atomicAdd(&LOC2(devSim.ob, MAX(JJJ, LLL) - 1, MIN(JJJ, LLL) - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif +#endif + + // ATOMIC ADD VALUE 6 - 2 + if (JJJ == LLL && III != KKK) { +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.oULL, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp, OSCALE); +# else + atomicAdd(&LOC2(devSim.o, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp); +# endif +#if defined(OSHELL) +# if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim.obULL, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp2, OSCALE); +# else + atomicAdd(&LOC2(devSim.ob, LLL - 1, JJJ - 1, devSim.nbasis, devSim.nbasis), temp2); +# endif +#endif + } + } } } diff --git a/src/gpu/hip/gpu_get2e_getxc_drivers.h b/src/gpu/hip/gpu_get2e_getxc_drivers.h index 7b2ea34b..bda82664 100644 --- a/src/gpu/hip/gpu_get2e_getxc_drivers.h +++ b/src/gpu/hip/gpu_get2e_getxc_drivers.h @@ -27,55 +27,102 @@ extern "C" void gpu_get_oshell_eri_(bool *deltaO, QUICKDouble* o, QUICKDouble* o extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) #endif { - PRINTDEBUG("BEGIN TO RUN GET ERI") + PRINTDEBUG("BEGIN TO RUN GET ERI"); upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); #ifdef OSHELL - get_oshell_eri(gpu); + get_oshell_eri(gpu); #else get2e(gpu); #endif - PRINTDEBUG("COMPLETE KERNEL") - - gpu -> gpu_calculated -> o -> Download(); - hipMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); + PRINTDEBUG("COMPLETE KERNEL"); + +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + hipMemsetAsync(gpu->gpu_calculated->oULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } + else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData , i , j , gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData , j , i , gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + } + } - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); +# if defined(OSHELL) + gpu->gpu_calculated->obULL->Download(); + hipMemsetAsync(gpu->gpu_calculated->obULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } + else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> Download(); - hipMemsetAsync(gpu -> gpu_calculated -> ob -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); +# endif +#else + gpu->gpu_calculated->o->Download(); + hipMemsetAsync(gpu->gpu_calculated->o->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); + } + } +# if defined(OSHELL) + gpu->gpu_calculated->ob->Download(); + hipMemsetAsync(gpu->gpu_calculated->ob->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } +# endif #endif - gpu -> gpu_calculated -> o -> DownloadSum(o); - -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> DownloadSum(ob); + gpu->gpu_calculated->o->DownloadSum(o); +#if defined(OSHELL) + gpu->gpu_calculated->ob->DownloadSum(ob); #endif - PRINTDEBUG("DELETE TEMP VARIABLES") + PRINTDEBUG("DELETE TEMP VARIABLES"); - if(gpu -> gpu_sim.method == HF){ + if (gpu->gpu_sim.method == HF) { delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; - -#ifdef OSHELL +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gpu_calculated->oULL; +# if defined(OSHELL) + delete gpu->gpu_calculated->obULL; +# endif +#endif +#if defined(OSHELL) delete gpu->gpu_calculated->ob; delete gpu->gpu_calculated->denseb; #endif - - }else if(*deltaO != 0){ + } else if (*deltaO != 0) { delete gpu->gpu_calculated->dense; #ifdef OSHELL delete gpu->gpu_calculated->denseb; @@ -84,7 +131,7 @@ extern "C" void gpu_get_cshell_eri_(bool *deltaO, QUICKDouble* o) delete gpu->gpu_cutoff->cutMatrix; - PRINTDEBUG("COMPLETE RUNNING GET2E") + PRINTDEBUG("COMPLETE RUNNING GET2E"); } @@ -94,9 +141,9 @@ extern "C" void gpu_get_oshell_eri_grad_(QUICKDouble* grad) extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) #endif { - PRINTDEBUG("BEGIN TO RUN GRAD") + PRINTDEBUG("BEGIN TO RUN GRAD"); upload_sim_to_constant(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); if(gpu -> gpu_sim.is_oshell == true){ get_oshell_eri_grad(gpu); @@ -116,24 +163,44 @@ extern "C" void gpu_get_cshell_eri_grad_(QUICKDouble* grad) } #endif - PRINTDEBUG("COMPLETE KERNEL") + PRINTDEBUG("COMPLETE KERNEL"); + + if (gpu->gpu_sim.method == HF) { +#if defined(USE_LEGACY_ATOMICS) + gpu -> gradULL -> Download(); + + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } - if(gpu -> gpu_sim.method == HF){ - gpu -> grad -> Download(); + gpu->grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else + gpu->grad->Download(); +#endif } - if(gpu -> gpu_sim.method == HF){ - gpu -> grad -> DownloadSum(grad); + if (gpu -> gpu_sim.method == HF) { + gpu->grad->DownloadSum(grad); - delete gpu -> grad; + delete gpu->grad; +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gradULL; +#endif delete gpu->gpu_calculated->dense; -#ifdef OSHELL +#if defined(OSHELL) delete gpu->gpu_calculated->denseb; #endif } - PRINTDEBUG("COMPLETE RUNNING GRAD") + PRINTDEBUG("COMPLETE RUNNING GRAD"); } @@ -143,56 +210,143 @@ extern "C" void gpu_get_oshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICK extern "C" void gpu_get_cshell_xc_(QUICKDouble* Eelxc, QUICKDouble* aelec, QUICKDouble* belec, QUICKDouble *o) #endif { - PRINTDEBUG("BEGIN TO RUN GETXC") + PRINTDEBUG("BEGIN TO RUN GETXC"); gpu->DFT_calculated = new gpu_buffer_type(1, 1); +#if defined(USE_LEGACY_ATOMICS) + QUICKULL valUII = (QUICKULL) (fabs(*Eelxc * OSCALE) + 0.5); + if (*Eelxc < 0.0) { + valUII = 0ull - valUII; + } + gpu->DFT_calculated->_hostData[0].Eelxc = valUII; + + valUII = (QUICKULL) (fabs(*aelec * OSCALE) + 0.5); + if (*aelec < 0.0) { + valUII = 0ull - valUII; + } + gpu->DFT_calculated->_hostData[0].aelec = valUII; + + valUII = (QUICKULL) (fabs(*belec * OSCALE) + 0.5); + if (*belec < 0.0) { + valUII = 0ull - valUII; + } + gpu->DFT_calculated->_hostData[0].belec = valUII; +#else gpu->DFT_calculated->_hostData[0].Eelxc = 0.0; gpu->DFT_calculated->_hostData[0].aelec = 0.0; gpu->DFT_calculated->_hostData[0].belec = 0.0; +#endif gpu->DFT_calculated->Upload(); gpu->gpu_sim.DFT_calculated = gpu->DFT_calculated->_devData; upload_sim_to_constant_dft(gpu); - PRINTDEBUG("BEGIN TO RUN KERNEL") + PRINTDEBUG("BEGIN TO RUN KERNEL"); getxc(gpu); gpu->DFT_calculated->Download(); - gpu -> gpu_calculated -> o -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> Download(); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->ob->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); +# if defined(OSHELL) + gpu->gpu_calculated->obULL->Download(); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->obULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble)(valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->ob->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } -#endif +# endif + + QUICKULL valULL = gpu->DFT_calculated -> _hostData[0].Eelxc; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + *Eelxc = (QUICKDouble) valDB * ONEOVEROSCALE; - *Eelxc = gpu->DFT_calculated -> _hostData[0].Eelxc; - *aelec = gpu->DFT_calculated -> _hostData[0].aelec; - *belec = gpu->DFT_calculated -> _hostData[0].belec; + valULL = gpu->DFT_calculated->_hostData[0].aelec; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + *aelec = (QUICKDouble) valDB * ONEOVEROSCALE; - gpu -> gpu_calculated -> o -> DownloadSum(o); -#ifdef OSHELL - gpu -> gpu_calculated -> ob -> DownloadSum(ob); + valULL = gpu->DFT_calculated->_hostData[0].belec; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + *belec = (QUICKDouble) valDB * ONEOVEROSCALE; +#else + gpu->gpu_calculated->o->Download(); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); + } + } + +# if defined(OSHELL) + gpu->gpu_calculated->ob->Download(); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->ob->_hostData, i, j,gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->ob->_hostData, j, i, gpu->nbasis, gpu->nbasis); + } + } +# endif + + *Eelxc = gpu->DFT_calculated->_hostData[0].Eelxc; + *aelec = gpu->DFT_calculated->_hostData[0].aelec; + *belec = gpu->DFT_calculated->_hostData[0].belec; +#endif + + gpu->gpu_calculated->o->DownloadSum(o); +#if defined(OSHELL) + gpu->gpu_calculated->ob->DownloadSum(ob); #endif - PRINTDEBUG("DELETE TEMP VARIABLES") + PRINTDEBUG("DELETE TEMP VARIABLES"); delete gpu->gpu_calculated->o; delete gpu->gpu_calculated->dense; - -#ifdef OSHELL +#if defined(OSHELL) delete gpu->gpu_calculated->ob; delete gpu->gpu_calculated->denseb; #endif +#if defined(USE_LEGACY_ATOMICS) + delete gpu->gpu_calculated->oULL; +# if defined(OSHELL) + delete gpu->gpu_calculated->obULL; +# endif +#endif } @@ -202,8 +356,8 @@ extern "C" void gpu_get_oshell_xcgrad_(QUICKDouble *grad) extern "C" void gpu_get_cshell_xcgrad_(QUICKDouble *grad) #endif { -#if defined(CEW) - gpu -> cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); +#if defined(CEW) && !defined(USE_LEGACY_ATOMICS) + gpu->cew_grad = new gpu_buffer_type(3 * gpu->nextatom); #endif // calculate smem size @@ -214,16 +368,35 @@ extern "C" void gpu_get_cshell_xcgrad_(QUICKDouble *grad) memset(gpu->grad->_hostData, 0, gpu -> gpu_xcq -> smem_size); getxc_grad(gpu); - gpu -> grad -> Download(); +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->grad->_hostData[i] += (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else + gpu->grad->Download(); +#endif - gpu -> grad -> DownloadSum(grad); + gpu->grad->DownloadSum(grad); -#if defined(CEW) - gpu -> cew_grad->DownloadSum(grad); - delete gpu -> cew_grad; +#if defined(CEW) && !defined(USE_LEGACY_ATOMICS) + gpu->cew_grad->DownloadSum(grad); + delete gpu->cew_grad; #endif - delete gpu -> grad; + delete gpu->grad; +#ifdef USE_LEGACY_ATOMICS + delete gpu->gradULL; +#endif delete gpu->gpu_calculated->dense; #ifdef OSHELL @@ -253,14 +426,35 @@ extern "C" void gpu_get_oei_(QUICKDouble* o) getOEI(gpu); - gpu -> gpu_calculated -> o -> Download(); - hipMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + + hipMemsetAsync(gpu->gpu_calculated->oULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); for (int i = 0; i< gpu->nbasis; i++) { for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis); + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; } } +#else + gpu->gpu_calculated->o-> Download(); + hipMemsetAsync(gpu->gpu_calculated->o->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis); + } + } +#endif /* for (int i = 0; i< gpu->nbasis; i++) { @@ -282,10 +476,18 @@ extern "C" void gpu_get_oei_(QUICKDouble* o) extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) { // upload point charge grad vector - if(gpu -> nextatom > 0) { - gpu -> ptchg_grad = new gpu_buffer_type(3 * gpu -> nextatom); - gpu -> ptchg_grad -> Upload(); - gpu -> gpu_sim.ptchg_grad = gpu -> ptchg_grad -> _devData; + if (gpu->nextatom > 0) { + gpu->ptchg_grad = new gpu_buffer_type(3 * gpu->nextatom); + +#if defined(USE_LEGACY_ATOMICS) + gpu->ptchg_gradULL = new gpu_buffer_type(3 * gpu->nextatom); + gpu->ptchg_gradULL->Upload(); + gpu->gpu_sim.ptchg_gradULL = gpu->ptchg_gradULL->_devData; + gpu->ptchg_grad->DeleteGPU(); +#else + gpu->ptchg_grad->Upload(); + gpu->gpu_sim.ptchg_grad = gpu->ptchg_grad->_devData; +#endif } upload_sim_to_constant_oei(gpu); @@ -293,8 +495,24 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_oei_grad(gpu); // download gradients +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + hipMemsetAsync(gpu->gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->natom); + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->grad->Download(); - hipMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); + hipMemsetAsync(gpu->grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#endif gpu->grad->DownloadSum(grad); @@ -304,9 +522,27 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } */ // download point charge gradients - if(gpu -> nextatom > 0) { + if (gpu -> nextatom > 0) { +#if defined(USE_LEGACY_ATOMICS) + gpu->ptchg_gradULL->Download(); + + hipMemsetAsync(gpu->ptchg_gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->nextatom); + + for (int i = 0; i < 3 * gpu->nextatom; i++) { + QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->ptchg_grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->ptchg_grad->Download(); - hipMemsetAsync(gpu -> ptchg_grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->nextatom); + hipMemsetAsync(gpu->ptchg_grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->nextatom); +#endif /* for(int i=0; i<3*gpu->nextatom; ++i){ printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); @@ -316,8 +552,11 @@ extern "C" void gpu_get_oei_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0 && !gpu->gpu_sim.use_cew) { - SAFE_DELETE(gpu -> ptchg_grad); + if (gpu->nextatom > 0 && !gpu->gpu_sim.use_cew) { + SAFE_DELETE(gpu->ptchg_grad); +#if defined(USE_LEGACY_ATOMICS) + SAFE_DELETE(gpu->ptchg_gradULL); +#endif } } @@ -348,14 +587,35 @@ extern "C" void gpu_get_lri_(QUICKDouble* o) upload_sim_to_constant_dft(gpu); getcew_quad(gpu); - gpu -> gpu_calculated -> o -> Download(); - hipMemsetAsync(gpu -> gpu_calculated -> o -> _devData, 0, sizeof(QUICKDouble)*gpu->nbasis*gpu->nbasis); +#if defined(USE_LEGACY_ATOMICS) + gpu->gpu_calculated->oULL->Download(); + + hipMemsetAsync(gpu->gpu_calculated->oULL->_devData, 0, sizeof(QUICKULL) * gpu->nbasis * gpu->nbasis); + + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + QUICKULL valULL = LOC2(gpu->gpu_calculated->oULL->_hostData, j, i, gpu->nbasis, gpu->nbasis); + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis) = (QUICKDouble) valDB * ONEOVEROSCALE; + } + } +#else + gpu->gpu_calculated->o->Download(); + hipMemsetAsync(gpu->gpu_calculated->o->_devData, 0, sizeof(QUICKDouble) * gpu->nbasis * gpu->nbasis); - for (int i = 0; i< gpu->nbasis; i++) { - for (int j = i; j< gpu->nbasis; j++) { - LOC2(gpu->gpu_calculated->o->_hostData,i,j,gpu->nbasis, gpu->nbasis) = LOC2(gpu->gpu_calculated->o->_hostData,j,i,gpu->nbasis, gpu->nbasis); + for (int i = 0; i < gpu->nbasis; i++) { + for (int j = i; j < gpu->nbasis; j++) { + LOC2(gpu->gpu_calculated->o->_hostData, i, j, gpu->nbasis, gpu->nbasis) + = LOC2(gpu->gpu_calculated->o->_hostData, j, i, gpu->nbasis, gpu->nbasis); } } +#endif /* int idxf90=0; for (int i = 0; i< gpu->nbasis; i++) { @@ -383,8 +643,24 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) get_lri_grad(gpu); // download gradients +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + hipMemsetAsync(gpu->gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->natom); + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->grad->Download(); - hipMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); + hipMemsetAsync(gpu->grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#endif gpu->grad->DownloadSum(grad); @@ -394,8 +670,24 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } */ // download point charge gradients - if(gpu -> nextatom > 0) { + if (gpu->nextatom > 0) { +#if defined(USE_LEGACY_ATOMICS) + gpu->ptchg_gradULL->Download(); + + for (int i = 0; i < 3 * gpu->nextatom; i++) { + QUICKULL valULL = gpu->ptchg_gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + gpu->ptchg_grad->_hostData[i] = (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->ptchg_grad->Download(); +#endif /* for(int i=0; i<3*gpu->nextatom; ++i){ printf("ptchg_grad: %d %f \n", i, gpu->ptchg_grad->_hostData[i]); @@ -405,18 +697,25 @@ extern "C" void gpu_get_lri_grad_(QUICKDouble* grad, QUICKDouble* ptchg_grad) } // ptchg_grad is no longer needed. reclaim the memory. - if(gpu -> nextatom > 0) { - SAFE_DELETE(gpu -> ptchg_grad); + if (gpu->nextatom > 0) { + SAFE_DELETE(gpu->ptchg_grad); +#if defined(USE_LEGACY_ATOMICS) + SAFE_DELETE(gpu->ptchg_gradULL); +#endif } } extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad) { - gpu->cew_grad = new gpu_buffer_type(3 * gpu -> nextatom); +#if defined(USE_LEGACY_ATOMICS) + memset(gpu->grad->_hostData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#else + gpu->cew_grad = new gpu_buffer_type(3 * gpu->nextatom); +#endif // calculate smem size - gpu -> gpu_xcq -> smem_size = gpu->natom * 3 * sizeof(QUICKULL); + gpu->gpu_xcq->smem_size = sizeof(QUICKULL) * gpu->natom * 3; //compute xc quad potential upload_sim_to_constant_dft(gpu); @@ -424,13 +723,34 @@ extern "C" void gpu_getcew_grad_quad_(QUICKDouble* grad) getcew_quad_grad(gpu); // download gradients +#if defined(USE_LEGACY_ATOMICS) + gpu->gradULL->Download(); + hipMemsetAsync(gpu->gradULL->_devData, 0, sizeof(QUICKULL) * 3 * gpu->natom); + + for (int i = 0; i < 3 * gpu->natom; i++) { + QUICKULL valULL = gpu->gradULL->_hostData[i]; + QUICKDouble valDB; + if (valULL >= 0x8000000000000000ull) { + valDB = -(QUICKDouble) (valULL ^ 0xffffffffffffffffull); + } else { + valDB = (QUICKDouble) valULL; + } + + // make sure to add rather than assign. we already computed one part of the cew + // gradients on host asynchronously. + gpu->grad->_hostData[i] += (QUICKDouble) valDB * ONEOVERGRADSCALE; + } +#else gpu->grad->Download(); - hipMemsetAsync(gpu -> grad -> _devData, 0, sizeof(QUICKDouble)*3*gpu->natom); + hipMemsetAsync(gpu->grad->_devData, 0, sizeof(QUICKDouble) * 3 * gpu->natom); +#endif gpu->grad->DownloadSum(grad); - gpu -> cew_grad ->DownloadSum(grad); - SAFE_DELETE(gpu -> cew_grad); +#if !defined(USE_LEGACY_ATOMICS) + gpu->cew_grad->DownloadSum(grad); + SAFE_DELETE(gpu->cew_grad); +#endif } #endif #endif diff --git a/src/gpu/hip/gpu_get2e_grad_ffff.cu b/src/gpu/hip/gpu_get2e_grad_ffff.cu index f73e00fc..ca535bdf 100644 --- a/src/gpu/hip/gpu_get2e_grad_ffff.cu +++ b/src/gpu/hip/gpu_get2e_grad_ffff.cu @@ -311,7 +311,11 @@ void getGrad_ffff(_gpu_type gpu) int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ; char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute; char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN; +#if defined(USE_LEGACY_ATOMICS) + grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.gradULL; +#else grad_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.grad; +#endif } LOC3(trans, 0, 0, 0, TRANSDIM, TRANSDIM, TRANSDIM) = 1; diff --git a/src/gpu/hip/gpu_get2e_grad_ffff.cuh b/src/gpu/hip/gpu_get2e_grad_ffff.cuh index da7702b9..6b5b9982 100644 --- a/src/gpu/hip/gpu_get2e_grad_ffff.cuh +++ b/src/gpu/hip/gpu_get2e_grad_ffff.cuh @@ -1616,26 +1616,21 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm //printf("FILE: %s, LINE: %d, FUNCTION: %s, DEV_SIM_DBL_HYB_COEFF \n", __FILE__, __LINE__, __func__); #endif - atomicAdd(&DEV_SIM_PTR_GRAD[AStart], AGradx); - atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 1], AGrady); - atomicAdd(&DEV_SIM_PTR_GRAD[AStart + 2], AGradz); - - - atomicAdd(&DEV_SIM_PTR_GRAD[BStart], BGradx); - atomicAdd(&DEV_SIM_PTR_GRAD[BStart + 1], BGrady); - atomicAdd(&DEV_SIM_PTR_GRAD[BStart + 2], BGradz); - - - atomicAdd(&DEV_SIM_PTR_GRAD[CStart], CGradx); - atomicAdd(&DEV_SIM_PTR_GRAD[CStart + 1], CGrady); - atomicAdd(&DEV_SIM_PTR_GRAD[CStart + 2], CGradz); - - - atomicAdd(&DEV_SIM_PTR_GRAD[DStart], (-AGradx-BGradx-CGradx)); - atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 1], (-AGrady-BGrady-CGrady)); - atomicAdd(&DEV_SIM_PTR_GRAD[DStart + 2], (-AGradz-BGradz-CGradz)); - - return; + GPUATOMICADD(&DEV_SIM_PTR_GRAD[AStart], AGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[AStart + 1], AGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[AStart + 2], AGradz, GRADSCALE); + + GPUATOMICADD(&DEV_SIM_PTR_GRAD[BStart], BGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[BStart + 1], BGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[BStart + 2], BGradz, GRADSCALE); + + GPUATOMICADD(&DEV_SIM_PTR_GRAD[CStart], CGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[CStart + 1], CGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[CStart + 2], CGradz, GRADSCALE); + + GPUATOMICADD(&DEV_SIM_PTR_GRAD[DStart], -AGradx - BGradx - CGradx, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[DStart + 1], -AGrady - BGrady - CGrady, GRADSCALE); + GPUATOMICADD(&DEV_SIM_PTR_GRAD[DStart + 2], -AGradz - BGradz - CGradz, GRADSCALE); } diff --git a/src/gpu/hip/gpu_getxc.cu b/src/gpu/hip/gpu_getxc.cu index b60afc59..eb08a7c1 100644 --- a/src/gpu/hip/gpu_getxc.cu +++ b/src/gpu/hip/gpu_getxc.cu @@ -355,14 +355,23 @@ __global__ void get_ssw_kernel() { __global__ void get_sswgrad_kernel() { - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = blockDim.x * gridDim.x; +#if defined(USE_LEGACY_ATOMICS) + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) { + smemGrad[i] = 0ull; + } +#else + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) { smemGrad[i] = 0.0; } +#endif __syncthreads(); for (QUICKULL gid = offset; gid < devSim_dft.npoints_ssd; gid += totalThreads) { @@ -378,7 +387,11 @@ __global__ void get_sswgrad_kernel() { __syncthreads(); for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) { +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif } __syncthreads(); @@ -391,16 +404,25 @@ __global__ void get_sswgrad_kernel() { computing x, y and z gradients separately. */ __global__ void get_sswnumgrad_kernel() { - extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x; unsigned int totalThreads = blockDim.x * gridDim.x; unsigned int natom = devSim_dft.natom; QUICKULL npoints_ssd = devSim_dft.npoints_ssd; +#if defined(USE_LEGACY_ATOMICS) + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + for (int i = threadIdx.x; i < natom * 3; i += blockDim.x) { + smemGrad[i] = 0ull; + } +#else + extern __shared__ QUICKDouble smem_buffer[]; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; for (int i = threadIdx.x; i < natom * 3; i += blockDim.x) { smemGrad[i] = 0.0; } +#endif __syncthreads(); @@ -505,9 +527,9 @@ __global__ void get_sswnumgrad_kernel() { if(iatom == gatm-1) zparent = zatm; - atomicAdd(&smemGrad[iatom*3], dpx); - atomicAdd(&smemGrad[iatom*3+1], dpy); - atomicAdd(&smemGrad[iatom*3+2], dpz); + GPUATOMICADD(&smemGrad[iatom * 3], dpx, GRADSCALE); + GPUATOMICADD(&smemGrad[iatom * 3 + 1], dpy, GRADSCALE); + GPUATOMICADD(&smemGrad[iatom * 3 + 2], dpz, GRADSCALE); /* printf("sswgrad %f %f %f %d %d %f %f %f \n", gridx, gridy, gridz, iatom, 1, dpx, devSim_dft.exc_ssd[idx], devSim_dft.quadwt[idx]); @@ -523,8 +545,12 @@ __global__ void get_sswnumgrad_kernel() { __syncthreads(); // update gmem grad vector - for(int i = threadIdx.x; i< natom * 3; i+=blockDim.x) - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + for (int i = threadIdx.x; i < natom * 3; i += blockDim.x) +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else + atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif __syncthreads(); @@ -932,7 +958,12 @@ __device__ QUICKDouble get_uw_ssd(const QUICKDouble gridx, const QUICKDouble gri __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, const QUICKDouble gridz, - const QUICKDouble Exc, const QUICKDouble quadwt, QUICKDouble* const smemGrad, + const QUICKDouble Exc, const QUICKDouble quadwt, +#if defined(USE_LEGACY_ATOMICS) + QUICKULL* const smemGrad, +#else + QUICKDouble* const smemGrad, +#endif QUICKDouble* const uw_ssd, const int iparent, const int natom) { QUICKDouble sumUW = 0.0; @@ -961,7 +992,8 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons for (int i = 0; i < natom; i++) { for (int j = 0; j < 3; j++) { if (abs(uw) > devSim_dft.DMCutoff) { - atomicAdd(&smemGrad[i * 3 + j], LOCUWSSD(uw_ssd, j, i, 3, natom) * Exc * quadwt * uw * (-p / sumUW)); + GPUATOMICADD(&smemGrad[i * 3 + j], + LOCUWSSD(uw_ssd, j, i, 3, natom) * Exc * quadwt * uw * (-p / sumUW), GRADSCALE); } } } @@ -978,7 +1010,8 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons for (int i = 0; i < natom; i++) { for (int j = 0; j < 3; j++) { if (abs(parent_uw) > devSim_dft.DMCutoff) { - atomicAdd(&smemGrad[i * 3 + j], LOCUWSSD(uw_ssd, j, i, 3, natom) * (1.0 / sumUW) * Exc * quadwt * parent_uw); + GPUATOMICADD(&smemGrad[i * 3 + j], + LOCUWSSD(uw_ssd, j, i, 3, natom) * (1.0 / sumUW) * Exc * quadwt * parent_uw, GRADSCALE); } } } @@ -986,8 +1019,13 @@ __device__ void sswanader(const QUICKDouble gridx, const QUICKDouble gridy, cons __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, - QUICKDouble Exc, QUICKDouble quadwt, QUICKDouble* smemGrad, int - iparent, int gid) + QUICKDouble Exc, QUICKDouble quadwt, +#if defined(USE_LEGACY_ATOMICS) + QUICKULL* smemGrad, +#else + QUICKDouble* smemGrad, +#endif + int iparent, int gid) { /* This subroutine calculates the derivatives of weight found in @@ -1137,9 +1175,9 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, #endif // We should now have the derivatives of the SS weights. Now just add it to the temporary gradient vector in shared memory. - atomicAdd(&smemGrad[jstart], wtgradjx * Exc * quadwt); - atomicAdd(&smemGrad[jstart + 1], wtgradjy * Exc * quadwt); - atomicAdd(&smemGrad[jstart + 2], wtgradjz * Exc * quadwt); + GPUATOMICADD(&smemGrad[jstart], wtgradjx * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[jstart + 1], wtgradjy * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[jstart + 2], wtgradjz * Exc * quadwt, GRADSCALE); } } @@ -1149,9 +1187,9 @@ __device__ void sswder(QUICKDouble gridx, QUICKDouble gridy, QUICKDouble gridz, #endif // update the temporary gradient vector - atomicAdd(&smemGrad[istart], wtgradix * Exc * quadwt); - atomicAdd(&smemGrad[istart + 1], wtgradiy * Exc * quadwt); - atomicAdd(&smemGrad[istart + 2], wtgradiz * Exc * quadwt); + GPUATOMICADD(&smemGrad[istart], wtgradix * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[istart + 1], wtgradiy * Exc * quadwt, GRADSCALE); + GPUATOMICADD(&smemGrad[istart + 2], wtgradiz * Exc * quadwt, GRADSCALE); } diff --git a/src/gpu/hip/gpu_getxc.h b/src/gpu/hip/gpu_getxc.h index 7efc7615..ebc93b4b 100644 --- a/src/gpu/hip/gpu_getxc.h +++ b/src/gpu/hip/gpu_getxc.h @@ -291,9 +291,15 @@ __global__ void cshell_getxc_kernel() } #endif +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&devSim_dft.DFT_calculated[0].Eelxc, _tmp, OSCALE); + GPUATOMICADD(&devSim_dft.DFT_calculated[0].aelec, weight * density, OSCALE); + GPUATOMICADD(&devSim_dft.DFT_calculated[0].belec, weight * densityb, OSCALE); +#else atomicAdd(&devSim_dft.DFT_calculated[0].Eelxc, _tmp); - atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight*density); - atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight*densityb); + atomicAdd(&devSim_dft.DFT_calculated[0].aelec, weight * density); + atomicAdd(&devSim_dft.DFT_calculated[0].belec, weight * densityb); +#endif for (int i = bfloc_st; i< bfloc_end; ++i) { int ibas = devSim_dft.basf[i]; @@ -310,13 +316,21 @@ __global__ void cshell_getxc_kernel() QUICKDouble _tmp = (phi * phi2 * dfdr + xdot * (phi*dphidx2 + phi2*dphidx) \ + ydot * (phi*dphidy2 + phi2*dphidy) + zdot * (phi*dphidz2 + phi2*dphidz))*weight; +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_dft.oULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp, OSCALE); +#else atomicAdd(&LOC2(devSim_dft.o, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmp); +#endif #ifdef OSHELL QUICKDouble _tmpb = (phi * phi2 * dfdrb + xdotb * (phi*dphidx2 + phi2*dphidx) + ydotb * (phi*dphidy2 + phi2*dphidy) + zdotb * (phi*dphidz2 + phi2*dphidz))*weight; +#if defined(USE_LEGACY_ATOMICS) + GPUATOMICADD(&LOC2(devSim_dft.obULL, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb, OSCALE); +#else atomicAdd(&LOC2(devSim_dft.ob, jbas, ibas, devSim_dft.nbasis, devSim_dft.nbasis), _tmpb); +#endif #endif } } @@ -332,13 +346,23 @@ __global__ void oshell_getxcgrad_kernel() __global__ void cshell_getxcgrad_kernel() #endif { +#if defined(USE_LEGACY_ATOMICS) + //declare smem grad vector + extern __shared__ QUICKULL smem_buffer[]; + QUICKULL* smemGrad = (QUICKULL*) smem_buffer; + + // initialize smem grad + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0ull; +#else //declare smem grad vector extern __shared__ QUICKDouble smem_buffer[]; - QUICKDouble* smemGrad=(QUICKDouble*)smem_buffer; + QUICKDouble* smemGrad = (QUICKDouble*) smem_buffer; // initialize smem grad - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - smemGrad[i]=0.0; + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) + smemGrad[i] = 0.0; +#endif __syncthreads(); @@ -568,9 +592,9 @@ __global__ void cshell_getxcgrad_kernel() } #endif - atomicAdd(&smemGrad[Istart], Gradx); - atomicAdd(&smemGrad[Istart+1], Grady); - atomicAdd(&smemGrad[Istart+2], Gradz); + GPUATOMICADD(&smemGrad[Istart], Gradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], Grady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], Gradz, GRADSCALE); sumGradx += Gradx; sumGrady += Grady; sumGradz += Gradz; @@ -580,9 +604,9 @@ __global__ void cshell_getxcgrad_kernel() int Istart = (devSim_dft.gatm[gid]-1) * 3; - atomicAdd(&smemGrad[Istart], -sumGradx); - atomicAdd(&smemGrad[Istart+1], -sumGrady); - atomicAdd(&smemGrad[Istart+2], -sumGradz); + GPUATOMICADD(&smemGrad[Istart], -sumGradx, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 1], -sumGrady, GRADSCALE); + GPUATOMICADD(&smemGrad[Istart + 2], -sumGradz, GRADSCALE); } //Set weights for sswder calculation if(density < devSim_dft.DMCutoff){ @@ -598,8 +622,12 @@ __global__ void cshell_getxcgrad_kernel() __syncthreads(); // update gmem grad vector - for(int i = threadIdx.x; i< devSim_dft.natom * 3; i+=blockDim.x) - atomicAdd(&devSim_dft.grad[i],smemGrad[i]); + for (int i = threadIdx.x; i < devSim_dft.natom * 3; i += blockDim.x) +#if defined(USE_LEGACY_ATOMICS) + atomicAdd(&devSim_dft.gradULL[i], smemGrad[i]); +#else + atomicAdd(&devSim_dft.grad[i], smemGrad[i]); +#endif __syncthreads(); } diff --git a/src/gpu/hip/gpu_type.h b/src/gpu/hip/gpu_type.h index 2bdd8484..3f8c10f2 100644 --- a/src/gpu/hip/gpu_type.h +++ b/src/gpu/hip/gpu_type.h @@ -34,6 +34,10 @@ struct gpu_calculated_type { gpu_buffer_type* ob; // beta O matrix gpu_buffer_type* dense; // Density Matrix gpu_buffer_type* denseb; // Beta Density Matrix +#if defined(USE_LEGACY_ATOMICS) + gpu_buffer_type* oULL; // Unsigned long long int type O matrix + gpu_buffer_type* obULL; // Unsigned long long int type Ob matrix +#endif gpu_buffer_type* distance; // distance matrix }; @@ -81,9 +85,15 @@ struct gpu_cutoff_type { }; struct DFT_calculated_type { +#if defined(USE_LEGACY_ATOMICS) + QUICKULL Eelxc; // exchange correction energy + QUICKULL aelec; // alpha electron + QUICKULL belec; // beta electron +#else QUICKDouble Eelxc; // exchange correction energy QUICKDouble aelec; // alpha electron QUICKDouble belec; // beta electron +#endif }; /*Madu Manathunga 11/21/2019*/