From 9463c97c4b524d5ec1ca3a75737b6325b9bbe6eb Mon Sep 17 00:00:00 2001 From: Madu86 Date: Wed, 13 Mar 2024 21:01:55 -0400 Subject: [PATCH] addd gradULL pointer into the shared memory --- src/cuda/gpu_get2e_grad_ffff.cu | 14 +++++++++++++- src/cuda/gpu_get2e_grad_ffff.cuh | 17 ++++++++++------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/cuda/gpu_get2e_grad_ffff.cu b/src/cuda/gpu_get2e_grad_ffff.cu index bb7e4b24..b00b8e71 100644 --- a/src/cuda/gpu_get2e_grad_ffff.cu +++ b/src/cuda/gpu_get2e_grad_ffff.cu @@ -70,6 +70,8 @@ texture tex_Xcoeff; #define ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE 2 #define ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE 1 +#define ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE 1 + #define DEV_SIM_INT_PTR_KATOM smem_int_ptr[ERI_GRAD_FFFF_TPB*0+threadIdx.x] #define DEV_SIM_INT_PTR_KPRIM smem_int_ptr[ERI_GRAD_FFFF_TPB*1+threadIdx.x] #define DEV_SIM_INT_PTR_KSTART smem_int_ptr[ERI_GRAD_FFFF_TPB*2+threadIdx.x] @@ -114,6 +116,8 @@ texture tex_Xcoeff; #define DEV_SIM_INT_PRIM_TOTAL smem_int[ERI_GRAD_FFFF_TPB*5+threadIdx.x] #define DEV_SIM_INT_FFSTART smem_int[ERI_GRAD_FFFF_TPB*6+threadIdx.x] +#define DEV_SIM_ULL_PTR_GRAD smem_ull_ptr[ERI_GRAD_FFFF_TPB*0+threadIdx.x] + #define LOCTRANS(A,i1,i2,i3,d1,d2,d3) A[(i3+((i2)+(i1)*(d2))*(d3))*ERI_GRAD_FFFF_TPB+threadIdx.x] #define DEV_SIM_CHAR_TRANS smem_char @@ -372,6 +376,7 @@ void getGrad_ffff(_gpu_type gpu) QUICKDouble **dbl_ptr_buffer = (QUICKDouble**) malloc(ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*)); int2 **int2_ptr_buffer = (int2**) malloc(ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); unsigned char **char_ptr_buffer = (unsigned char**) malloc(ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); + QUICKULL **ull_ptr_buffer = (QUICKULL**) malloc(ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKULL*)); unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM]; //printf("Storing data \n"); @@ -421,6 +426,7 @@ void getGrad_ffff(_gpu_type gpu) int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ; char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute; char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN; + ull_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.gradULL; } @@ -562,6 +568,7 @@ char)); int2 **dev_int2_ptr_buffer; unsigned char **dev_char_ptr_buffer; unsigned char *dev_char_buffer; + QUICKULL **dev_ull_ptr_buffer; cudaMalloc((void **)&dev_int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int)); //printf("Allocating int ptr device memory %d %d %d %d %d %d\n", sizeof(int), sizeof(int*), sizeof(QUICKDouble), sizeof(QUICKDouble*), @@ -573,6 +580,7 @@ char)); cudaMalloc((void **)&dev_int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*)); cudaMalloc((void **)&dev_char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*)); cudaMalloc((void **)&dev_char_buffer, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char)); + cudaMalloc((void **)&dev_ull_ptr_buffer, ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKULL*)); //printf("Uploading data \n"); @@ -584,6 +592,8 @@ char)); cudaMemcpy(dev_char_ptr_buffer, char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*), cudaMemcpyHostToDevice); cudaMemcpy(dev_char_buffer, &trans, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char), cudaMemcpyHostToDevice); + cudaMemcpy(dev_ull_ptr_buffer, ull_ptr_buffer, ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKULL*), +cudaMemcpyHostToDevice); /* int_buffer -> Upload(); @@ -614,7 +624,7 @@ sizeof(int)*ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB+ sizeof(int2*)*ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char*)*ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char)*ERI_GRAD_FFFF_SMEM_CHAR_SIZE>>>(dev_int_buffer, dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer, dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer, -gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell))) +dev_ull_ptr_buffer,gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell))) #endif } @@ -631,6 +641,7 @@ gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell))) free(dbl_ptr_buffer); free(int2_ptr_buffer); free(char_ptr_buffer); + free(ull_ptr_buffer); // free(trans_buffer); cudaFree(dev_int_buffer); @@ -640,6 +651,7 @@ gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell))) cudaFree(dev_int2_ptr_buffer); cudaFree(dev_char_ptr_buffer); cudaFree(dev_char_buffer); + cudaFree(dev_ull_ptr_buffer); /* SAFE_DELETE(int_buffer); SAFE_DELETE(int_ptr_buffer); diff --git a/src/cuda/gpu_get2e_grad_ffff.cuh b/src/cuda/gpu_get2e_grad_ffff.cuh index cf302781..5ff41923 100644 --- a/src/cuda/gpu_get2e_grad_ffff.cuh +++ b/src/cuda/gpu_get2e_grad_ffff.cuh @@ -1046,7 +1046,7 @@ __device__ __inline__ void iclass_grad_ffff unsigned int LL, const QUICKDouble DNMax, \ QUICKDouble* const YVerticalTemp, QUICKDouble* const store, QUICKDouble* const store2, QUICKDouble* const storeAA, QUICKDouble* const storeBB, QUICKDouble* const storeCC, int* const smem_int, QUICKDouble* const smem_dbl, int** const smem_int_ptr, QUICKDouble** -const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const smem_char){ +const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const smem_char, QUICKULL** const smem_ull_ptr){ /* kAtom A, B, C ,D is the coresponding atom for shell ii, jj, kk, ll and be careful with the index difference between Fortran and C++, @@ -1699,7 +1699,7 @@ if(bprint){ #endif #ifdef USE_LEGACY_ATOMICS -/* + GRADADD(DEV_SIM_DBL_PTR_GRADULL[AStart], AGradx); GRADADD(DEV_SIM_DBL_PTR_GRADULL[AStart + 1], AGrady); GRADADD(DEV_SIM_DBL_PTR_GRADULL[AStart + 2], AGradz); @@ -1718,7 +1718,7 @@ if(bprint){ GRADADD(DEV_SIM_DBL_PTR_GRADULL[DStart], (-AGradx-BGradx-CGradx)); GRADADD(DEV_SIM_DBL_PTR_GRADULL[DStart + 1], (-AGrady-BGrady-CGrady)); GRADADD(DEV_SIM_DBL_PTR_GRADULL[DStart + 2], (-AGradz-BGradz-CGradz)); -*/ + #else atomicAdd(&DEV_SIM_DBL_PTR_GRAD[AStart], AGradx); atomicAdd(&DEV_SIM_DBL_PTR_GRAD[AStart + 1], AGrady); @@ -1755,7 +1755,7 @@ __launch_bounds__(ERI_GRAD_FFFF_TPB, ERI_GRAD_FFFF_BPSM) getGrad_oshell_kernel_f __global__ void __launch_bounds__(ERI_GRAD_FFFF_TPB, ERI_GRAD_FFFF_BPSM) getGrad_kernel_ffff(int *dev_int_data, int **dev_int_ptr_data, QUICKDouble *dev_dbl_data, QUICKDouble **dev_dbl_ptr_data, int2 -**dev_int2_ptr_data, unsigned char **dev_char_ptr_data, unsigned char *dev_char_data, const int ffStart, const int sqrQshell) +**dev_int2_ptr_data, unsigned char **dev_char_ptr_data, unsigned char *dev_char_data, QUICKULL **dev_ull_ptr_data, const int ffStart, const int sqrQshell) #endif #endif { @@ -1769,7 +1769,7 @@ int **dev_int_ptr_data, QUICKDouble *dev_dbl_data, QUICKDouble **dev_dbl_ptr_dat unsigned char **smem_char_ptr = (unsigned char**) &smem_int2_ptr[ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB]; int *smem_int = (int*) &smem_char_ptr[ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB]; unsigned char *smem_char=(unsigned char*) &smem_int[ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB]; - + QUICKULL **smem_ull_ptr = (QUICKULL**) &smem_char[ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB]; for(int i = threadIdx.x; i DEV_SIM_DBL_GRADCUTOFF) { if( iii == 3 && jjj == 3 && kkk ==3 && lll ==3){ iclass_oshell_grad_ffff(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax, DEV_SIM_DBL_PTR_YVERTICALTEMP+offset, DEV_SIM_DBL_PTR_STORE+offset, DEV_SIM_DBL_PTR_STORE2+offset, DEV_SIM_DBL_PTR_STOREAA+offset, DEV_SIM_DBL_PTR_STOREBB+offset, -DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr, smem_char_ptr, smem_char); +DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr, smem_char_ptr, smem_char, smem_ull_ptr); } #endif #else @@ -1873,7 +1876,7 @@ DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr, if( iii == 3 && jjj == 3 && kkk ==3 && lll ==3){ iclass_grad_ffff(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax, DEV_SIM_DBL_PTR_YVERTICALTEMP+offset, DEV_SIM_DBL_PTR_STORE+offset, DEV_SIM_DBL_PTR_STORE2+offset, DEV_SIM_DBL_PTR_STOREAA+offset, DEV_SIM_DBL_PTR_STOREBB+offset, -DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr,smem_char_ptr, smem_char); +DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr,smem_char_ptr, smem_char, smem_ull_ptr); } #endif #endif