Skip to content

Commit

Permalink
addd gradULL pointer into the shared memory
Browse files Browse the repository at this point in the history
  • Loading branch information
Madu86 committed Mar 14, 2024
1 parent 77cbb35 commit 9463c97
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
14 changes: 13 additions & 1 deletion src/cuda/gpu_get2e_grad_ffff.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ texture <int2, cudaTextureType1D, cudaReadModeElementType> tex_Xcoeff;
#define ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE 2
#define ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE 1

#define ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE 1

#define DEV_SIM_INT_PTR_KATOM smem_int_ptr[ERI_GRAD_FFFF_TPB*0+threadIdx.x]
#define DEV_SIM_INT_PTR_KPRIM smem_int_ptr[ERI_GRAD_FFFF_TPB*1+threadIdx.x]
#define DEV_SIM_INT_PTR_KSTART smem_int_ptr[ERI_GRAD_FFFF_TPB*2+threadIdx.x]
Expand Down Expand Up @@ -114,6 +116,8 @@ texture <int2, cudaTextureType1D, cudaReadModeElementType> tex_Xcoeff;
#define DEV_SIM_INT_PRIM_TOTAL smem_int[ERI_GRAD_FFFF_TPB*5+threadIdx.x]
#define DEV_SIM_INT_FFSTART smem_int[ERI_GRAD_FFFF_TPB*6+threadIdx.x]

#define DEV_SIM_ULL_PTR_GRAD smem_ull_ptr[ERI_GRAD_FFFF_TPB*0+threadIdx.x]

#define LOCTRANS(A,i1,i2,i3,d1,d2,d3) A[(i3+((i2)+(i1)*(d2))*(d3))*ERI_GRAD_FFFF_TPB+threadIdx.x]
#define DEV_SIM_CHAR_TRANS smem_char

Expand Down Expand Up @@ -372,6 +376,7 @@ void getGrad_ffff(_gpu_type gpu)
QUICKDouble **dbl_ptr_buffer = (QUICKDouble**) malloc(ERI_GRAD_FFFF_SMEM_DBL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKDouble*));
int2 **int2_ptr_buffer = (int2**) malloc(ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*));
unsigned char **char_ptr_buffer = (unsigned char**) malloc(ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*));
QUICKULL **ull_ptr_buffer = (QUICKULL**) malloc(ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKULL*));
unsigned char trans[TRANSDIM*TRANSDIM*TRANSDIM];

//printf("Storing data \n");
Expand Down Expand Up @@ -421,6 +426,7 @@ void getGrad_ffff(_gpu_type gpu)
int2_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.sorted_YCutoffIJ;
char_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.mpi_bcompute;
char_ptr_buffer[ERI_GRAD_FFFF_TPB*1+i] = gpu->gpu_sim.KLMN;
ull_ptr_buffer[ERI_GRAD_FFFF_TPB*0+i] = gpu->gpu_sim.gradULL;
}


Expand Down Expand Up @@ -562,6 +568,7 @@ char));
int2 **dev_int2_ptr_buffer;
unsigned char **dev_char_ptr_buffer;
unsigned char *dev_char_buffer;
QUICKULL **dev_ull_ptr_buffer;

cudaMalloc((void **)&dev_int_buffer, ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int));
//printf("Allocating int ptr device memory %d %d %d %d %d %d\n", sizeof(int), sizeof(int*), sizeof(QUICKDouble), sizeof(QUICKDouble*),
Expand All @@ -573,6 +580,7 @@ char));
cudaMalloc((void **)&dev_int2_ptr_buffer, ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(int2*));
cudaMalloc((void **)&dev_char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned char*));
cudaMalloc((void **)&dev_char_buffer, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char));
cudaMalloc((void **)&dev_ull_ptr_buffer, ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKULL*));

//printf("Uploading data \n");

Expand All @@ -584,6 +592,8 @@ char));
cudaMemcpy(dev_char_ptr_buffer, char_ptr_buffer, ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(unsigned
char*), cudaMemcpyHostToDevice);
cudaMemcpy(dev_char_buffer, &trans, ERI_GRAD_FFFF_SMEM_CHAR_SIZE*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_ull_ptr_buffer, ull_ptr_buffer, ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB*sizeof(QUICKULL*),
cudaMemcpyHostToDevice);

/*
int_buffer -> Upload();
Expand Down Expand Up @@ -614,7 +624,7 @@ sizeof(int)*ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB+
sizeof(int2*)*ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned
char*)*ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB+sizeof(unsigned char)*ERI_GRAD_FFFF_SMEM_CHAR_SIZE>>>(dev_int_buffer,
dev_int_ptr_buffer, dev_dbl_buffer, dev_dbl_ptr_buffer, dev_int2_ptr_buffer, dev_char_ptr_buffer, dev_char_buffer,
gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell)))
dev_ull_ptr_buffer,gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell)))

#endif
}
Expand All @@ -631,6 +641,7 @@ gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell)))
free(dbl_ptr_buffer);
free(int2_ptr_buffer);
free(char_ptr_buffer);
free(ull_ptr_buffer);
// free(trans_buffer);

cudaFree(dev_int_buffer);
Expand All @@ -640,6 +651,7 @@ gpu->gpu_sim.ffStart, gpu->gpu_sim.sqrQshell)))
cudaFree(dev_int2_ptr_buffer);
cudaFree(dev_char_ptr_buffer);
cudaFree(dev_char_buffer);
cudaFree(dev_ull_ptr_buffer);
/*
SAFE_DELETE(int_buffer);
SAFE_DELETE(int_ptr_buffer);
Expand Down
17 changes: 10 additions & 7 deletions src/cuda/gpu_get2e_grad_ffff.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ __device__ __inline__ void iclass_grad_ffff
unsigned int LL, const QUICKDouble DNMax, \
QUICKDouble* const YVerticalTemp, QUICKDouble* const store, QUICKDouble* const store2, QUICKDouble* const storeAA, QUICKDouble*
const storeBB, QUICKDouble* const storeCC, int* const smem_int, QUICKDouble* const smem_dbl, int** const smem_int_ptr, QUICKDouble**
const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const smem_char){
const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const smem_char, QUICKULL** const smem_ull_ptr){
/*
kAtom A, B, C ,D is the coresponding atom for shell ii, jj, kk, ll
and be careful with the index difference between Fortran and C++,
Expand Down Expand Up @@ -1699,7 +1699,7 @@ if(bprint){
#endif

#ifdef USE_LEGACY_ATOMICS
/*

GRADADD(DEV_SIM_DBL_PTR_GRADULL[AStart], AGradx);
GRADADD(DEV_SIM_DBL_PTR_GRADULL[AStart + 1], AGrady);
GRADADD(DEV_SIM_DBL_PTR_GRADULL[AStart + 2], AGradz);
Expand All @@ -1718,7 +1718,7 @@ if(bprint){
GRADADD(DEV_SIM_DBL_PTR_GRADULL[DStart], (-AGradx-BGradx-CGradx));
GRADADD(DEV_SIM_DBL_PTR_GRADULL[DStart + 1], (-AGrady-BGrady-CGrady));
GRADADD(DEV_SIM_DBL_PTR_GRADULL[DStart + 2], (-AGradz-BGradz-CGradz));
*/

#else
atomicAdd(&DEV_SIM_DBL_PTR_GRAD[AStart], AGradx);
atomicAdd(&DEV_SIM_DBL_PTR_GRAD[AStart + 1], AGrady);
Expand Down Expand Up @@ -1755,7 +1755,7 @@ __launch_bounds__(ERI_GRAD_FFFF_TPB, ERI_GRAD_FFFF_BPSM) getGrad_oshell_kernel_f
__global__ void
__launch_bounds__(ERI_GRAD_FFFF_TPB, ERI_GRAD_FFFF_BPSM) getGrad_kernel_ffff(int *dev_int_data,
int **dev_int_ptr_data, QUICKDouble *dev_dbl_data, QUICKDouble **dev_dbl_ptr_data, int2
**dev_int2_ptr_data, unsigned char **dev_char_ptr_data, unsigned char *dev_char_data, const int ffStart, const int sqrQshell)
**dev_int2_ptr_data, unsigned char **dev_char_ptr_data, unsigned char *dev_char_data, QUICKULL **dev_ull_ptr_data, const int ffStart, const int sqrQshell)
#endif
#endif
{
Expand All @@ -1769,7 +1769,7 @@ int **dev_int_ptr_data, QUICKDouble *dev_dbl_data, QUICKDouble **dev_dbl_ptr_dat
unsigned char **smem_char_ptr = (unsigned char**) &smem_int2_ptr[ERI_GRAD_FFFF_SMEM_INT2_PTR_SIZE*ERI_GRAD_FFFF_TPB];
int *smem_int = (int*) &smem_char_ptr[ERI_GRAD_FFFF_SMEM_CHAR_PTR_SIZE*ERI_GRAD_FFFF_TPB];
unsigned char *smem_char=(unsigned char*) &smem_int[ERI_GRAD_FFFF_SMEM_INT_SIZE*ERI_GRAD_FFFF_TPB];

QUICKULL **smem_ull_ptr = (QUICKULL**) &smem_char[ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE*ERI_GRAD_FFFF_TPB];

for(int i = threadIdx.x; i<ERI_GRAD_FFFF_TPB*ERI_GRAD_FFFF_SMEM_DBL_SIZE ; i+=blockDim.x)
smem_dbl[i]=dev_dbl_data[i];
Expand All @@ -1792,6 +1792,9 @@ int **dev_int_ptr_data, QUICKDouble *dev_dbl_data, QUICKDouble **dev_dbl_ptr_dat
for(int i = threadIdx.x; i<ERI_GRAD_FFFF_SMEM_CHAR_SIZE ; i+=blockDim.x)
smem_char[i]=dev_char_data[i];

for(int i = threadIdx.x; i<ERI_GRAD_FFFF_TPB*ERI_GRAD_FFFF_SMEM_ULL_PTR_SIZE ; i+=blockDim.x)
smem_ull_ptr[i]=dev_ull_ptr_data[i];

__syncthreads();


Expand Down Expand Up @@ -1864,7 +1867,7 @@ DNMax) > DEV_SIM_DBL_GRADCUTOFF) {
if( iii == 3 && jjj == 3 && kkk ==3 && lll ==3){
iclass_oshell_grad_ffff(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax, DEV_SIM_DBL_PTR_YVERTICALTEMP+offset,
DEV_SIM_DBL_PTR_STORE+offset, DEV_SIM_DBL_PTR_STORE2+offset, DEV_SIM_DBL_PTR_STOREAA+offset, DEV_SIM_DBL_PTR_STOREBB+offset,
DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr, smem_char_ptr, smem_char);
DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr, smem_char_ptr, smem_char, smem_ull_ptr);
}
#endif
#else
Expand All @@ -1873,7 +1876,7 @@ DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr,
if( iii == 3 && jjj == 3 && kkk ==3 && lll ==3){
iclass_grad_ffff(iii, jjj, kkk, lll, ii, jj, kk, ll, DNMax, DEV_SIM_DBL_PTR_YVERTICALTEMP+offset,
DEV_SIM_DBL_PTR_STORE+offset, DEV_SIM_DBL_PTR_STORE2+offset, DEV_SIM_DBL_PTR_STOREAA+offset, DEV_SIM_DBL_PTR_STOREBB+offset,
DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr,smem_char_ptr, smem_char);
DEV_SIM_DBL_PTR_STORECC+offset, smem_int, smem_dbl, smem_int_ptr, smem_dbl_ptr,smem_char_ptr, smem_char, smem_ull_ptr);
}
#endif
#endif
Expand Down

0 comments on commit 9463c97

Please sign in to comment.