From da4f8df0f8a59eb2af0ba9d78e4c88cb6a5a2ad6 Mon Sep 17 00:00:00 2001 From: AbdAmmar Date: Sat, 30 Nov 2024 00:38:32 +0100 Subject: [PATCH] optim A x D x A.T and A x Dinv x A.T on GPU --- src/cuda/include/my_linalg.h | 6 ++++ src/cuda/src/a_d_in_b.cu | 57 +++++++++++++++++++++++++++++ src/cuda/src/a_dinv_in_b.cu | 57 +++++++++++++++++++++++++++++ src/cuda/src/a_minus_twob_in_b.cu | 52 +++++++++++++++++++++++++++ src/cuda/src/a_plus_b_in_a.cu | 52 +++++++++++++++++++++++++++ src/cuda/src/ph_drpa_a_sing.cu | 34 ++++++++++-------- src/cuda/src/ph_drpa_amb_sing.cu | 46 ++++++++++++++---------- src/cuda/src/ph_drpa_apb_sing.cu | 33 +++++++++++------ src/cuda/src/ph_drpa_b_sing.cu | 33 +++++++++-------- src/cuda/src/ph_drpa_sing.c | 59 +++++++++++++++++++++++++------ 10 files changed, 360 insertions(+), 69 deletions(-) create mode 100644 src/cuda/src/a_d_in_b.cu create mode 100644 src/cuda/src/a_dinv_in_b.cu create mode 100644 src/cuda/src/a_minus_twob_in_b.cu create mode 100644 src/cuda/src/a_plus_b_in_a.cu diff --git a/src/cuda/include/my_linalg.h b/src/cuda/include/my_linalg.h index 457ab99b..6ad5bf5a 100644 --- a/src/cuda/include/my_linalg.h +++ b/src/cuda/include/my_linalg.h @@ -2,12 +2,18 @@ #define MY_LINALG +extern void A_plus_B_in_A(int n, double *A, double *B); +extern void A_minus_twoB_in_B(int n, double *A, double *B); + extern void A_D_At(int n, double *A, double *D, double *R); extern void A_Dinv_At(int n, double *A, double *D, double *R); extern void A_D_inplace(int n, double *A, double *D); extern void A_Dinv_inplace(int n, double *A, double *D); +extern void A_D_in_B(int n, double *A, double *D, double *B); +extern void A_Dinv_in_B(int n, double *A, double *D, double *B); + extern void elementwise_dsqrt(int nS, double *A, double *A_Sq); extern void elementwise_dsqrt_inplace(int nS, double *A); diff --git a/src/cuda/src/a_d_in_b.cu b/src/cuda/src/a_d_in_b.cu new file mode 100644 index 00000000..10a7e897 --- /dev/null +++ b/src/cuda/src/a_d_in_b.cu @@ -0,0 +1,57 @@ +#include + + +__global__ void A_D_in_B_kernel(int n, double *A, double *D, double *B) { + + + int i, j; + int in, ji; + + double tmp; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + tmp = D[i]; + + while(j < n) { + + ji = in + j; + + B[ji] = A[ji] * tmp; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_D_in_B(int n, double *A, double *D, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_D_in_B_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_D_in_B_kernel<<>>(n, A, D, B); + +} + + + diff --git a/src/cuda/src/a_dinv_in_b.cu b/src/cuda/src/a_dinv_in_b.cu new file mode 100644 index 00000000..c73ecd2b --- /dev/null +++ b/src/cuda/src/a_dinv_in_b.cu @@ -0,0 +1,57 @@ +#include + + +__global__ void A_Dinv_in_B_kernel(int n, double *A, double *D, double *B) { + + + int i, j; + int in, ji; + + double tmp; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + tmp = 1.0 / D[i]; + + while(j < n) { + + ji = in + j; + + B[ji] = A[ji] * tmp; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + + +extern "C" void A_Dinv_in_B(int n, double *A, double *D, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_Dinv_in_B_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_Dinv_in_B_kernel<<>>(n, A, D, B); + +} + + + diff --git a/src/cuda/src/a_minus_twob_in_b.cu b/src/cuda/src/a_minus_twob_in_b.cu new file mode 100644 index 00000000..0acdb70e --- /dev/null +++ b/src/cuda/src/a_minus_twob_in_b.cu @@ -0,0 +1,52 @@ +#include + + +__global__ void A_minus_twoB_in_B_kernel(int n, double *A, double *B) { + + + int i, j; + int in, ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ji = in + j; + + B[ji] = A[ji] - 2.0 * B[ji]; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + +extern "C" void A_minus_twoB_in_B(int n, double *A, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_minus_twoB_in_B_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_minus_twoB_in_B_kernel<<>>(n, A, B); + +} + + + diff --git a/src/cuda/src/a_plus_b_in_a.cu b/src/cuda/src/a_plus_b_in_a.cu new file mode 100644 index 00000000..d1868e37 --- /dev/null +++ b/src/cuda/src/a_plus_b_in_a.cu @@ -0,0 +1,52 @@ +#include + + +__global__ void A_plus_B_in_A_kernel(int n, double *A, double *B) { + + + int i, j; + int in, ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + j = blockIdx.y * blockDim.y + threadIdx.y; + + while(i < n) { + + in = i * n; + + while(j < n) { + + ji = in + j; + + A[ji] = A[ji] + B[ji]; + + j += blockDim.y * gridDim.y; + } // j + + i += blockDim.x * gridDim.x; + } // i + +} + + + + +extern "C" void A_plus_B_in_A(int n, double *A, double *B) { + + + int sBlocks = 32; + int nBlocks = (n + sBlocks - 1) / sBlocks; + + dim3 dimGrid(nBlocks, nBlocks, 1); + dim3 dimBlock(sBlocks, sBlocks, 1); + + printf("lunching A_plus_B_in_A_kernel with %dx%d blocks and %dx%d threads/block\n", + nBlocks, nBlocks, sBlocks, sBlocks); + + + A_plus_B_in_A_kernel<<>>(n, A, B); + +} + + + diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu index be5e7af6..5711d56b 100644 --- a/src/cuda/src/ph_drpa_a_sing.cu +++ b/src/cuda/src/ph_drpa_a_sing.cu @@ -5,17 +5,18 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * int i, j, a, b; int aa, bb; - int nVS; - int nBas2, nBas3; - int i_A0, i_A1, i_A2; - int i_I0, i_I1, i_I2; + + long long nVS; + long long nBas2, nBas3; + long long i_A0, i_A1, i_A2, i_A3; + long long i_I0, i_I1, i_I2, i_I3; bool a_eq_b; - nVS = nV * nS; + nVS = (long long) nV * (long long) nS; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + nBas2 = (long long) nBas * (long long) nBas; + nBas3 = nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -23,29 +24,32 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double * while(aa < nV) { a = aa + nO; - i_A0 = aa * nS; - i_I0 = a * nBas2; + i_A0 = (long long) aa * (long long) nS; + i_I0 = (long long) a * nBas2; while(bb < nV) { b = bb + nO; a_eq_b = a == b; - i_A1 = i_A0 + bb; - i_I1 = i_I0 + b * nBas; + i_A1 = i_A0 + (long long) bb; + i_I1 = i_I0 + (long long) b * (long long) nBas; i = 0; while(i < nO) { - i_A2 = i_A1 + i * nVS; - i_I2 = i_I1 + i; + i_A2 = i_A1 + (long long) i * nVS; + i_I2 = i_I1 + (long long) i; j = 0; while(j < nO) { - A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3]; + i_A3 = i_A2 + (long long) j * (long long) nV; + i_I3 = i_I2 + (long long) j * nBas3; + + A[i_A3] = 2.0 * ERI[i_I3]; if(a_eq_b && (i==j)) { - A[i_A2 + j * nV] += eps[a] - eps[i]; + A[i_A3] += eps[a] - eps[i]; } j ++; diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu index 0f99bc33..d09642b9 100644 --- a/src/cuda/src/ph_drpa_amb_sing.cu +++ b/src/cuda/src/ph_drpa_amb_sing.cu @@ -1,22 +1,26 @@ #include -__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) { + +__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, + double *eps, double *ERI, double *AmB) { int i, j, a, b; int aa, bb; - int nVS; - int nBas2, nBas3; - int i_A0, i_A1, i_A2; - int i_I0, i_I1, i_I2; - int i_J1, i_J2; + + long long i_A0, i_A1, i_A2, i_A3; + long long i_I0, i_I1, i_I2, i_I3; + long long i_J1, i_J2, i_J3; + + long long nVS; + long long nBas2, nBas3; bool a_eq_b; - nVS = nV * nS; + nVS = (long long) nV * (long long) nS; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + nBas2 = (long long) nBas * (long long) nBas; + nBas3 = nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -24,31 +28,35 @@ __global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double while(aa < nV) { a = aa + nO; - i_A0 = aa * nS; - i_I0 = a * nBas2; + i_A0 = (long long) aa * (long long) nS; + i_I0 = (long long) a * nBas2; while(bb < nV) { b = bb + nO; a_eq_b = a == b; - i_A1 = i_A0 + bb; - i_I1 = i_I0 + b * nBas; - i_J1 = i_I0 + b * nBas3; + i_A1 = i_A0 + (long long) bb; + i_I1 = i_I0 + (long long) b * (long long) nBas; + i_J1 = i_I0 + (long long) b * nBas3; i = 0; while(i < nO) { - i_A2 = i_A1 + i * nVS; - i_I2 = i_I1 + i; - i_J2 = i_J1 + i; + i_A2 = i_A1 + (long long) i * nVS; + i_I2 = i_I1 + (long long) i; + i_J2 = i_J1 + (long long) i; j = 0; while(j < nO) { - AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]); + i_A3 = i_A2 + (long long) j * nV; + i_I3 = i_I2 + (long long) j * nBas3; + i_J3 = i_J2 + (long long) j * (long long) nBas; + + AmB[i_A3] = 2.0 * (ERI[i_I3] - ERI[i_J3]); if(a_eq_b && (i==j)) { - AmB[i_A2 + j * nV] += eps[a] - eps[i]; + AmB[i_A3] += eps[a] - eps[i]; } j ++; diff --git a/src/cuda/src/ph_drpa_apb_sing.cu b/src/cuda/src/ph_drpa_apb_sing.cu index 0d25bd02..d317fb40 100644 --- a/src/cuda/src/ph_drpa_apb_sing.cu +++ b/src/cuda/src/ph_drpa_apb_sing.cu @@ -1,22 +1,29 @@ #include -__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *ApB) { +__global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, + double *eps, double *ERI, double *ApB) { - int i, j, a, b; - int aa, bb; - int nVS; - int nBas2, nBas3; - int i_A0, i_A1, i_A2; + + long i, j, a, b; + long aa, bb; + + int i_A0, i_A1, i_A2, i_A3; int i_I0, i_I1, i_I2; int i_J1, i_J2; + int nVS; + int nBas2; + + long long i_I3, i_J3; + long long nBas3; + bool a_eq_b; nVS = nV * nS; nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + nBas3 = (long long) nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -34,21 +41,25 @@ __global__ void ph_dRPA_ApB_sing_kernel(int nO, int nV, int nBas, int nS, double i_A1 = i_A0 + bb; i_I1 = i_I0 + b * nBas; - i_J1 = i_I0 + b * nBas3; + i_J1 = a + b * nBas; i = 0; while(i < nO) { i_A2 = i_A1 + i * nVS; i_I2 = i_I1 + i; - i_J2 = i_J1 + i; + i_J2 = i_J1 + i * nBas2; j = 0; while(j < nO) { - ApB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] + ERI[i_J2 + j * nBas]); + i_A3 = i_A2 + j * nV; + i_I3 = i_I2 + (long long) j * nBas3; + i_J3 = i_J2 + (long long) j * nBas3; + + ApB[i_A3] = 2.0 * (ERI[i_I3] + ERI[i_J3]); if(a_eq_b && (i==j)) { - ApB[i_A2 + j * nV] += eps[a] - eps[i]; + ApB[i_A3] += eps[a] - eps[i]; } j ++; diff --git a/src/cuda/src/ph_drpa_b_sing.cu b/src/cuda/src/ph_drpa_b_sing.cu index 2a59142e..b8ec06d1 100644 --- a/src/cuda/src/ph_drpa_b_sing.cu +++ b/src/cuda/src/ph_drpa_b_sing.cu @@ -5,15 +5,17 @@ __global__ void ph_dRPA_B_sing_kernel(int nO, int nV, int nBas, int nS, double * int i, j, a, b; int aa, bb; - int nVS; - int nBas2, nBas3; - int i_B0, i_B1, i_B2; - int i_I0, i_I1, i_I2; - nVS = nV * nS; + long long nVS; + long long nBas2, nBas3; + long long i_B0, i_B1, i_B2, i_B3; + long long i_I0, i_I1, i_I2, i_I3; - nBas2 = nBas * nBas; - nBas3 = nBas2 * nBas; + + nVS = (long long) nV * (long long) nS; + + nBas2 = (long long) nBas * (long long) nBas; + nBas3 = nBas2 * (long long) nBas; aa = blockIdx.x * blockDim.x + threadIdx.x; bb = blockIdx.y * blockDim.y + threadIdx.y; @@ -21,25 +23,28 @@ __global__ void ph_dRPA_B_sing_kernel(int nO, int nV, int nBas, int nS, double * while(aa < nV) { a = aa + nO; - i_B0 = aa * nS; - i_I0 = a * nBas2; + i_B0 = (long long) aa * (long long) nS; + i_I0 = (long long) a * nBas2; while(bb < nV) { b = bb + nO; - i_B1 = i_B0 + bb; - i_I1 = i_I0 + b * nBas3; + i_B1 = i_B0 + (long long) bb; + i_I1 = i_I0 + (long long) b * nBas3; i = 0; while(i < nO) { - i_B2 = i_B1 + i * nVS; - i_I2 = i_I1 + i; + i_B2 = i_B1 + (long long) i * nVS; + i_I2 = i_I1 + (long long) i; j = 0; while(j < nO) { - B[i_B2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas]; + i_B3 = i_B2 + (long long) j * (long long) nV; + i_I3 = i_I2 + (long long) j * (long long) nBas; + + B[i_B3] = 2.0 * ERI[i_I3]; j ++; } // j diff --git a/src/cuda/src/ph_drpa_sing.c b/src/cuda/src/ph_drpa_sing.c index 3d720e78..85329e06 100644 --- a/src/cuda/src/ph_drpa_sing.c +++ b/src/cuda/src/ph_drpa_sing.c @@ -65,6 +65,12 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, cudaEventRecord(start, 0); ph_dRPA_ApB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); ph_dRPA_AmB_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_AmB); + //ph_dRPA_A_sing(nO, nV, nBas, nS, d_eps, d_ERI, d_ApB); + //ph_dRPA_B_sing(nO, nV, nBas, nS, d_ERI, d_AmB); + //check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + //A_plus_B_in_A(nS, d_ApB, d_AmB); + //check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + //A_minus_twoB_in_B(nS, d_ApB, d_AmB); check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); @@ -73,6 +79,7 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // free memory + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_eps), "cudaFree", __FILE__, __LINE__); check_Cuda_Errors(cudaFree(d_ERI), "cudaFree", __FILE__, __LINE__); @@ -105,30 +112,62 @@ void ph_drpa_sing(int nO, int nBas, int nS, double *h_eps, double *h_ERI, // d_AmBSq = d_AmB (d_Omega)^{+0.5} (d_AmB)^T - double *d_AmBSq = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); - // d_AmBSqInv = d_AmB (d_Omega)^{-0.5} (d_AmB)^T + double *d_AmBSq = NULL; double *d_AmBSqInv = NULL; - check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), - "cudaMalloc", __FILE__, __LINE__); + double *d_tmp1 = NULL; + double *d_tmp2 = NULL; + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSq, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_AmBSqInv, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_tmp1, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + check_Cuda_Errors(cudaMalloc((void**)&d_tmp2, nS2 * sizeof(double)), "cudaMalloc", __FILE__, __LINE__); + + check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__); cudaEventRecord(start, 0); - A_D_At(nS, d_AmB, d_Omega, d_AmBSq); - A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); + // naive way + //A_D_At(nS, d_AmB, d_Omega, d_AmBSq); + //A_Dinv_At(nS, d_AmB, d_Omega, d_AmBSqInv); + + A_D_in_B(nS, d_AmB, d_Omega, d_tmp1); + A_Dinv_in_B(nS, d_AmB, d_Omega, d_tmp2); + + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_T, + nS, nS, nS, + &alpha, + d_tmp1, nS, + d_AmB, nS, + &beta, + d_AmBSq, nS), + "cublasDgemm", __FILE__, __LINE__); + + check_Cublas_Errors(cublasDgemm(handle, + CUBLAS_OP_N, CUBLAS_OP_T, + nS, nS, nS, + &alpha, + d_tmp2, nS, + d_AmB, nS, + &beta, + d_AmBSqInv, nS), + "cublasDgemm", __FILE__, __LINE__); + check_Cuda_Errors(cudaGetLastError(), "cudaGetLastError", __FILE__, __LINE__); + cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); printf("Time elapsed on d_AmBSq & d_AmBSqInv = %f msec\n", elapsedTime); - + check_Cuda_Errors(cudaDeviceSynchronize(), "cudaDeviceSynchronize", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_tmp1), "cudaFree", __FILE__, __LINE__); + check_Cuda_Errors(cudaFree(d_tmp2), "cudaFree", __FILE__, __LINE__); // Dgemm cudaEventRecord(start, 0); - check_Cublas_Errors(cublasCreate(&handle), "cublasCreate", __FILE__, __LINE__); // X + Y check_Cublas_Errors(cublasDgemm(handle,