Skip to content

Commit

Permalink
optim A x D x A.T and A x Dinv x A.T on GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
AbdAmmar committed Nov 29, 2024
1 parent fcb11d6 commit da4f8df
Show file tree
Hide file tree
Showing 10 changed files with 360 additions and 69 deletions.
6 changes: 6 additions & 0 deletions src/cuda/include/my_linalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@

#define MY_LINALG

extern void A_plus_B_in_A(int n, double *A, double *B);
extern void A_minus_twoB_in_B(int n, double *A, double *B);

extern void A_D_At(int n, double *A, double *D, double *R);
extern void A_Dinv_At(int n, double *A, double *D, double *R);

extern void A_D_inplace(int n, double *A, double *D);
extern void A_Dinv_inplace(int n, double *A, double *D);

extern void A_D_in_B(int n, double *A, double *D, double *B);
extern void A_Dinv_in_B(int n, double *A, double *D, double *B);

extern void elementwise_dsqrt(int nS, double *A, double *A_Sq);
extern void elementwise_dsqrt_inplace(int nS, double *A);

Expand Down
57 changes: 57 additions & 0 deletions src/cuda/src/a_d_in_b.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <stdio.h>


__global__ void A_D_in_B_kernel(int n, double *A, double *D, double *B) {


int i, j;
int in, ji;

double tmp;

i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;

while(i < n) {

in = i * n;

tmp = D[i];

while(j < n) {

ji = in + j;

B[ji] = A[ji] * tmp;

j += blockDim.y * gridDim.y;
} // j

i += blockDim.x * gridDim.x;
} // i

}





extern "C" void A_D_in_B(int n, double *A, double *D, double *B) {


int sBlocks = 32;
int nBlocks = (n + sBlocks - 1) / sBlocks;

dim3 dimGrid(nBlocks, nBlocks, 1);
dim3 dimBlock(sBlocks, sBlocks, 1);

printf("lunching A_D_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
nBlocks, nBlocks, sBlocks, sBlocks);


A_D_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, D, B);

}



57 changes: 57 additions & 0 deletions src/cuda/src/a_dinv_in_b.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <stdio.h>


__global__ void A_Dinv_in_B_kernel(int n, double *A, double *D, double *B) {


int i, j;
int in, ji;

double tmp;

i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;

while(i < n) {

in = i * n;

tmp = 1.0 / D[i];

while(j < n) {

ji = in + j;

B[ji] = A[ji] * tmp;

j += blockDim.y * gridDim.y;
} // j

i += blockDim.x * gridDim.x;
} // i

}





extern "C" void A_Dinv_in_B(int n, double *A, double *D, double *B) {


int sBlocks = 32;
int nBlocks = (n + sBlocks - 1) / sBlocks;

dim3 dimGrid(nBlocks, nBlocks, 1);
dim3 dimBlock(sBlocks, sBlocks, 1);

printf("lunching A_Dinv_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
nBlocks, nBlocks, sBlocks, sBlocks);


A_Dinv_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, D, B);

}



52 changes: 52 additions & 0 deletions src/cuda/src/a_minus_twob_in_b.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <stdio.h>


__global__ void A_minus_twoB_in_B_kernel(int n, double *A, double *B) {


int i, j;
int in, ji;

i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;

while(i < n) {

in = i * n;

while(j < n) {

ji = in + j;

B[ji] = A[ji] - 2.0 * B[ji];

j += blockDim.y * gridDim.y;
} // j

i += blockDim.x * gridDim.x;
} // i

}




extern "C" void A_minus_twoB_in_B(int n, double *A, double *B) {


int sBlocks = 32;
int nBlocks = (n + sBlocks - 1) / sBlocks;

dim3 dimGrid(nBlocks, nBlocks, 1);
dim3 dimBlock(sBlocks, sBlocks, 1);

printf("lunching A_minus_twoB_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
nBlocks, nBlocks, sBlocks, sBlocks);


A_minus_twoB_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, B);

}



52 changes: 52 additions & 0 deletions src/cuda/src/a_plus_b_in_a.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <stdio.h>


__global__ void A_plus_B_in_A_kernel(int n, double *A, double *B) {


int i, j;
int in, ji;

i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;

while(i < n) {

in = i * n;

while(j < n) {

ji = in + j;

A[ji] = A[ji] + B[ji];

j += blockDim.y * gridDim.y;
} // j

i += blockDim.x * gridDim.x;
} // i

}




extern "C" void A_plus_B_in_A(int n, double *A, double *B) {


int sBlocks = 32;
int nBlocks = (n + sBlocks - 1) / sBlocks;

dim3 dimGrid(nBlocks, nBlocks, 1);
dim3 dimBlock(sBlocks, sBlocks, 1);

printf("lunching A_plus_B_in_A_kernel with %dx%d blocks and %dx%d threads/block\n",
nBlocks, nBlocks, sBlocks, sBlocks);


A_plus_B_in_A_kernel<<<dimGrid, dimBlock>>>(n, A, B);

}



34 changes: 19 additions & 15 deletions src/cuda/src/ph_drpa_a_sing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,51 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *

int i, j, a, b;
int aa, bb;
int nVS;
int nBas2, nBas3;
int i_A0, i_A1, i_A2;
int i_I0, i_I1, i_I2;

long long nVS;
long long nBas2, nBas3;
long long i_A0, i_A1, i_A2, i_A3;
long long i_I0, i_I1, i_I2, i_I3;

bool a_eq_b;

nVS = nV * nS;
nVS = (long long) nV * (long long) nS;

nBas2 = nBas * nBas;
nBas3 = nBas2 * nBas;
nBas2 = (long long) nBas * (long long) nBas;
nBas3 = nBas2 * (long long) nBas;

aa = blockIdx.x * blockDim.x + threadIdx.x;
bb = blockIdx.y * blockDim.y + threadIdx.y;

while(aa < nV) {
a = aa + nO;

i_A0 = aa * nS;
i_I0 = a * nBas2;
i_A0 = (long long) aa * (long long) nS;
i_I0 = (long long) a * nBas2;

while(bb < nV) {
b = bb + nO;

a_eq_b = a == b;

i_A1 = i_A0 + bb;
i_I1 = i_I0 + b * nBas;
i_A1 = i_A0 + (long long) bb;
i_I1 = i_I0 + (long long) b * (long long) nBas;

i = 0;
while(i < nO) {

i_A2 = i_A1 + i * nVS;
i_I2 = i_I1 + i;
i_A2 = i_A1 + (long long) i * nVS;
i_I2 = i_I1 + (long long) i;

j = 0;
while(j < nO) {

A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3];
i_A3 = i_A2 + (long long) j * (long long) nV;
i_I3 = i_I2 + (long long) j * nBas3;

A[i_A3] = 2.0 * ERI[i_I3];
if(a_eq_b && (i==j)) {
A[i_A2 + j * nV] += eps[a] - eps[i];
A[i_A3] += eps[a] - eps[i];
}

j ++;
Expand Down
46 changes: 27 additions & 19 deletions src/cuda/src/ph_drpa_amb_sing.cu
Original file line number Diff line number Diff line change
@@ -1,54 +1,62 @@
#include <stdio.h>

__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) {

__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS,
double *eps, double *ERI, double *AmB) {


int i, j, a, b;
int aa, bb;
int nVS;
int nBas2, nBas3;
int i_A0, i_A1, i_A2;
int i_I0, i_I1, i_I2;
int i_J1, i_J2;

long long i_A0, i_A1, i_A2, i_A3;
long long i_I0, i_I1, i_I2, i_I3;
long long i_J1, i_J2, i_J3;

long long nVS;
long long nBas2, nBas3;

bool a_eq_b;

nVS = nV * nS;
nVS = (long long) nV * (long long) nS;

nBas2 = nBas * nBas;
nBas3 = nBas2 * nBas;
nBas2 = (long long) nBas * (long long) nBas;
nBas3 = nBas2 * (long long) nBas;

aa = blockIdx.x * blockDim.x + threadIdx.x;
bb = blockIdx.y * blockDim.y + threadIdx.y;

while(aa < nV) {
a = aa + nO;

i_A0 = aa * nS;
i_I0 = a * nBas2;
i_A0 = (long long) aa * (long long) nS;
i_I0 = (long long) a * nBas2;

while(bb < nV) {
b = bb + nO;

a_eq_b = a == b;

i_A1 = i_A0 + bb;
i_I1 = i_I0 + b * nBas;
i_J1 = i_I0 + b * nBas3;
i_A1 = i_A0 + (long long) bb;
i_I1 = i_I0 + (long long) b * (long long) nBas;
i_J1 = i_I0 + (long long) b * nBas3;

i = 0;
while(i < nO) {

i_A2 = i_A1 + i * nVS;
i_I2 = i_I1 + i;
i_J2 = i_J1 + i;
i_A2 = i_A1 + (long long) i * nVS;
i_I2 = i_I1 + (long long) i;
i_J2 = i_J1 + (long long) i;

j = 0;
while(j < nO) {

AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]);
i_A3 = i_A2 + (long long) j * nV;
i_I3 = i_I2 + (long long) j * nBas3;
i_J3 = i_J2 + (long long) j * (long long) nBas;

AmB[i_A3] = 2.0 * (ERI[i_I3] - ERI[i_J3]);
if(a_eq_b && (i==j)) {
AmB[i_A2 + j * nV] += eps[a] - eps[i];
AmB[i_A3] += eps[a] - eps[i];
}

j ++;
Expand Down
Loading

0 comments on commit da4f8df

Please sign in to comment.