optim A x D x A.T and A x Dinv x A.T on GPU

pfloos · Nov 29, 2024 · da4f8df · da4f8df
1 parent fcb11d6
commit da4f8df
Show file tree

Hide file tree

Showing 10 changed files with 360 additions and 69 deletions.
diff --git a/src/cuda/include/my_linalg.h b/src/cuda/include/my_linalg.h
@@ -2,12 +2,18 @@
 
 #define MY_LINALG
 
+extern void A_plus_B_in_A(int n, double *A, double *B);
+extern void A_minus_twoB_in_B(int n, double *A, double *B);
+
 extern void A_D_At(int n, double *A, double *D, double *R);
 extern void A_Dinv_At(int n, double *A, double *D, double *R);
 
 extern void A_D_inplace(int n, double *A, double *D);
 extern void A_Dinv_inplace(int n, double *A, double *D);
 
+extern void A_D_in_B(int n, double *A, double *D, double *B);
+extern void A_Dinv_in_B(int n, double *A, double *D, double *B);
+
 extern void elementwise_dsqrt(int nS, double *A, double *A_Sq);
 extern void elementwise_dsqrt_inplace(int nS, double *A);
 

diff --git a/src/cuda/src/a_d_in_b.cu b/src/cuda/src/a_d_in_b.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+
+__global__ void A_D_in_B_kernel(int n, double *A, double *D, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    double tmp;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        tmp = D[i];
+
+        while(j < n) {
+
+            ji = in + j;
+
+            B[ji] = A[ji] * tmp;
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_D_in_B(int n, double *A, double *D, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_D_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_D_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, D, B);
+
+}
+
+
+
diff --git a/src/cuda/src/a_dinv_in_b.cu b/src/cuda/src/a_dinv_in_b.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+
+
+__global__ void A_Dinv_in_B_kernel(int n, double *A, double *D, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    double tmp;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        tmp = 1.0 / D[i];
+
+        while(j < n) {
+
+            ji = in + j;
+
+            B[ji] = A[ji] * tmp;
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+
+extern "C" void A_Dinv_in_B(int n, double *A, double *D, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_Dinv_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_Dinv_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, D, B);
+
+}
+
+
+
diff --git a/src/cuda/src/a_minus_twob_in_b.cu b/src/cuda/src/a_minus_twob_in_b.cu
@@ -0,0 +1,52 @@
+#include <stdio.h>
+
+
+__global__ void A_minus_twoB_in_B_kernel(int n, double *A, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        while(j < n) {
+
+            ji = in + j;
+
+            B[ji] = A[ji] - 2.0 * B[ji];
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+extern "C" void A_minus_twoB_in_B(int n, double *A, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_minus_twoB_in_B_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_minus_twoB_in_B_kernel<<<dimGrid, dimBlock>>>(n, A, B);
+
+}
+
+
+
diff --git a/src/cuda/src/a_plus_b_in_a.cu b/src/cuda/src/a_plus_b_in_a.cu
@@ -0,0 +1,52 @@
+#include <stdio.h>
+
+
+__global__ void A_plus_B_in_A_kernel(int n, double *A, double *B) {
+
+
+    int i, j;
+    int in, ji;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    j = blockIdx.y * blockDim.y + threadIdx.y;
+
+    while(i < n) {
+
+        in = i * n;
+
+        while(j < n) {
+
+            ji = in + j;
+
+            A[ji] = A[ji] + B[ji];
+
+            j += blockDim.y * gridDim.y;
+        } // j
+
+        i += blockDim.x * gridDim.x;
+    } // i
+
+}
+
+
+
+
+extern "C" void A_plus_B_in_A(int n, double *A, double *B) {
+
+
+    int sBlocks = 32;
+    int nBlocks = (n + sBlocks - 1) / sBlocks;
+
+    dim3 dimGrid(nBlocks, nBlocks, 1);
+    dim3 dimBlock(sBlocks, sBlocks, 1);
+
+    printf("lunching A_plus_B_in_A_kernel with %dx%d blocks and %dx%d threads/block\n",
+        nBlocks, nBlocks, sBlocks, sBlocks);
+
+
+    A_plus_B_in_A_kernel<<<dimGrid, dimBlock>>>(n, A, B);
+
+}
+
+
+
diff --git a/src/cuda/src/ph_drpa_a_sing.cu b/src/cuda/src/ph_drpa_a_sing.cu
@@ -5,47 +5,51 @@ __global__ void ph_dRPA_A_sing_kernel(int nO, int nV, int nBas, int nS, double *
 
     int i, j, a, b;
     int aa, bb;
-    int nVS;
-    int nBas2, nBas3;
-    int i_A0, i_A1, i_A2;
-    int i_I0, i_I1, i_I2;
+
+    long long nVS;
+    long long nBas2, nBas3;
+    long long i_A0, i_A1, i_A2, i_A3;
+    long long i_I0, i_I1, i_I2, i_I3;
 
     bool a_eq_b;
 
-    nVS = nV * nS;
+    nVS = (long long) nV * (long long) nS;
 
-    nBas2 = nBas * nBas;
-    nBas3 = nBas2 * nBas;
+    nBas2 =  (long long) nBas *  (long long) nBas;
+    nBas3 = nBas2 *  (long long) nBas;
 
     aa = blockIdx.x * blockDim.x + threadIdx.x;
     bb = blockIdx.y * blockDim.y + threadIdx.y;
 
     while(aa < nV) {
         a = aa + nO;
 
-        i_A0 = aa * nS;
-        i_I0 = a * nBas2;
+        i_A0 = (long long) aa * (long long) nS;
+        i_I0 = (long long) a * nBas2;
 
         while(bb < nV) {
             b = bb + nO;
 
             a_eq_b = a == b;
 
-            i_A1 = i_A0 + bb;
-            i_I1 = i_I0 + b * nBas;
+            i_A1 = i_A0 + (long long) bb;
+            i_I1 = i_I0 + (long long) b * (long long) nBas;
 
             i = 0;
             while(i < nO) {
 
-                i_A2 = i_A1 + i * nVS;
-                i_I2 = i_I1 + i;
+                i_A2 = i_A1 + (long long) i * nVS;
+                i_I2 = i_I1 + (long long) i;
 
                 j = 0;
                 while(j < nO) {
 
-                    A[i_A2 + j * nV] = 2.0 * ERI[i_I2 + j * nBas3];
+                    i_A3 = i_A2 + (long long) j * (long long) nV;
+                    i_I3 = i_I2 + (long long) j * nBas3;
+
+                    A[i_A3] = 2.0 * ERI[i_I3];
                     if(a_eq_b && (i==j)) {
-                        A[i_A2 + j * nV] += eps[a] - eps[i];
+                        A[i_A3] += eps[a] - eps[i];
                     }
 
                     j ++;

diff --git a/src/cuda/src/ph_drpa_amb_sing.cu b/src/cuda/src/ph_drpa_amb_sing.cu
@@ -1,54 +1,62 @@
 #include <stdio.h>
 
-__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, double *eps, double *ERI, double *AmB) {
+
+__global__ void ph_dRPA_AmB_sing_kernel(int nO, int nV, int nBas, int nS, 
+                                        double *eps, double *ERI, double *AmB) {
 
 
     int i, j, a, b;
     int aa, bb;
-    int nVS;
-    int nBas2, nBas3;
-    int i_A0, i_A1, i_A2;
-    int i_I0, i_I1, i_I2;
-    int i_J1, i_J2;
+
+    long long i_A0, i_A1, i_A2, i_A3;
+    long long i_I0, i_I1, i_I2, i_I3;
+    long long i_J1, i_J2, i_J3;
+
+    long long nVS;
+    long long nBas2, nBas3;
 
     bool a_eq_b;
 
-    nVS = nV * nS;
+    nVS = (long long) nV * (long long) nS;
 
-    nBas2 = nBas * nBas;
-    nBas3 = nBas2 * nBas;
+    nBas2 = (long long) nBas * (long long) nBas;
+    nBas3 = nBas2 * (long long) nBas;
 
     aa = blockIdx.x * blockDim.x + threadIdx.x;
     bb = blockIdx.y * blockDim.y + threadIdx.y;
 
     while(aa < nV) {
         a = aa + nO;
 
-        i_A0 = aa * nS;
-        i_I0 = a * nBas2;
+        i_A0 = (long long) aa * (long long) nS;
+        i_I0 = (long long) a * nBas2;
 
         while(bb < nV) {
             b = bb + nO;
 
             a_eq_b = a == b;
 
-            i_A1 = i_A0 + bb;
-            i_I1 = i_I0 + b * nBas;
-            i_J1 = i_I0 + b * nBas3;
+            i_A1 = i_A0 + (long long) bb;
+            i_I1 = i_I0 + (long long) b * (long long) nBas;
+            i_J1 = i_I0 + (long long) b * nBas3;
 
             i = 0;
             while(i < nO) {
 
-                i_A2 = i_A1 + i * nVS;
-                i_I2 = i_I1 + i;
-                i_J2 = i_J1 + i;
+                i_A2 = i_A1 + (long long) i * nVS;
+                i_I2 = i_I1 + (long long) i;
+                i_J2 = i_J1 + (long long) i;
 
                 j = 0;
                 while(j < nO) {
 
-                    AmB[i_A2 + j * nV] = 2.0 * (ERI[i_I2 + j * nBas3] - ERI[i_J2 + j * nBas]);
+                    i_A3 = i_A2 + (long long) j * nV;
+                    i_I3 = i_I2 + (long long) j * nBas3;
+                    i_J3 = i_J2 + (long long) j * (long long) nBas;
+
+                    AmB[i_A3] = 2.0 * (ERI[i_I3] - ERI[i_J3]);
                     if(a_eq_b && (i==j)) {
-                        AmB[i_A2 + j * nV] += eps[a] - eps[i];
+                        AmB[i_A3] += eps[a] - eps[i];
                     }
 
                     j ++;